names=(${host}-${card} ${card} ${host}-${card}.${domn})
else
eval $_failure
+ errors+=1
continue
fi
--- /dev/null
+From c01faf2a8053f8968b9bac84a4cbd54a9952d472 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Tue, 21 Jan 2014 08:59:29 -0800
+Subject: [PATCH 01/12] ib_core add mic node and scif transport types
+
+The OFED SCIF driver implements a software-emulated RDMA device to allow OFED
+based applications, such as Intel MPI, to run on Intel(R) MIC Architecture
+without the presence of a physical HCA. OFED SCIF is only targeted for inter-
+node communication within a single platform, where a node is a coprocessor
+or the host processor. This patch adds new node and transport types to the
+ib_core kernel module to distinguish this new RDMA interface type.
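+
+As an illustration (hedged; not part of this patch), a verbs consumer could
+recognize such a device through the helpers extended here; "dev" is assumed
+to be a struct ib_device pointer obtained elsewhere:
+
+	if (rdma_node_get_transport(dev->node_type) == RDMA_TRANSPORT_SCIF) {
+		/* software-emulated RDMA device backed by SCIF */
+	}
+	if (rdma_port_get_link_layer(dev, 1) == IB_LINK_LAYER_SCIF) {
+		/* port 1 reports the SCIF link layer */
+	}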
+---
+diff -urN a0/drivers/infiniband/core/sysfs.c a1/drivers/infiniband/core/sysfs.c
+--- a0/drivers/infiniband/core/sysfs.c 2015-01-05 13:35:35.692687746 -0800
++++ a1/drivers/infiniband/core/sysfs.c 2015-01-05 13:46:38.792659814 -0800
+@@ -253,6 +253,8 @@
+ return sprintf(buf, "%s\n", "InfiniBand");
+ case IB_LINK_LAYER_ETHERNET:
+ return sprintf(buf, "%s\n", "Ethernet");
++ case IB_LINK_LAYER_SCIF:
++ return sprintf(buf, "%s\n", "SCIF");
+ default:
+ return sprintf(buf, "%s\n", "Unknown");
+ }
+@@ -623,6 +625,7 @@
+ case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+ case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
++ case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+ }
+diff -urN a0/drivers/infiniband/core/verbs.c a1/drivers/infiniband/core/verbs.c
+--- a0/drivers/infiniband/core/verbs.c 2015-01-05 13:35:35.693687746 -0800
++++ a1/drivers/infiniband/core/verbs.c 2015-01-05 13:49:08.470653509 -0800
+@@ -121,6 +121,8 @@
+ return RDMA_TRANSPORT_USNIC;
+ case RDMA_NODE_USNIC_UDP:
+ return RDMA_TRANSPORT_USNIC_UDP;
++ case RDMA_NODE_MIC:
++ return RDMA_TRANSPORT_SCIF;
+ default:
+ BUG();
+ return 0;
+@@ -140,6 +142,8 @@
+ case RDMA_TRANSPORT_USNIC:
+ case RDMA_TRANSPORT_USNIC_UDP:
+ return IB_LINK_LAYER_ETHERNET;
++ case RDMA_TRANSPORT_SCIF:
++ return IB_LINK_LAYER_SCIF;
+ default:
+ return IB_LINK_LAYER_UNSPECIFIED;
+ }
+diff -urN a0/include/rdma/ib_verbs.h a1/include/rdma/ib_verbs.h
+--- a0/include/rdma/ib_verbs.h 2015-01-05 13:45:40.299662278 -0800
++++ a1/include/rdma/ib_verbs.h 2015-01-05 13:50:57.590648913 -0800
+@@ -75,13 +75,15 @@
+ RDMA_NODE_RNIC,
+ RDMA_NODE_USNIC,
+ RDMA_NODE_USNIC_UDP,
++ RDMA_NODE_MIC,
+ };
+
+ enum rdma_transport_type {
+ RDMA_TRANSPORT_IB,
+ RDMA_TRANSPORT_IWARP,
+ RDMA_TRANSPORT_USNIC,
+- RDMA_TRANSPORT_USNIC_UDP
++ RDMA_TRANSPORT_USNIC_UDP,
++ RDMA_TRANSPORT_SCIF,
+ };
+
+ __attribute_const__ enum rdma_transport_type
+@@ -91,6 +93,7 @@
+ IB_LINK_LAYER_UNSPECIFIED,
+ IB_LINK_LAYER_INFINIBAND,
+ IB_LINK_LAYER_ETHERNET,
++ IB_LINK_LAYER_SCIF
+ };
+
+ enum ib_device_cap_flags {
--- /dev/null
+From faf3b3f931806d4f044068c4e9b2ca4482a9177a Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Tue, 3 Jun 2014 09:50:57 -0700
+Subject: [PATCH 02/12] rdma_cm add mic node and scif transport types
+
+The OFED SCIF driver can leverage the iWARP CM calls to establish connections.
+This patch uses the new node and transport types in the RDMA CM to call the
+underlying driver as needed.
+---
+diff -urN a1/drivers/infiniband/core/cma.c a2/drivers/infiniband/core/cma.c
+--- a1/drivers/infiniband/core/cma.c 2015-01-05 13:46:27.953660271 -0800
++++ a2/drivers/infiniband/core/cma.c 2015-01-05 14:05:11.897612926 -0800
+@@ -747,6 +747,7 @@
+ qp_attr->rq_psn = id_priv->seq_num;
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+@@ -1043,6 +1044,7 @@
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ if (id_priv->cm_id.iw)
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ break;
+@@ -1994,6 +1996,7 @@
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = cma_resolve_iw_route(id_priv, timeout_ms);
+ break;
+ default:
+@@ -2184,6 +2187,25 @@
+ return ret;
+ }
+
++static int cma_resolve_scif(struct rdma_id_private *id_priv)
++{
++ struct cma_work *work;
++
++ work = kzalloc(sizeof *work, GFP_KERNEL);
++ if (!work)
++ return -ENOMEM;
++
++	/* No address resolution is needed for SCIF; just report ADDR_RESOLVED. */
++
++ work->id = id_priv;
++ INIT_WORK(&work->work, cma_work_handler);
++ work->old_state = RDMA_CM_ADDR_QUERY;
++ work->new_state = RDMA_CM_ADDR_RESOLVED;
++ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
++ queue_work(cma_wq, &work->work);
++ return 0;
++}
++
+ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr)
+ {
+@@ -2225,9 +2247,12 @@
+ if (cma_any_addr(dst_addr)) {
+ ret = cma_resolve_loopback(id_priv);
+ } else {
+- if (dst_addr->sa_family == AF_IB) {
++ if (dst_addr->sa_family == AF_IB)
+ ret = cma_resolve_ib_addr(id_priv);
+- } else {
++ else if ((id_priv->id.device != NULL) &&
++ (rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF))
++ ret = cma_resolve_scif(id_priv);
++ else {
+ ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv),
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+@@ -2598,6 +2623,7 @@
+ goto err;
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+@@ -2946,6 +2972,7 @@
+ ret = cma_connect_ib(id_priv, conn_param);
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = cma_connect_iw(id_priv, conn_param);
+ break;
+ default:
+@@ -3073,6 +3100,7 @@
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = cma_accept_iw(id_priv, conn_param);
+ break;
+ default:
+@@ -3133,6 +3161,7 @@
+ 0, private_data, private_data_len);
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+ break;
+@@ -3163,6 +3192,7 @@
+ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ break;
+ case RDMA_TRANSPORT_IWARP:
++ case RDMA_TRANSPORT_SCIF:
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+ break;
+ default:
--- /dev/null
+From 2ddd9c09050d6f74a2ea9e3e21a76510bbdff155 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Thu, 6 Feb 2014 14:23:36 -0800
+Subject: [PATCH 03/12] add context based udata support
+
+Normally, the copy_to_user and copy_from_user calls are used to access vendor
+private data when allocating resources for user processes. However, when the
+processes are running on MIC, this communication is proxied to the host kernel
+via SCIF. This patch allows setup of context-based udata access routines.
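+
+A minimal sketch (hypothetical ibp_* names, not part of this patch) of how a
+proxy driver might use the hook: the vendor private data is staged in kernel
+buffers received over SCIF, so the udata ops can use memcpy instead of
+copy_from_user/copy_to_user:
+
+	static int ibp_copy_from_udata(void *dst, struct ib_udata *udata, size_t len)
+	{
+		/* inbuf points at a kernel buffer filled from the SCIF message */
+		memcpy(dst, (__force const void *) udata->inbuf, len);
+		return 0;
+	}
+
+	static int ibp_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+	{
+		/* outbuf is a kernel buffer later returned over SCIF */
+		memcpy((__force void *) udata->outbuf, src, len);
+		return 0;
+	}
+
+	static struct ib_udata_ops ibp_udata_ops = {
+		.copy_from = ibp_copy_from_udata,
+		.copy_to   = ibp_copy_to_udata,
+	};
+
+	/* when constructing the proxied command */
+	udata.ops    = &ibp_udata_ops;
+	udata.inbuf  = (__force const void __user *) in_buf;
+	udata.outbuf = (__force void __user *) out_buf;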
+---
+diff -urN a2/drivers/infiniband/core/uverbs_cmd.c a3/drivers/infiniband/core/uverbs_cmd.c
+--- a2/drivers/infiniband/core/uverbs_cmd.c 2015-01-05 13:59:55.217626266 -0800
++++ a3/drivers/infiniband/core/uverbs_cmd.c 2015-01-05 14:30:40.647548530 -0800
+@@ -57,6 +57,21 @@
+ static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+ static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
++static int uverbs_copy_from_udata(void *dst, struct ib_udata *udata, size_t len)
++{
++ return copy_from_user(dst, udata->inbuf, len) ? -EFAULT : 0;
++}
++
++static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
++{
++ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
++}
++
++struct ib_udata_ops uverbs_copy = {
++ .copy_from = uverbs_copy_from_udata,
++ .copy_to = uverbs_copy_to_udata
++};
++
+ /*
+ * The ib_uobject locking scheme is as follows:
+ *
+@@ -330,6 +345,7 @@
+ goto err;
+ }
+
++ ucontext->umem_ops = NULL;
+ ucontext->device = ibdev;
+ INIT_LIST_HEAD(&ucontext->pd_list);
+ INIT_LIST_HEAD(&ucontext->mr_list);
+diff -urN a2/drivers/infiniband/core/uverbs.h a3/drivers/infiniband/core/uverbs.h
+--- a2/drivers/infiniband/core/uverbs.h 2015-01-05 13:59:55.216626266 -0800
++++ a3/drivers/infiniband/core/uverbs.h 2015-01-05 14:29:27.559551609 -0800
+@@ -47,8 +47,11 @@
+ #include <rdma/ib_umem.h>
+ #include <rdma/ib_user_verbs.h>
+
++extern struct ib_udata_ops uverbs_copy;
++
+ #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
+ do { \
++ (udata)->ops = &uverbs_copy; \
+ (udata)->inbuf = (const void __user *) (ibuf); \
+ (udata)->outbuf = (void __user *) (obuf); \
+ (udata)->inlen = (ilen); \
+@@ -57,6 +60,7 @@
+
+ #define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \
+ do { \
++ (udata)->ops = &uverbs_copy; \
+ (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \
+ (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \
+ (udata)->inlen = (ilen); \
+diff -urN a2/include/rdma/ib_verbs.h a3/include/rdma/ib_verbs.h
+--- a2/include/rdma/ib_verbs.h 2015-01-05 13:59:55.219626266 -0800
++++ a3/include/rdma/ib_verbs.h 2015-01-05 14:18:48.871578512 -0800
+@@ -1147,7 +1147,14 @@
+ int live;
+ };
+
++struct ib_udata;
++struct ib_udata_ops {
++ int (*copy_from)(void *dest, struct ib_udata *udata, size_t len);
++ int (*copy_to)(struct ib_udata *udata, void *src, size_t len);
++};
++
+ struct ib_udata {
++ struct ib_udata_ops *ops;
+ const void __user *inbuf;
+ void __user *outbuf;
+ size_t inlen;
+@@ -1664,12 +1671,12 @@
+
+ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
+ {
+- return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0;
++ return udata->ops->copy_from(dest, udata, len);
+ }
+
+ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
+ {
+- return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0;
++ return udata->ops->copy_to(udata, src, len);
+ }
+
+ /**
--- /dev/null
+From 8b06f1090da0e12c6012d0d13d8b48c69640a6a7 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Thu, 6 Feb 2014 14:08:02 -0800
+Subject: [PATCH 04/12] add context based umem support
+
+The ib_umem_get routine calls get_user_pages to pin pages and create the
+ib_umem structure. Memory on MIC, however, must be mapped through SCIF for
+access across PCI. This patch allows setup of context-based ib_umem mapping
+routines.
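+
+For illustration only (the ibp_* names are hypothetical and not defined by
+this patch), a proxy driver installs its own ops when it creates the
+ucontext, and the ib_umem_get()/ib_umem_release()/ib_umem_page_count()
+wrappers added here dispatch to them:
+
+	/* hypothetical proxy implementations that map MIC pages through SCIF */
+	struct ib_umem *ibp_umem_get(struct ib_ucontext *context, unsigned long addr,
+				     size_t size, int access, int dmasync);
+	void ibp_umem_release(struct ib_umem *umem);
+	int ibp_umem_page_count(struct ib_umem *umem);
+
+	static struct ib_umem_ops ibp_umem_ops = {
+		.get        = ibp_umem_get,
+		.release    = ibp_umem_release,
+		.page_count = ibp_umem_page_count,
+	};
+
+	/* in the proxy's alloc_ucontext path */
+	ucontext->umem_ops          = &ibp_umem_ops;
+	ucontext->umem_private_data = ibp_ctx;	/* driver-private cookie */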
+
+Also update mthca to support these changes.
+---
+diff -urN a3/drivers/infiniband/core/umem.c a4/drivers/infiniband/core/umem.c
+--- a3/drivers/infiniband/core/umem.c 2015-01-05 14:12:52.117593540 -0800
++++ a4/drivers/infiniband/core/umem.c 2015-01-05 14:41:51.927520253 -0800
+@@ -57,6 +57,10 @@
+ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+ page = sg_page(sg);
++
++ if (!pfn_valid(page_to_pfn(page)))
++ continue;
++
+ if (umem->writable && dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+@@ -68,14 +72,71 @@
+ }
+
+ /**
+- * ib_umem_get - Pin and DMA map userspace memory.
++ * get_remap_pages() - get pages remapped to user virtual space
++ * @mm: mm struct of target mm
++ * @start: starting user address
++ * @nr_pages: number of pages to lookup
++ * @write: flag to verify whether the vma is writable
++ * @pages: array that receives pointers to the pages. Should
++ * be at least nr_pages long. Or NULL, if caller only
++ * intends to ensure the pages are valid.
++ * @vmas: array of pointers to vmas corresponding to each page.
++ * Or NULL if the caller does not require them.
++ *
++ * Pages may be system ram or io space mmapped to user virtual
++ * space via remap_pfn_range or io_remap_page_range, respectively.
++ *
++ * Returns number of pages found, which may be less than the number
++ * requested. Returns 0 if nr_pages is 0.
++ *
++ * Must be called with mmap_sem held for read or write.
++ */
++static long get_remap_pages(struct mm_struct *mm, unsigned long start,
++ unsigned long nr_pages, int write,
++ struct page **pages, struct vm_area_struct **vmas)
++{
++ struct vm_area_struct *vma;
++ unsigned long pfn;
++ long i = 0;
++ int ret;
++
++ while (nr_pages) {
++ if (!(vma = find_vma(mm, start)))
++ return i ? : -EFAULT;
++ if (write && !(vma->vm_flags & VM_WRITE))
++ return i ? : -EFAULT;
++
++ do {
++ ret = follow_pfn(vma, start, &pfn);
++ if (ret)
++ return i ? : ret;
++
++ if (pages) {
++ pages[i] = pfn_to_page(pfn);
++ if (pfn_valid(pfn))
++ get_page(pages[i]);
++ }
++ if (vmas)
++ vmas[i] = vma;
++
++ start += PAGE_SIZE;
++ nr_pages--;
++ i++;
++ } while (nr_pages && start < vma->vm_end);
++ }
++
++ return i;
++}
++
++/**
++ * ib_get_umem - Pin and DMA map userspace memory.
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
++struct ib_umem *ib_get_umem(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
+ {
+ struct ib_umem *umem;
+@@ -101,7 +162,6 @@
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+- umem->context = context;
+ umem->length = size;
+ umem->offset = addr & ~PAGE_MASK;
+ umem->page_size = PAGE_SIZE;
+@@ -163,11 +223,18 @@
+ sg_list_start = umem->sg_head.sgl;
+
+ while (npages) {
++
+ ret = get_user_pages(current, current->mm, cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof (struct page *)),
+ 1, !umem->writable, page_list, vma_list);
+
++ if (ret == -EFAULT) /* may be a remapped area; try again */
++ ret = get_remap_pages(current->mm, cur_base,
++ min_t(unsigned long, npages,
++ PAGE_SIZE / sizeof (struct page *)),
++ !umem->writable, page_list, vma_list);
++
+ if (ret < 0)
+ goto out;
+
+@@ -219,7 +286,6 @@
+
+ return ret < 0 ? ERR_PTR(ret) : umem;
+ }
+-EXPORT_SYMBOL(ib_umem_get);
+
+ static void ib_umem_account(struct work_struct *work)
+ {
+@@ -237,10 +303,10 @@
+ }
+
+ /**
+- * ib_umem_release - release memory pinned with ib_umem_get
++ * ib_release_umem - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+-void ib_umem_release(struct ib_umem *umem)
++void ib_release_umem(struct ib_umem *umem)
+ {
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+@@ -290,9 +356,8 @@
+ out:
+ kfree(umem);
+ }
+-EXPORT_SYMBOL(ib_umem_release);
+
+-int ib_umem_page_count(struct ib_umem *umem)
++int ib_page_count_umem(struct ib_umem *umem)
+ {
+ int shift;
+ int i;
+@@ -307,4 +372,40 @@
+
+ return n;
+ }
++
++struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
++ size_t size, int access, int dmasync)
++{
++ struct ib_umem_ops *ops = context->umem_ops;
++ struct ib_umem *umem;
++
++ umem = (ops && ops->get) ?
++ ops->get(context, addr, size, access, dmasync) :
++ ib_get_umem(context, addr, size, access, dmasync);
++
++ if (!IS_ERR(umem))
++ umem->context = context;
++
++ return umem;
++}
++EXPORT_SYMBOL(ib_umem_get);
++
++void ib_umem_release(struct ib_umem *umem)
++{
++ struct ib_umem_ops *ops = umem->context->umem_ops;
++
++ if (ops && ops->release)
++ ops->release(umem);
++ else
++ ib_release_umem(umem);
++}
++EXPORT_SYMBOL(ib_umem_release);
++
++int ib_umem_page_count(struct ib_umem *umem)
++{
++ struct ib_umem_ops *ops = umem->context->umem_ops;
++
++ return (ops && ops->page_count) ?
++ ops->page_count(umem) : ib_page_count_umem(umem);
++}
+ EXPORT_SYMBOL(ib_umem_page_count);
+diff -urN a3/drivers/infiniband/hw/mthca/mthca_memfree.c a4/drivers/infiniband/hw/mthca/mthca_memfree.c
+--- a3/drivers/infiniband/hw/mthca/mthca_memfree.c 2015-01-05 14:12:52.112593540 -0800
++++ a4/drivers/infiniband/hw/mthca/mthca_memfree.c 2015-01-05 14:36:00.825535043 -0800
+@@ -39,6 +39,12 @@
+
+ #include <asm/page.h>
+
++/* Must use the ib_umem routines to support the IB proxy server. */
++#define MTHCA_IB_UMEM
++#ifdef MTHCA_IB_UMEM
++#include <rdma/ib_umem.h>
++#endif
++
+ #include "mthca_memfree.h"
+ #include "mthca_dev.h"
+ #include "mthca_cmd.h"
+@@ -56,7 +62,11 @@
+ struct mutex mutex;
+ struct {
+ u64 uvirt;
++#ifdef MTHCA_IB_UMEM
++ struct ib_umem *umem;
++#else
+ struct scatterlist mem;
++#endif
+ int refcount;
+ } page[0];
+ };
+@@ -446,7 +456,12 @@
+ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+ {
++#ifdef MTHCA_IB_UMEM
++ struct mthca_ucontext *context;
++ struct ib_umem_chunk *chunk;
++#else
+ struct page *pages[1];
++#endif
+ int ret = 0;
+ int i;
+
+@@ -472,6 +487,22 @@
+ goto out;
+ }
+
++#ifdef MTHCA_IB_UMEM
++ context = container_of(uar, struct mthca_ucontext, uar);
++
++ db_tab->page[i].umem = ib_umem_get(&context->ibucontext,
++ uaddr & PAGE_MASK, PAGE_SIZE, 0, 0);
++ if (IS_ERR(db_tab->page[i].umem)) {
++ ret = PTR_ERR(db_tab->page[i].umem);
++ goto out;
++ }
++
++ chunk = list_entry(db_tab->page[i].umem->chunk_list.next,
++ struct ib_umem_chunk, list);
++
++ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&chunk->page_list[0]),
++ mthca_uarc_virt(dev, uar, i));
++#else
+ ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+ pages, NULL);
+ if (ret < 0)
+@@ -488,9 +519,14 @@
+
+ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+ mthca_uarc_virt(dev, uar, i));
++#endif
+ if (ret) {
++#ifdef MTHCA_IB_UMEM
++ ib_umem_release(db_tab->page[i].umem);
++#else
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ put_page(sg_page(&db_tab->page[i].mem));
++#endif
+ goto out;
+ }
+
+@@ -505,6 +541,9 @@
+ void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index)
+ {
++#ifdef MTHCA_IB_UMEM
++ int i;
++#endif
+ if (!mthca_is_memfree(dev))
+ return;
+
+@@ -515,7 +554,16 @@
+
+ mutex_lock(&db_tab->mutex);
+
++#ifdef MTHCA_IB_UMEM
++ i = index / MTHCA_DB_REC_PER_PAGE;
++ if (!--db_tab->page[i].refcount) {
++ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
++ ib_umem_release(db_tab->page[i].umem);
++ db_tab->page[i].uvirt = 0;
++ }
++#else
+ --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount;
++#endif
+
+ mutex_unlock(&db_tab->mutex);
+ }
+@@ -538,7 +586,11 @@
+ for (i = 0; i < npages; ++i) {
+ db_tab->page[i].refcount = 0;
+ db_tab->page[i].uvirt = 0;
++#ifdef MTHCA_IB_UMEM
++ db_tab->page[i].umem = NULL;
++#else
+ sg_init_table(&db_tab->page[i].mem, 1);
++#endif
+ }
+
+ return db_tab;
+@@ -555,8 +607,12 @@
+ for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) {
+ if (db_tab->page[i].uvirt) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
++#ifdef MTHCA_IB_UMEM
++ ib_umem_release(db_tab->page[i].umem);
++#else
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ put_page(sg_page(&db_tab->page[i].mem));
++#endif
+ }
+ }
+
+diff -urN a3/include/rdma/ib_verbs.h a4/include/rdma/ib_verbs.h
+--- a3/include/rdma/ib_verbs.h 2015-01-05 14:18:48.871578512 -0800
++++ a4/include/rdma/ib_verbs.h 2015-01-05 14:36:00.826535043 -0800
+@@ -1122,7 +1122,18 @@
+ u8 page_shift;
+ };
+
++struct ib_ucontext;
++struct ib_umem_ops {
++ struct ib_umem *(*get)(struct ib_ucontext *context,
++ unsigned long addr, size_t size,
++ int access, int dmasync);
++ void (*release)(struct ib_umem *umem);
++ int (*page_count)(struct ib_umem *umem);
++};
++
+ struct ib_ucontext {
++ struct ib_umem_ops *umem_ops; /* set to NULL for default ops */
++ void *umem_private_data;
+ struct ib_device *device;
+ struct list_head pd_list;
+ struct list_head mr_list;
--- /dev/null
+From 8e3cff460efe00954b4c99ea23e42527c234c3f9 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Tue, 4 Feb 2014 12:22:38 -0800
+Subject: [PATCH 05/12] allow mic ipoib qp creation
+
+From the host's point of view, each MIC kernel appears as a "user-mode
+process" so that address translation reaches the correct coprocessor memory
+mapped across PCI. To enable the IPoIB driver in the MIC kernel, some QP
+creation flags must be checked regardless of whether the call originates from
+kernel or user space. Because these create_flags cannot be set by normal
+user-mode calls through ib_uverbs, moving the check is not an issue. This
+patch allows the IPoIB driver on MIC to create QPs correctly.
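+
+As a hedged example (the priv pointer and sizes are placeholders), the
+MIC-side IPoIB driver issues an ordinary kernel QP creation with these
+create_flags; once proxied to the host, the request carries udata, so the
+flags must be honored outside the kernel-only branch:
+
+	struct ib_qp_init_attr init_attr = {
+		.send_cq      = priv->send_cq,
+		.recv_cq      = priv->recv_cq,
+		.cap          = { .max_send_wr  = 64,  .max_recv_wr  = 256,
+				  .max_send_sge = 1,   .max_recv_sge = 1 },
+		.sq_sig_type  = IB_SIGNAL_ALL_WR,
+		.qp_type      = IB_QPT_UD,
+		.create_flags = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+	};
+	struct ib_qp *qp = ib_create_qp(priv->pd, &init_attr);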
+---
+diff -urN a4/drivers/infiniband/hw/mlx4/qp.c a5/drivers/infiniband/hw/mlx4/qp.c
+--- a4/drivers/infiniband/hw/mlx4/qp.c 2015-01-05 14:35:38.055536002 -0800
++++ a5/drivers/infiniband/hw/mlx4/qp.c 2015-01-08 09:50:29.971123797 -0800
+@@ -692,6 +692,12 @@
+
+ qp->mlx4_ib_qp_type = qp_type;
+
++ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
++ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
++
++ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
++ qp->flags |= MLX4_IB_QP_LSO;
++
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+@@ -744,13 +750,7 @@
+ }
+ } else {
+ qp->sq_no_prefetch = 0;
+-
+- if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+- qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+-
+- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+- qp->flags |= MLX4_IB_QP_LSO;
+-
++/*
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (dev->steering_support ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED)
+@@ -758,7 +758,7 @@
+ else
+ goto err;
+ }
+-
++*/
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+ if (err)
+ goto err;
+@@ -1060,6 +1060,7 @@
+
+ gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ?
+ GFP_NOIO : GFP_KERNEL;
++#if 0 /* Removed to allow Xeon Phi's use of ib_ipoib via CCL-Direct (ibp) */
+ /*
+ * We only support LSO, vendor flag1, and multicast loopback blocking,
+ * and only for kernel UD QPs.
+@@ -1084,6 +1085,7 @@
+ ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
+ init_attr->qp_type > IB_QPT_GSI)))
+ return ERR_PTR(-EINVAL);
++#endif /* if 0 */
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC_TGT:
+@@ -1120,9 +1122,11 @@
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
++#if 0 /* Removed to allow Xeon Phi's use of ib_ipoib via CCL-Direct (ibp) */
+ /* Userspace is not allowed to create special QPs: */
+ if (udata)
+ return ERR_PTR(-EINVAL);
++#endif /* if 0 */
+
+ err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
+ get_sqp_num(to_mdev(pd->device), init_attr),
+diff -urN a4/drivers/infiniband/hw/mlx5/qp.c a5/drivers/infiniband/hw/mlx5/qp.c
+--- a4/drivers/infiniband/hw/mlx5/qp.c 2015-01-05 14:35:38.065536002 -0800
++++ a5/drivers/infiniband/hw/mlx5/qp.c 2015-01-05 14:46:41.322508063 -0800
+@@ -852,6 +852,9 @@
+ }
+
+ if (pd) {
++ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
++ qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
++
+ if (pd->uobject) {
+ mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
+ if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
--- /dev/null
+From 129a1e301d8567b8d79abe19fd2d998738951cda Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Tue, 4 Feb 2014 12:23:56 -0800
+Subject: [PATCH 06/12] add scif.h to the include directory matching the
+ location used by the MPSS installation
+
+Signed-off-by: Phil Cayton <phil.cayton@intel.com>
+---
+diff -urN a5/include/modules/scif.h a6/include/modules/scif.h
+--- a5/include/modules/scif.h 1969-12-31 16:00:00.000000000 -0800
++++ a6/include/modules/scif.h 2015-01-05 14:59:07.370476637 -0800
+@@ -0,0 +1,1748 @@
++/*
++ * Copyright 2010-2013 Intel Corporation.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License, version 2,
++ * as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
++ *
++ * Disclaimer: The codes contained in these modules may be specific to
++ * the Intel Software Development Platform codenamed Knights Ferry,
++ * and the Intel product codenamed Knights Corner, and are not backward
++ * compatible with other Intel products. Additionally, Intel will NOT
++ * support the codes or instruction set in future products.
++ *
++ * Intel offers no warranty of any kind regarding the code. This code is
++ * licensed on an "AS IS" basis and Intel is not obligated to provide
++ * any support, assistance, installation, training, or other services
++ * of any kind. Intel is also not obligated to provide any updates,
++ * enhancements or extensions. Intel specifically disclaims any warranty
++ * of merchantability, non-infringement, fitness for any particular
++ * purpose, and any other warranty.
++ *
++ * Further, Intel disclaims all liability of any kind, including but
++ * not limited to liability for infringement of any proprietary rights,
++ * relating to the use of the code, even if Intel is notified of the
++ * possibility of such liability. Except as expressly stated in an Intel
++ * license agreement provided with this code and agreed upon with Intel,
++ * no license, express or implied, by estoppel or otherwise, to any
++ * intellectual property rights is granted herein.
++ */
++
++/*
++ * Revised 15:05 11/24/2010
++ * Derived from SCIF SAS v0.41 with additional corrections
++ */
++
++#ifndef __SCIF_H__
++#define __SCIF_H__
++
++#include <linux/types.h>
++#include <linux/errno.h>
++#include <linux/poll.h>
++#include <linux/pci.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++#define SCIF_ACCEPT_SYNC 1
++#define SCIF_SEND_BLOCK 1
++#define SCIF_RECV_BLOCK 1
++
++/**
++ * The purpose of SCIF_VERSION is to check for compatibility between host and
++ * card SCIF modules. This version should be incremented whenever any changes
++ * are made to the SCIF driver code that is common to both card and the host.
++ * Whenever this version is incremented, SCIF_LIB_VERSION in user mode libscif
++ * scif.h file should be incremented and vice versa. Both the versions should
++ * always match.
++ */
++#define SCIF_VERSION 1
++
++/* Start: Deprecated Temporary definition for compatibility */
++#define ACCEPT_SYNC SCIF_ACCEPT_SYNC
++#define SEND_BLOCK SCIF_SEND_BLOCK
++#define RECV_BLOCK SCIF_RECV_BLOCK
++/* End: Deprecated Temporary definition for compatibility */
++
++enum {
++ SCIF_PROT_READ = (1<<0),
++ SCIF_PROT_WRITE = (1<<1)
++};
++
++enum {
++ SCIF_MAP_FIXED = 0x10,
++ SCIF_MAP_KERNEL = 0x20
++};
++
++enum {
++ SCIF_FENCE_INIT_SELF = (1<<0),
++ SCIF_FENCE_INIT_PEER = (1<<1)
++};
++
++enum {
++ SCIF_FENCE_RAS_SELF = (1<<2),
++ SCIF_FENCE_RAS_PEER = (1<<3)
++};
++
++enum {
++ SCIF_SIGNAL_LOCAL = (1<<4),
++ SCIF_SIGNAL_REMOTE = (1<<5)
++};
++
++#define SCIF_RMA_USECPU 1
++#define SCIF_RMA_USECACHE (1<<1)
++#define SCIF_RMA_SYNC (1<<2)
++#define SCIF_RMA_ORDERED (1<<3)
++//! @cond (Prevent doxygen from including these)
++#define SCIF_POLLIN POLLIN
++#define SCIF_POLLOUT POLLOUT
++#define SCIF_POLLERR POLLERR
++#define SCIF_POLLHUP POLLHUP
++#define SCIF_POLLNVAL POLLNVAL
++
++/* SCIF Reserved Ports */
++/* COI */
++#define SCIF_COI_PORT_0 40
++#define SCIF_COI_PORT_1 41
++#define SCIF_COI_PORT_2 42
++#define SCIF_COI_PORT_3 43
++#define SCIF_COI_PORT_4 44
++#define SCIF_COI_PORT_5 45
++#define SCIF_COI_PORT_6 46
++#define SCIF_COI_PORT_7 47
++#define SCIF_COI_PORT_8 48
++#define SCIF_COI_PORT_9 49
++
++/* OFED */
++#define SCIF_OFED_PORT_0 60
++#define SCIF_OFED_PORT_1 61
++#define SCIF_OFED_PORT_2 62
++#define SCIF_OFED_PORT_3 63
++#define SCIF_OFED_PORT_4 64
++#define SCIF_OFED_PORT_5 65
++#define SCIF_OFED_PORT_6 66
++#define SCIF_OFED_PORT_7 67
++#define SCIF_OFED_PORT_8 68
++#define SCIF_OFED_PORT_9 69
++
++/* NETDEV */
++#define SCIF_NETDEV_PORT_0 80
++#define SCIF_NETDEV_PORT_1 81
++#define SCIF_NETDEV_PORT_2 82
++#define SCIF_NETDEV_PORT_3 83
++#define SCIF_NETDEV_PORT_4 84
++#define SCIF_NETDEV_PORT_5 85
++#define SCIF_NETDEV_PORT_6 86
++#define SCIF_NETDEV_PORT_7 87
++#define SCIF_NETDEV_PORT_8 88
++#define SCIF_NETDEV_PORT_9 89
++
++/* RAS */
++#define SCIF_RAS_PORT_0 100
++#define SCIF_RAS_PORT_1 101
++#define SCIF_RAS_PORT_2 102
++#define SCIF_RAS_PORT_3 103
++#define SCIF_RAS_PORT_4 104
++#define SCIF_RAS_PORT_5 105
++#define SCIF_RAS_PORT_6 106
++#define SCIF_RAS_PORT_7 107
++#define SCIF_RAS_PORT_8 108
++#define SCIF_RAS_PORT_9 109
++
++/* Power Management */
++#define SCIF_PM_PORT_0 120
++#define SCIF_PM_PORT_1 121
++#define SCIF_PM_PORT_2 122
++#define SCIF_PM_PORT_3 123
++#define SCIF_PM_PORT_4 124
++#define SCIF_PM_PORT_5 125
++#define SCIF_PM_PORT_6 126
++#define SCIF_PM_PORT_7 127
++#define SCIF_PM_PORT_8 128
++#define SCIF_PM_PORT_9 129
++
++/* Board Tools */
++#define SCIF_BT_PORT_0 130
++#define SCIF_BT_PORT_1 131
++#define SCIF_BT_PORT_2 132
++#define SCIF_BT_PORT_3 133
++#define SCIF_BT_PORT_4 134
++#define SCIF_BT_PORT_5 135
++#define SCIF_BT_PORT_6 136
++#define SCIF_BT_PORT_7 137
++#define SCIF_BT_PORT_8 138
++#define SCIF_BT_PORT_9 139
++
++/* MIC Boot/Configuration support */
++#define MPSSD_DOWNLOAD 160
++#define MIC_NOTIFY 161
++
++#define SCIF_ADMIN_PORT_END 1024
++
++/* MYO */
++#define SCIF_MYO_PORT_0 1025
++#define SCIF_MYO_PORT_1 1026
++#define SCIF_MYO_PORT_2 1027
++#define SCIF_MYO_PORT_3 1028
++#define SCIF_MYO_PORT_4 1029
++#define SCIF_MYO_PORT_5 1030
++#define SCIF_MYO_PORT_6 1031
++#define SCIF_MYO_PORT_7 1032
++#define SCIF_MYO_PORT_8 1033
++#define SCIF_MYO_PORT_9 1034
++
++/* SSG Tools */
++#define SCIF_ST_PORT_0 1044
++#define SCIF_ST_PORT_1 1045
++#define SCIF_ST_PORT_2 1046
++#define SCIF_ST_PORT_3 1047
++#define SCIF_ST_PORT_4 1048
++#define SCIF_ST_PORT_5 1049
++#define SCIF_ST_PORT_6 1050
++#define SCIF_ST_PORT_7 1051
++#define SCIF_ST_PORT_8 1052
++#define SCIF_ST_PORT_9 1053
++
++/* End of SCIF Reserved Ports */
++#define SCIF_PORT_RSVD 1088
++//! @endcond
++
++typedef struct endpt *scif_epd_t;
++
++typedef struct scif_pinned_pages *scif_pinned_pages_t;
++
++struct scif_range {
++ void *cookie; /* cookie */
++ int nr_pages; /* Number of Pages */
++ int prot_flags; /* R/W protection */
++ /* Arrays phys_addr/va below are virtually contiguous */
++ dma_addr_t *phys_addr; /* Array of physical addresses */
++ void **va; /* Array of virtual addresses
++ * and populated only when called
++ * on the host for a remote SCIF
++ * connection on MIC.
++ */
++};
++
++struct scif_pollepd {
++ scif_epd_t epd; /* endpoint descriptor */
++ short events; /* requested events */
++ short revents; /* returned events */
++};
++enum scif_event_type {
++ SCIF_NODE_ADDED = 1<<0,
++ SCIF_NODE_REMOVED = 1<<1
++};
++
++union eventd {
++ uint16_t scif_node_added;
++ uint16_t scif_node_removed;
++};
++
++typedef void (*scif_callback_t)(enum scif_event_type event, union eventd
++data);
++
++struct scif_callback {
++ struct list_head list_member;
++ scif_callback_t callback_handler;
++};
++
++#define SCIF_OPEN_FAILED ((scif_epd_t)-1)
++#define SCIF_REGISTER_FAILED ((off_t)-1)
++#define SCIF_MMAP_FAILED ((void *)-1)
++
++struct scif_portID {
++ uint16_t node; /* node on which port resides */
++ uint16_t port; /* Local port number */
++};
++
++/* Start: Deprecated Temporary definition for compatibility */
++#define portID scif_portID
++typedef struct portID portID_t;
++/* End: Deprecated Temporary definition for compatibility */
++
++/**
++ * scif_open - Create an endpoint
++ *
++ *\return
++ * The scif_open() function creates a new endpoint.
++ *
++ * Upon successful completion, scif_open() returns an endpoint descriptor to
++ * be used in subsequent SCIF functions calls to refer to that endpoint;
++ * otherwise: in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is
++ * returned and errno is set to indicate the error; in kernel mode a NULL
++ * scif_epd_t is returned.
++ *
++ *\par Errors:
++ *- ENOMEM
++ * - Insufficient kernel memory was available.
++ *- ENXIO
++ * - Version mismatch between micscif driver and libscif.
++ */
++scif_epd_t scif_open(void);
++
++/**
++ * scif_bind - Bind an endpoint to a port
++ * \param epd endpoint descriptor
++ * \param pn port number
++ *
++ * scif_bind() binds endpoint epd to port pn, where pn is a port number on the
++ * local node. If pn is zero, a port number greater than or equal to
++ * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to
++ * exactly one local port. Ports less than 1024 when requested can only be bound
++ * by system (or root) processes or by processes executed by privileged users.
++ *
++ *\return
++ * Upon successful completion, scif_bind() returns the port number to which epd
++ * is bound; otherwise: in user mode -1 is returned and errno is set to
++ * indicate the error; in kernel mode the negative of one of the following
++ * errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - The endpoint or the port are already bound.
++ *- EISCONN
++ * - The endpoint is already connected.
++ *- ENOSPC
++ * - No port number available for assignment (when pn==0).
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- EACCES
++ * - The port requested is protected and the user is not the superuser.
++*/
++int scif_bind(scif_epd_t epd, uint16_t pn);
++
++/**
++ * scif_listen - Listen for connections on an endpoint
++ *
++ * \param epd endpoint descriptor
++ * \param backlog maximum pending connection requests
++ *
++ * scif_listen() marks the endpoint epd as a listening endpoint - that is, as
++ * an endpoint that will be used to accept incoming connection requests. Once
++ * so marked, the endpoint is said to be in the listening state and may not be
++ * used as the endpoint of a connection.
++ *
++ * The endpoint, epd, must have been bound to a port.
++ *
++ * The backlog argument defines the maximum length to which the queue of
++ * pending connections for epd may grow. If a connection request arrives when
++ * the queue is full, the client may receive an error with an indication that
++ * the connection was refused.
++ *
++ *\return
++ * Upon successful completion, scif_listen() returns 0; otherwise: in user mode
++ * -1 is returned and errno is set to indicate the error; in kernel mode the
++ * negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - The endpoint is not bound to a port
++ *- EISCONN
++ * - The endpoint is already connected or listening
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++*/
++int scif_listen(scif_epd_t epd, int backlog);
++
++/**
++ * scif_connect - Initiate a connection on a port
++ * \param epd endpoint descriptor
++ * \param dst global id of port to which to connect
++ *
++ * The scif_connect() function requests the connection of endpoint epd to remote
++ * port dst. If the connection is successful, a peer endpoint, bound to dst, is
++ * created on node dst.node. On successful return, the connection is complete.
++ *
++ * If the endpoint epd has not already been bound to a port, scif_connect()
++ * will bind it to an unused local port.
++ *
++ * A connection is terminated when an endpoint of the connection is closed,
++ * either explicitly by scif_close(), or when a process that owns one of the
++ * endpoints of a connection is terminated.
++ *
++ *\return
++ * Upon successful completion, scif_connect() returns the port ID to which the
++ * endpoint, epd, is bound; otherwise: in user mode -1 is returned and errno is
++ * set to indicate the error; in kernel mode the negative of one of the
++ * following errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNREFUSED
++ * - The destination was not listening for connections or refused the
++ * connection request.
++ *- EINTR
++ * - Interrupted function
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - dst.port is not a valid port ID
++ *- EISCONN
++ * - The endpoint is already connected
++ *- ENOBUFS
++ * - No buffer space is available
++ *- ENODEV
++ * - The destination node does not exist, or
++ * - The node is lost.
++ *- ENOSPC
++ * - No port number available for assignment (when pn==0).
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- EOPNOTSUPP
++ * - The endpoint is listening and cannot be connected
++*/
++int scif_connect(scif_epd_t epd, struct scif_portID *dst);
++
++/**
++ * scif_accept - Accept a connection on an endpoint
++ * \param epd endpoint descriptor
++ * \param peer global id of port to which connected
++ * \param newepd new connected endpoint descriptor
++ * \param flags flags
++ *
++ * The scif_accept() call extracts the first connection request on the queue of
++ * pending connections for the port on which epd is listening. scif_accept()
++ * creates a new endpoint, bound to the same port as epd, and allocates a new
++ * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new
++ * endpoint is connected to the endpoint through which the connection was
++ * requested. epd is unaffected by this call, and remains in the listening
++ * state.
++ *
++ * On successful return, peer holds the global port identifier (node id and
++ * local port number) of the port which requested the connection.
++ *
++ * If the peer endpoint which requested the connection is closed, the endpoint
++ * returned by scif_accept() is closed.
++ *
++ * The number of connections that can (subsequently) be accepted on epd is only
++ * limited by system resources (memory).
++ *
++ * The flags argument is formed by OR'ing together zero or more of the
++ * following values:
++ *- SCIF_ACCEPT_SYNC: block until a connection request is presented. If
++ * SCIF_ACCEPT_SYNC is not in flags, and no pending
++ * connections are present on the queue, scif_accept()fails
++ * with an EAGAIN error
++ *
++ * On Linux in user mode, the select() and poll() functions can be used to
++ * determine when there is a connection request. On Microsoft Windows* and on
++ * Linux in kernel mode, the scif_poll() function may be used for this purpose.
++ * A readable event will be delivered when a connection is requested.
++ *
++ *\return
++ * Upon successful completion, scif_accept() returns 0; otherwise: in user mode
++ * -1 is returned and errno is set to indicate the error; in kernel mode the
++ * negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EAGAIN
++ * - SCIF_ACCEPT_SYNC is not set and no connections are present to be accepted, or
++ * - SCIF_ACCEPT_SYNC is not set and remote node failed to complete its
++ * connection request
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- EINTR
++ * - Interrupted function
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - epd is not a listening endpoint
++ * - flags is invalid
++ * - peer is NULL
++ * - newepd is NULL
++ *- ENOBUFS
++ * - No buffer space is available
++ *- ENODEV
++ * - The requesting node is lost.
++ *- ENOMEM
++ * - Not enough space
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENOENT
++ * - Secondary part of epd registration failed.
++*/
++int scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t
++*newepd, int flags);
++
++/**
++ * scif_close - Close an endpoint
++ * \param epd endpoint descriptor
++ *
++ * scif_close() closes an endpoint and performs necessary teardown of
++ * facilities associated with that endpoint.
++ *
++ * If epd is a listening endpoint then it will no longer accept connection
++ * requests on the port to which it is bound. Any pending connection requests
++ * are rejected.
++ *
++ * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs
++ * which are in-process through epd or its peer endpoint will complete before
++ * scif_close() returns. Registered windows of the local and peer endpoints are
++ * released as if scif_unregister() was called against each window.
++ *
++ * Closing an endpoint does not affect mappings to remote memory. These remain
++ * until explicitly removed by calling scif_munmap().
++ *
++ * If the peer endpoint's receive queue is not empty at the time that epd is
++ * closed, then the peer endpoint can be passed as the endpoint parameter to
++ * scif_recv() until the receive queue is empty.
++ *
++ * If epd is bound to a port, then the port is returned to the pool of
++ * available ports.
++ *
++ * epd is freed and may no longer be accessed.
++ *
++ *\return
++ * Upon successful completion, scif_close() returns 0; otherwise: in user mode
++ * -1 is returned and errno is set to indicate the error; in kernel mode the
++ * negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor
++ */
++int scif_close(scif_epd_t epd);
++
++/**
++ * scif_send - Send a message
++ * \param epd endpoint descriptor
++ * \param msg message buffer address
++ * \param len message length
++ * \param flags blocking mode flags
++ *
++ * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data
++ * are copied from memory starting at address msg. On successful execution the
++ * return value of scif_send() is the number of bytes that were sent, and is
++ * zero if no bytes were sent because len was zero. scif_send() may be called
++ * only when the endpoint is in a connected state.
++ *
++ * If a scif_send() call is non-blocking, then it sends only those bytes which
++ * can be sent without waiting, up to a maximum of len bytes.
++ *
++ * If a scif_send() call is blocking, then it normally returns after sending
++ * all len bytes. If a blocking call is interrupted or the connection is
++ * forcibly closed, the call is considered successful if some bytes were sent
++ * or len is zero, otherwise the call is considered unsuccessful.
++ *
++ * On Linux in user mode, the select() and poll() functions can be used to
++ * determine when the send queue is not full. On Microsoft Windows* and on
++ * Linux in kernel mode, the scif_poll() function may be used for this purpose.
++ *
++ * It is recommended that scif_send()/scif_recv() only be used for short
++ * control-type message communication between SCIF endpoints. The SCIF RMA
++ * APIs are expected to provide better performance for transfer sizes of
++ * 1024 bytes or longer.
++ *
++ * The flags argument is formed by ORing together zero or more of the following
++ * values:
++ *- SCIF_SEND_BLOCK: block until the entire message is sent.
++ *
++ *\return
++ * Upon successful completion, scif_send() returns the number of bytes sent;
++ * otherwise: in user mode -1 is returned and errno is set to indicate the
++ * error; in kernel mode the negative of one of the following errors is
++ * returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EFAULT
++ * - An invalid address was specified for a parameter.
++ *- EINTR
++ * - epd was closed by scif_close()
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - flags is invalid
++ * - len is negative
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOMEM
++ * - Not enough space
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ */
++int scif_send(scif_epd_t epd, void *msg, int len, int flags);
++
++/**
++ * scif_recv - Receive a message
++ * \param epd endpoint descriptor
++ * \param msg message buffer address
++ * \param len message buffer length
++ * \param flags blocking mode flags
++ *
++ * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of
++ * data are copied to memory starting at address msg. On successful execution
++ * the return value of scif_recv() is the number of bytes that were received,
++ * and is zero if no bytes were received because len was zero. scif_recv() may
++ * be called only when the endpoint is in a connected state.
++ *
++ * If a scif_recv() call is non-blocking, then it receives only those bytes
++ * which can be received without waiting, up to a maximum of len bytes.
++ *
++ * If a scif_recv() call is blocking, then it normally returns after receiving
++ * all len bytes. If a blocking call is interrupted or the connection is
++ * forcibly closed, the call is considered successful if some bytes were
++ * received or len is zero, otherwise the call is considered unsuccessful;
++ * subsequent calls to scif_recv() will successfully receive all data sent
++ * through the peer endpoint before the interruption or before the connection
++ * was forcibly closed.
++ *
++ * On Linux in user mode, the select() and poll() functions can be used to
++ * determine when data is available to be received. On Microsoft Windows* and
++ * on Linux in kernel mode, the scif_poll() function may be used for this
++ * purpose.
++ *
++ * It is recommended that scif_send()/scif_recv() only be used for short
++ * control-type message communication between SCIF endpoints. The SCIF RMA
++ * APIs are expected to provide better performance for transfer sizes of
++ * 1024 bytes or longer.
++ *
++ * The flags argument is formed by ORing together zero or more of the following
++ * values:
++ *- SCIF_RECV_BLOCK: block until the entire message is received.
++ *
++ *\return
++ * Upon successful completion, scif_recv() returns the number of bytes
++ * received; otherwise: in user mode -1 is returned and errno is set to
++ * indicate the error; in kernel mode the negative of one of the following
++ * errors is returned.
++ *
++ *\par Errors:
++ *- EAGAIN
++ * - The destination node is returning from a low power state.
++ *- EBADF
++ * - epd is not a valid endpoint descriptor.
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EFAULT
++ * - An invalid address was specified for a parameter.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - flags is invalid, or
++ * - len is negative.
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOMEM
++ * - Not enough space.
++ *- ENOTCONN
++ * - The endpoint is not connected.
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ */
++int scif_recv(scif_epd_t epd, void *msg, int len, int flags);
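++/*
++ * Illustrative (untested) sketch of connection setup and messaging in kernel
++ * mode. Error handling is omitted and the port number is an example only;
++ * the two halves run on different nodes:
++ *
++ *	// listening side (e.g. the host, node 0)
++ *	scif_epd_t lep = scif_open(), nep;
++ *	struct scif_portID peer;
++ *	int val;
++ *	scif_bind(lep, SCIF_OFED_PORT_0);
++ *	scif_listen(lep, 5);
++ *	scif_accept(lep, &peer, &nep, SCIF_ACCEPT_SYNC);
++ *	scif_recv(nep, &val, sizeof(val), SCIF_RECV_BLOCK);
++ *
++ *	// connecting side (a coprocessor node)
++ *	scif_epd_t cep = scif_open();
++ *	struct scif_portID dst = { .node = 0, .port = SCIF_OFED_PORT_0 };
++ *	int val = 42;
++ *	scif_connect(cep, &dst);
++ *	scif_send(cep, &val, sizeof(val), SCIF_SEND_BLOCK);
++ */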
++
++/**
++ * scif_register - Mark a memory region for remote access.
++ * \param epd endpoint descriptor
++ * \param addr starting virtual address
++ * \param len length of range
++ * \param offset offset of window
++ * \param prot_flags read/write protection flags
++ * \param map_flags mapping flags
++ *
++ * The scif_register() function opens a window, a range of whole pages of the
++ * registered address space of the endpoint epd, starting at offset po and
++ * continuing for len bytes. The value of po, further described below, is a
++ * function of the parameters offset and len, and the value of map_flags. Each
++ * page of the window represents the physical memory page which backs the
++ * corresponding page of the range of virtual address pages starting at addr
++ * and continuing for len bytes. addr and len are constrained to be multiples
++ * of the page size. addr is interpreted as a user space address. A successful
++ * scif_register() call returns po as the return value.
++ *
++ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
++ * exactly, and offset is constrained to be a multiple of the page size. The
++ * mapping established by scif_register() will not replace any existing
++ * registration; an error is returned if any page within the range [offset,
++ * offset+len-1] intersects an existing window.
++ * Note: When SCIF_MAP_FIXED is set the current implementation limits
++ * offset to the range [0..2^62-1] and returns EADDRINUSE if the offset
++ * requested with SCIF_MAP_FIXED is in the range [2^62..2^63-1].
++ *
++ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
++ * implementation-defined manner to arrive at po. The po value so chosen will
++ * be an area of the registered address space that the implementation deems
++ * suitable for a mapping of len bytes. An offset value of 0 is interpreted as
++ * granting the implementation complete freedom in selecting po, subject to
++ * constraints described below. A non-zero value of offset is taken to be a
++ * suggestion of an offset near which the mapping should be placed. When the
++ * implementation selects a value for po, it does not replace any extant
++ * window. In all cases, po will be a multiple of the page size.
++ *
++ * The physical pages which are so represented by a window are available for
++ * access in calls to scif_mmap(), scif_readfrom(), scif_writeto(),
++ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
++ * physical pages represented by the window will not be reused by the memory
++ * subsystem for any other purpose. Note that the same physical page may be
++ * represented by multiple windows.
++ *
++ * Subsequent operations which change the memory pages to which virtual
++ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and
++ * scif_munmap()) have no effect on existing windows.
++ *
++ * On Linux, if the process will fork(), it is recommended that the registered
++ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
++ * problems due to copy-on-write semantics.
++ *
++ * The prot_flags argument is formed by OR'ing together one or more of the
++ * following values:
++ *- SCIF_PROT_READ: allow read operations from the window
++ *- SCIF_PROT_WRITE: allow write operations to the window
++ *
++ * The map_flags argument is formed by OR'ing together zero or more of
++ * the following values:
++ *- SCIF_MAP_FIXED: interpret offset exactly
++ *
++ *\return
++ * Upon successful completion, scif_register() returns the offset at which the
++ * mapping was placed (po); otherwise: in user mode SCIF_REGISTER_FAILED (that
++ * is (off_t)-1) is returned and errno is set to indicate the error; in
++ * kernel mode the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EADDRINUSE
++ * - SCIF_MAP_FIXED is set in map_flags, and pages in the range [offset,
++ * offset+len-1] are already registered
++ *- EAGAIN
++ * - The mapping could not be performed due to lack of resources
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EFAULT
++ * - Addresses in the range [addr , addr + len - 1] are invalid
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - map_flags is invalid, or
++ * - prot_flags is invalid, or
++ * - SCIF_MAP_FIXED is set in flags, and offset is not a multiple of
++ * the page size, or
++ * - addr is not a multiple of the page size, or
++ * - len is not a multiple of the page size, or is 0, or
++ * - offset is negative
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOMEM
++ * - Not enough space
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ */
++off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset,
++int prot_flags, int map_flags);
++
++/**
++ * scif_unregister - Remove registration of a memory region.
++ * \param epd endpoint descriptor
++ * \param offset start of range to unregister
++ * \param len length of range to unregister
++ *
++ * The scif_unregister() function closes those previously registered windows
++ * which are entirely within the range [offset,offset+len-1]. It is an error to
++ * specify a range which intersects only a subrange of a window.
++ *
++ * On a successful return, pages within the window may no longer be specified
++ * in calls to scif_mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(),
++ * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, however,
++ * continues to exist until all previous references against it are removed. A
++ * window is referenced if there is a mapping to it created by scif_mmap(), or if
++ * scif_get_pages() was called against the window (and the pages have not been
++ * returned via scif_put_pages()). A window is also referenced while an RMA, in
++ * which some range of the window is a source or destination, is in progress.
++ * Finally a window is referenced while some offset in that window was specified
++ * to scif_fence_signal(), and the RMAs marked by that call to
++ * scif_fence_signal() have not completed. While a window is in this state, its
++ * registered address space pages are not available for use in a new registered
++ * window.
++ *
++ * When all such references to the window have been removed, its references to
++ * all the physical pages which it represents are removed. Similarly, the
++ * registered address space pages of the window become available for
++ * registration in a new window.
++ *
++ *\return
++ * Upon successful completion, scif_unregister() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned. In the event of an
++ * error, no windows are unregistered.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - The range [offset,offset+len-1] intersects a subrange of a window, or
++ * - offset is negative
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - Addresses in the range [offset,offset+len-1] are invalid for the
++ * registered address space of epd.
++ */
++int scif_unregister(scif_epd_t epd, off_t offset, size_t len);
++
++
++/**
++ * scif_readfrom - Copy from a remote address space
++ * \param epd endpoint descriptor
++ * \param loffset offset in local registered address space to
++ * which to copy
++ * \param len length of range to copy
++ * \param roffset offset in remote registered address space
++ * from which to copy
++ * \param rma_flags transfer mode flags
++ *
++ * scif_readfrom() copies len bytes from the remote registered address space of
++ * the peer of endpoint epd, starting at the offset roffset to the local
++ * registered address space of epd, starting at the offset loffset.
++ *
++ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+
++ * len-1] must be within some registered window or windows of the local and
++ * remote nodes respectively. A range may intersect multiple registered
++ * windows, but only if those windows are contiguous in the registered address
++ * space.
++ *
++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
++ * programmed read/writes. Otherwise the data is copied using DMA. If rma_-
++ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the
++ * transfer is complete. Otherwise, the transfer may be performed asynchron-
++ * ously. The order in which any two asynchronous RMA operations complete
++ * is non-deterministic. The synchronization functions, scif_fence_mark()/
++ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to
++ * the completion of asynchronous RMA operations.
++ *
++ * The DMA transfer of individual bytes is not guaranteed to complete in
++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
++ * cacheline or partial cacheline of the source range will become visible on
++ * the destination node after all other transferred data in the source
++ * range has become visible on the destination node.
++ *
++ * The optimal DMA performance will likely be realized if both
++ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
++ * performance will likely be realized if loffset and roffset are not
++ * cacheline aligned but are separated by some multiple of 64. The lowest level
++ * of performance is likely if loffset and roffset are not separated by a
++ * multiple of 64.
++ *
++ * The rma_flags argument is formed by ORing together zero or more of the
++ * following values:
++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
++ * engine.
++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
++ * transfer has completed. Passing this flag might result in
++ * the API busy waiting and consuming CPU cycles while the DMA
++ * transfer is in progress.
++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
++ * the source range becomes visible on the destination node
++ * after all other transferred data in the source range has
++ * become visible on the destination
++ *
++ *\return
++ * Upon successful completion, scif_readfrom() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EACCES
++ * - Attempt to write to a read-only range or read from a write-only range
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - rma_flags is invalid
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - The range [loffset,loffset+len-1] is invalid for the registered address
++ * space of epd, or
++ * - The range [roffset,roffset+len-1] is invalid for the registered address
++ * space of the peer of epd, or
++ * - loffset or roffset is negative
++*/
++int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t
++roffset, int rma_flags);
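
For illustration, a synchronous read between two already registered windows might look like the sketch below (names are placeholders; epd is assumed to be a connected endpoint):

    /* Sketch: pull `len` bytes from the peer's registered window at `roff`
     * into the local registered window at `loff`. SCIF_RMA_SYNC makes the
     * call return only after the transfer has completed. */
    static int read_peer_window(scif_epd_t epd, off_t loff, off_t roff, size_t len)
    {
            return scif_readfrom(epd, loff, len, roff, SCIF_RMA_SYNC);
    }
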
++
++/**
++ * scif_writeto - Copy to a remote address space
++ * \param epd endpoint descriptor
++ * \param loffset offset in local registered address space
++ * from which to copy
++ * \param len length of range to copy
++ * \param roffset offset in remote registered address space to
++ * which to copy
++ * \param rma_flags transfer mode flags
++ *
++ * scif_writeto() copies len bytes from the local registered address space of
++ * epd, starting at the offset loffset to the remote registered address space
++ * of the peer of endpoint epd, starting at the offset roffset.
++ *
++ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+
++ * len-1] must be within some registered window or windows of the local and
++ * remote nodes respectively. A range may intersect multiple registered
++ * windows, but only if those windows are contiguous in the registered address
++ * space.
++ *
++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
++ * programmed reads/writes. Otherwise the data is copied using DMA. If
++ * rma_flags includes SCIF_RMA_SYNC, then scif_writeto() will return after
++ * the transfer is complete. Otherwise, the transfer may be performed
++ * asynchronously. The order in which any two asynchronous RMA operations
++ * complete is non-deterministic. The synchronization functions,
++ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
++ * to synchronize to the completion of asynchronous RMA operations.
++ *
++ * The DMA transfer of individual bytes is not guaranteed to complete in
++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
++ * cacheline or partial cacheline of the source range will become visible on
++ * the destination node after all other transferred data in the source
++ * range has become visible on the destination node.
++ *
++ * The optimal DMA performance will likely be realized if both
++ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower
++ * performance will likely be realized if loffset and roffset are not cacheline
++ * aligned but are separated by some multiple of 64. The lowest level of
++ * performance is likely if loffset and roffset are not separated by a multiple
++ * of 64.
++ *
++ * The rma_flags argument is formed by ORing together zero or more of the
++ * following values:
++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
++ * engine.
++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
++ * transfer has completed. Passing this flag might result in
++ * the API busy waiting and consuming CPU cycles while the DMA
++ * transfer is in progress.
++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
++ * the source range becomes visible on the destination node
++ * after all other transferred data in the source range has
++ * become visible on the destination
++ *
++ *\return
++ * Upon successful completion, scif_writeto() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EACCES
++ * - Attempt to write to a read-only range or read from a write-only range
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - rma_flags is invalid
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - The range [loffset,loffset+len-1] is invalid for the registered address
++ * space of epd, or
++ * - The range [roffset,roffset+len-1] is invalid for the registered
++ * address space of the peer of epd, or
++ * - loffset or roffset is negative
++ */
++int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t
++roffset, int rma_flags);
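
A hypothetical producer that wants the peer to detect completion by polling the tail of the destination range could combine SCIF_RMA_SYNC with SCIF_RMA_ORDERED, as in this sketch (offsets are placeholders into existing windows):

    /* Sketch: write `len` bytes to the peer; with SCIF_RMA_ORDERED the last
     * (partial) cacheline of the source range becomes visible on the peer
     * only after all other transferred data is visible there. */
    static int write_ordered(scif_epd_t epd, off_t loff, off_t roff, size_t len)
    {
            return scif_writeto(epd, loff, len, roff,
                                SCIF_RMA_SYNC | SCIF_RMA_ORDERED);
    }
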
++
++/**
++ * scif_vreadfrom - Copy from a remote address space
++ * \param epd endpoint descriptor
++ * \param addr address to which to copy
++ * \param len length of range to copy
++ * \param roffset offset in remote registered address space
++ * from which to copy
++ * \param rma_flags transfer mode flags
++ *
++ * scif_vreadfrom() copies len bytes from the remote registered address
++ * space of the peer of endpoint epd, starting at the offset roffset, to local
++ * memory, starting at addr. addr is interpreted as a user space address.
++ *
++ * The specified range [roffset,roffset+len-1] must be within some registered
++ * window or windows of the remote node. The range may intersect
++ * multiple registered windows, but only if those windows are contiguous in the
++ * registered address space.
++ *
++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
++ * programmed reads/writes. Otherwise the data is copied using DMA. If
++ * rma_flags includes SCIF_RMA_SYNC, then scif_vreadfrom() will return after
++ * the transfer is complete. Otherwise, the transfer may be performed
++ * asynchronously. The order in which any two asynchronous RMA operations
++ * complete is non-deterministic. The synchronization functions,
++ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
++ * to synchronize to the completion of asynchronous RMA operations.
++ *
++ * The DMA transfer of individual bytes is not guaranteed to complete in
++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
++ * cacheline or partial cacheline of the source range will become visible on
++ * the destination node after all other transferred data in the source
++ * range has become visible on the destination node.
++ *
++ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
++ * the specified local memory range may remain in a pinned state even after
++ * the specified transfer completes. This may reduce overhead if some or all of
++ * the same virtual address range is referenced in a subsequent call of
++ * scif_vreadfrom() or scif_vwriteto().
++ *
++ * The optimal DMA performance will likely be realized if both
++ * addr and roffset are cacheline aligned (are a multiple of 64). Lower
++ * performance will likely be realized if addr and roffset are not
++ * cacheline aligned but are separated by some multiple of 64. The lowest level
++ * of performance is likely if addr and roffset are not separated by a
++ * multiple of 64.
++ *
++ * The rma_flags argument is formed by ORing together zero or more of the
++ * following values:
++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
++ * engine.
++ *- SCIF_RMA_USECACHE: enable registration caching
++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
++ * transfer has completed. Passing this flag might result in
++ * the API busy waiting and consuming CPU cycles while the DMA
++ * transfer is in progress.
++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
++ * the source range becomes visible on the destination node
++ * after all other transferred data in the source range has
++ * become visible on the destination
++ *
++ *\return
++ * Upon successful completion, scif_vreadfrom() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EACCES
++ * - Attempt to write to a read-only range or read from a write-only range
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EFAULT
++ * - Addresses in the range [addr,addr+len-1] are invalid
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - rma_flags is invalid
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - Addresses in the range [roffset,roffset+len-1] are invalid for the
++ * registered address space of the peer of epd.
++ */
++int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t roffset,
++int rma_flags);
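
As a sketch (buffer and offsets are placeholders), a caller that repeatedly reads into the same ordinary buffer can ask for registration caching so the backing pages stay pinned between calls:

    /* Sketch: read into an unregistered user buffer; SCIF_RMA_USECACHE lets
     * the implementation keep the buffer's pages pinned so later calls over
     * the same virtual range avoid re-pinning overhead. */
    static int vread_cached(scif_epd_t epd, void *buf, size_t len, off_t roff)
    {
            return scif_vreadfrom(epd, buf, len, roff,
                                  SCIF_RMA_SYNC | SCIF_RMA_USECACHE);
    }
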
++
++/**
++ * scif_vwriteto - Copy to a remote address space
++ * \param epd endpoint descriptor
++ * \param addr address from which to copy
++ * \param len length of range to copy
++ * \param roffset offset in remote registered address space to
++ * which to copy
++ * \param rma_flags transfer mode flags
++ *
++ * scif_vwriteto() copies len bytes from the local memory, starting at addr, to
++ * the remote registered address space of the peer of endpoint epd, starting at
++ * the offset roffset. addr is interpreted as a user space address.
++ *
++ * The specified range [roffset,roffset+len-1] must be within some registered
++ * window or windows of the remote node. The range may intersect
++ * multiple registered windows, but only if those windows are contiguous in the
++ * registered address space.
++ *
++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using
++ * programmed reads/writes. Otherwise the data is copied using DMA. If
++ * rma_flags includes SCIF_RMA_SYNC, then scif_vwriteto() will return after
++ * the transfer is complete. Otherwise, the transfer may be performed
++ * asynchronously. The order in which any two asynchronous RMA operations
++ * complete is non-deterministic. The synchronization functions,
++ * scif_fence_mark()/scif_fence_wait() and scif_fence_signal(), can be used
++ * to synchronize to the completion of asynchronous RMA operations.
++ *
++ * The DMA transfer of individual bytes is not guaranteed to complete in
++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last
++ * cacheline or partial cacheline of the source range will become visible on
++ * the destination node after all other transferred data in the source
++ * range has become visible on the destination node.
++ *
++ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back
++ * the specified local memory range may remain in a pinned state even after
++ * the specified transfer completes. This may reduce overhead if some or all of
++ * the same virtual address range is referenced in a subsequent call of
++ * scif_vreadfrom() or scif_vwriteto().
++ *
++ * The optimal DMA performance will likely be realized if both
++ * addr and roffset are cacheline aligned (are a multiple of 64). Lower
++ * performance will likely be realized if addr and roffset are not cacheline
++ * aligned but are separated by some multiple of 64. The lowest level of
++ * performance is likely if addr and roffset are not separated by a multiple
++ * of 64.
++ *
++ * The rma_flags argument is formed by ORing together zero or more of the
++ * following values:
++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA
++ * engine.
++ *- SCIF_RMA_USECACHE: allow registration caching
++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the
++ * transfer has completed. Passing this flag might result in
++ * the API busy waiting and consuming CPU cycles while the DMA
++ * transfer is in progress.
++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of
++ * the source range becomes visible on the destination node
++ * after all other transferred data in the source range has
++ * become visible on the destination
++ *
++ *\return
++ * Upon successful completion, scif_vwriteto() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EACCES
++ * - Attempt to write to a read-only range or read from a write-only range
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EFAULT
++ * - Addresses in the range [addr,addr+len-1] are invalid
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - rma_flags is invalid
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - Addresses in the range [roffset,roffset+len-1] are invalid for the
++ * registered address space of the peer of epd.
++ */
++int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t roffset,
++int rma_flags);
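
A small-message sender might prefer programmed I/O over the DMA engine; a sketch with illustrative names:

    /* Sketch: copy a small local buffer to the peer's registered window at
     * `roff` using the CPU (SCIF_RMA_USECPU) rather than the DMA engine. */
    static int vwrite_small(scif_epd_t epd, void *buf, size_t len, off_t roff)
    {
            return scif_vwriteto(epd, buf, len, roff, SCIF_RMA_USECPU);
    }
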
++
++/**
++ * scif_fence_mark - Mark previously issued RMAs
++ * \param epd endpoint descriptor
++ * \param flags control flags
++ * \param mark marked handle returned as output.
++ *
++ * scif_fence_mark() returns after marking the current set of all uncompleted
++ * RMAs initiated through the endpoint epd or the current set of all
++ * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are
++ * marked with a value returned at mark. The application may subsequently call
++ * scif_fence_wait(), passing the value returned at mark, to await completion
++ * of all RMAs so marked.
++ *
++ * The flags argument has exactly one of the following values:
++ *- SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint
++ * epd are marked
++ *- SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer
++ * of endpoint epd are marked
++ *
++ * \return
++ * Upon successful completion, scif_fence_mark() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - flags is invalid, or
++ * - epd is not a valid endpoint descriptor
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOMEM
++ * - Insufficient kernel memory was available.
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ */
++int scif_fence_mark(scif_epd_t epd, int flags, int *mark);
++
++/**
++ * scif_fence_wait - Wait for completion of marked RMAs
++ *
++ * \param epd endpoint descriptor
++ * \param mark mark request
++ *
++ * scif_fence_wait() returns after all RMAs marked with mark have completed.
++ * The value passed in mark must have been obtained in a previous call to
++ * scif_fence_mark().
++ *
++ *\return
++ * Upon successful completion, scif_fence_wait() returns 0; otherwise: in user
++ * mode -1 is returned and errno is set to indicate the error; in kernel mode
++ * the negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOMEM
++ * - Insufficient kernel memory was available.
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ */
++int scif_fence_wait(scif_epd_t epd, int mark);
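
Taken together with scif_fence_mark() above, a typical pattern is to issue asynchronous transfers and then fence on them before reusing the source windows; a minimal sketch:

    /* Sketch: mark every RMA already initiated through epd, then block until
     * all of the marked operations have completed. */
    static int drain_local_rmas(scif_epd_t epd)
    {
            int mark;
            int err = scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark);

            if (err)
                    return err;
            return scif_fence_wait(epd, mark);
    }
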
++
++/**
++ * scif_fence_signal - Request a signal on completion of RMAs
++ * \param epd		endpoint descriptor
++ * \param loff		local offset
++ * \param lval		local value to write to loff
++ * \param roff		remote offset
++ * \param rval		remote value to write to roff
++ * \param flags		flags
++ *
++ * scif_fence_signal() returns after marking the current set of all uncompleted
++ * RMAs initiated through the endpoint epd or marking the current set of all
++ * uncompleted RMAs initiated through the peer of endpoint epd.
++ *
++ * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the
++ * marked set, lval is written to memory at the address corresponding to offset
++ * loff in the local registered address space of epd. loff must be within a
++ * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion
++ * of the RMAs in the marked set, rval is written to memory at the address
++ * corresponding to offset roff in the remote registered address space of epd.
++ * roff must be within a remote registered window of the peer of epd. Note
++ * that any specified offset must be DWORD (4 byte / 32 bit) aligned.
++ *
++ * The flags argument is formed by OR'ing together the following:
++ *- Exactly one of the following values:
++ * - SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint
++ * epd are marked
++ * - SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer
++ * of endpoint epd are marked
++ *- One or more of the following values:
++ * - SCIF_SIGNAL_LOCAL: On completion of the marked set of RMAs, write lval to
++ * memory at the address corresponding to offset loff in the local registered
++ * address space of epd.
++ * - SCIF_SIGNAL_REMOTE: On completion of the marked set of RMAs, write rval to
++ * memory at the address corresponding to offset roff in the remote registered
++ * address space of epd.
++ *
++ *\return
++ * Upon successful completion, scif_fence_signal() returns 0; otherwise: in
++ * user mode -1 is returned and errno is set to indicate the error; in kernel
++ * mode the negative of one of the following errors is returned.
++ *\par Errors:
++ *- EBADF
++ * - epd is not a valid endpoint descriptor
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - flags is invalid, or
++ * - loff or roff is not DWORD aligned
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENOTTY
++ * - epd is not a valid endpoint descriptor
++ *- ENXIO
++ * - loff is invalid for the registered address space of epd, or
++ * - roff is invalid for the registered address space of the peer of epd
++ */
++int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff,
++uint64_t rval, int flags);
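
As an illustrative sketch, a sender can have a 64-bit completion flag written into its own registered window once its outstanding RMAs finish (loff must be 4-byte aligned and lie within a local registered window; the unused remote offset and value are passed as 0):

    /* Sketch: once every RMA initiated through epd has completed, the value
     * 1 is written at local registered offset `loff`; the application can
     * poll that location instead of blocking in scif_fence_wait(). */
    static int arm_completion_flag(scif_epd_t epd, off_t loff)
    {
            return scif_fence_signal(epd, loff, 1ULL, 0, 0ULL,
                                     SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL);
    }
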
++
++/**
++ * scif_get_nodeIDs - Return information about online nodes
++ * \param nodes array in which to return online node IDs
++ * \param len number of entries in the nodes array
++ * \param self address to place the node ID of the local node
++ *
++ * scif_get_nodeIDs() fills in the nodes array with up to len node IDs of the
++ * nodes in the SCIF network. If there is not enough space in nodes, as
++ * indicated by the len parameter, only len node IDs are returned in nodes. The
++ * return value of scif_get_nodeIDs() is the total number of nodes currently
++ * in the SCIF network. By checking the return value against the len
++ * parameter, the user may determine if enough space for nodes was allocated.
++ *
++ * The node ID of the local node is returned at self.
++ *
++ *\return
++ * Upon successful completion, scif_get_nodeIDs() returns the actual number of
++ * online nodes in the SCIF network including 'self'; otherwise: in user mode
++ * -1 is returned and errno is set to indicate the error; in kernel mode no
++ * errors are returned.
++ *
++ *\par Errors:
++ *- EFAULT
++ * - Bad address
++ */
++int scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self);
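
A sketch of the return-value check described above (the array size and retry convention are illustrative only):

    /* Sketch: query up to MAX_NODES node IDs; if the network holds more
     * nodes than fit in the array, the return value exceeds the array
     * length and the caller should retry with a larger array. */
    #define MAX_NODES 8

    static int probe_scif_network(void)
    {
            uint16_t nodes[MAX_NODES];
            uint16_t self;
            int total = scif_get_nodeIDs(nodes, MAX_NODES, &self);

            if (total < 0)
                    return total;           /* user mode: -1 with errno set */
            if (total > MAX_NODES)
                    return -1;              /* array too small; retry with more room */
            return total;                   /* online nodes, including self */
    }
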
++
++
++/**
++ * scif_pin_pages - Pin a set of pages
++ * \param addr Virtual address of range to pin
++ * \param len Length of range to pin
++ * \param prot_flags Page protection flags
++ * \param map_flags Page classification flags
++ * \param pinned_pages Opaque handle of pinned pages
++ *
++ * scif_pin_pages() pins (locks in physical memory) the physical pages which
++ * back the range of virtual address pages starting at addr and continuing for
++ * len bytes. addr and len are constrained to be multiples of the page size. A
++ * successful scif_pin_pages() call returns an opaque pointer value at
++ * pinned_pages which may be used in subsequent calls to
++ * scif_register_pinned_pages().
++ *
++ * The pages will remain pinned as long as there is a reference against the
++ * scif_pinned_pages_t value returned by scif_pin_pages() and until
++ * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A
++ * reference is added to a scif_pinned_pages_t value each time a window is
++ * created by calling scif_register_pinned_pages() and passing the
++ * scif_pinned_pages_t value. A reference is removed from a
++ * scif_pinned_pages_t value each time such a window is deleted.
++ *
++ * Subsequent operations which change the memory pages to which virtual
++ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and
++ * scif_munmap()) have no effect on the scif_pinned_pages_t value or windows
++ * created against it.
++ *
++ * On Linux, if the process will fork(), it is recommended that the registered
++ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent
++ * problems due to copy-on-write semantics.
++ *
++ * The prot_flags argument is formed by OR'ing together one or more of the
++ * following values:
++ *- SCIF_PROT_READ: allow read operations against the pages
++ *- SCIF_PROT_WRITE: allow write operations against the pages
++ * The map_flags argument is formed by OR'ing together zero or more of the
++ * following values:
++ *- SCIF_MAP_KERNEL: interpret addr as a kernel space address. By default, addr
++ * is interpreted as a user space address.
++ *
++ *\return
++ * Upon successful completion, scif_pin_pages() returns 0; otherwise the
++ * negative of one of the following errors is returned.
++ *\par Errors:
++ *- EFAULT
++ * - Addresses in the range [addr,addr+len-1] are invalid
++ *- EINVAL
++ * - prot_flags is invalid, or
++ * - map_flags is invalid
++ *- ENOMEM
++ * - Not enough space
++ */
++int
++scif_pin_pages(
++ void *addr,
++ size_t len,
++ int prot_flags,
++ int map_flags,
++ scif_pinned_pages_t *pinned_pages);
++
++/**
++ * scif_unpin_pages - Unpin a set of pages
++ * \param pinned_pages Opaque handle of pages to be unpinned
++ *
++ * scif_unpin_pages() prevents scif_register_pinned_pages() from registering new
++ * windows against pinned_pages. The physical pages represented by pinned_pages
++ * will remain pinned until all windows previously registered against
++ * pinned_pages are deleted (the window is scif_unregister()'d and all
++ * references to the window are removed; see scif_unregister()).
++ *
++ * pinned_pages must have been obtained from a previous call to scif_pin_pages().
++ * After calling scif_unpin_pages(), it is an error to pass pinned_pages to
++ * scif_register_pinned_pages().
++ *
++ *\return
++ * Upon successful completion, scif_unpin_pages() returns 0; otherwise the
++ * negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EINVAL
++ * - pinned_pages is not valid
++ */
++int
++scif_unpin_pages(
++ scif_pinned_pages_t pinned_pages);
++
++/**
++ * scif_register_pinned_pages - Mark a memory region for remote access.
++ * \param epd Endpoint descriptor
++ * \param pinned_pages Opaque handle of pinned pages
++ * \param offset Registered address space offset
++ * \param map_flags Flags which control where pages are mapped
++ *
++ * The scif_register_pinned_pages() function opens a window, a range of whole
++ * pages of the registered address space of the endpoint epd, starting at
++ * offset po. The value of po, further described below, is a function of the
++ * parameters offset and pinned_pages, and the value of map_flags. Each page of
++ * the window represents a corresponding physical memory page of the range
++ * represented by pinned_pages; the length of the window is the same as the
++ * length of the range represented by pinned_pages. A successful
++ * scif_register_pinned_pages() call returns po as the return value.
++ *
++ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset
++ * exactly, and offset is constrained to be a multiple of the page size. The
++ * mapping established by scif_register_pinned_pages() will not replace any
++ * existing registration; an error is returned if any page of the new window
++ * would intersect an existing window.
++ *
++ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an
++ * implementation-defined manner to arrive at po. The po so chosen will be an
++ * area of the registered address space that the implementation deems suitable
++ * for a mapping of the required size. An offset value of 0 is interpreted as
++ * granting the implementation complete freedom in selecting po, subject to
++ * constraints described below. A non-zero value of offset is taken to be a
++ * suggestion of an offset near which the mapping should be placed. When the
++ * implementation selects a value for po, it does not replace any extant
++ * window. In all cases, po will be a multiple of the page size.
++ *
++ * The physical pages which are so represented by a window are available for
++ * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(),
++ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the
++ * physical pages represented by the window will not be reused by the memory
++ * subsystem for any other purpose. Note that the same physical page may be
++ * represented by multiple windows.
++ *
++ * Windows created by scif_register_pinned_pages() are unregistered by
++ * scif_unregister().
++ *
++ * The map_flags argument is formed by OR'ing together zero or more of the
++ * following values:
++ *- SCIF_MAP_FIXED: interpret offset exactly
++ *
++ *\return
++ * Upon successful completion, scif_register_pinned_pages() returns the offset
++ * at which the mapping was placed (po); otherwise the negative of one of the
++ * following errors is returned.
++ *\par Errors:
++ *- EADDRINUSE
++ * - SCIF_MAP_FIXED is set in map_flags and pages in the new
++ * window would intersect an existing window
++ *- EAGAIN
++ * - The mapping could not be performed due to lack of resources
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - map_flags is invalid, or
++ * - SCIF_MAP_FIXED is set in map_flags, and offset is not a
++ * multiple of the page size, or
++ * - offset is negative
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOMEM
++ * - Not enough space
++ *- ENOTCONN
++ * - The endpoint is not connected
++ */
++off_t
++scif_register_pinned_pages(
++ scif_epd_t epd,
++ scif_pinned_pages_t pinned_pages,
++ off_t offset,
++ int map_flags);
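
A kernel-mode sketch combining scif_pin_pages() and scif_register_pinned_pages(); the buffer is assumed to be page aligned and a whole number of pages long, and the helper name is illustrative:

    /* Sketch: pin a kernel buffer and expose it through epd's registered
     * address space, letting the implementation pick the window offset
     * (SCIF_MAP_FIXED not set; offset 0 grants full freedom of placement). */
    static off_t expose_kernel_buffer(scif_epd_t epd, void *kbuf, size_t len)
    {
            scif_pinned_pages_t pages;
            off_t win;
            int err;

            err = scif_pin_pages(kbuf, len, SCIF_PROT_READ | SCIF_PROT_WRITE,
                                 SCIF_MAP_KERNEL, &pages);
            if (err < 0)
                    return err;

            win = scif_register_pinned_pages(epd, pages, 0, 0);
            if (win < 0)
                    scif_unpin_pages(pages);   /* no window was created; drop the pin */
            return win;
    }
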
++
++/**
++ * scif_get_pages - Add references to remote registered pages
++ * \param epd endpoint descriptor
++ * \param offset registered address space offset
++ * \param len length of range of pages
++ * \param pages returned scif_range structure
++ *
++ * scif_get_pages() returns the addresses of the physical pages represented by
++ * those pages of the registered address space of the peer of epd, starting at
++ * offset and continuing for len bytes. offset and len are constrained to be
++ * multiples of the page size.
++ *
++ * All of the pages in the specified range [offset,offset+len-1] must be within
++ * a single window of the registered address space of the peer of epd.
++ *
++ * The addresses are returned as a virtually contiguous array pointed to by the
++ * phys_addr component of the scif_range structure whose address is returned in
++ * pages. The nr_pages component of scif_range is the length of the array. The
++ * prot_flags component of scif_range holds the protection flag value passed
++ * when the pages were registered.
++ *
++ * Each physical page whose address is returned by scif_get_pages() remains
++ * available and will not be released for reuse until the scif_range structure
++ * is returned in a call to scif_put_pages(). The scif_range structure returned
++ * by scif_get_pages() must not be modified.
++ *
++ * It is an error to call scif_close() on an endpoint while any scif_range
++ * structure obtained from that endpoint has not yet been returned via
++ * scif_put_pages().
++ *
++ *\return
++ * Upon successful completion, scif_get_pages() returns 0; otherwise the
++ * negative of one of the following errors is returned.
++ *\par Errors:
++ *- ECONNRESET
++ * - A connection was forcibly closed by a peer.
++ *- EINVAL
++ * - epd is not a valid endpoint descriptor, or
++ * - offset is not a multiple of the page size, or
++ * - offset is negative, or
++ * - len is not a multiple of the page size
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected
++ *- ENXIO
++ * - Addresses in the range [offset,offset+len-1] are invalid
++ * for the registered address space of the peer of epd.
++ */
++int scif_get_pages(
++ scif_epd_t epd,
++ off_t offset,
++ size_t len,
++ struct scif_range **pages);
++
++/**
++ * scif_put_pages - Remove references from remote registered pages
++ * \param pages pages to be returned
++ *
++ * scif_put_pages() releases a scif_range structure previously obtained by
++ * calling scif_get_pages(). The physical pages represented by pages may
++ * be reused when the window which represented those pages is unregistered.
++ * Therefore, those pages must not be accessed after calling scif_put_pages().
++ *
++ *\return
++ * Upon successful completion, scif_put_pages() returns 0; otherwise the
++ * negative of one of the following errors is returned.
++ *\par Errors:
++ *- EINVAL
++ * - pages does not point to a valid scif_range structure, or
++ * - the scif_range structure pointed to by pages was already returned.
++ *- ENODEV
++ * - The remote node is lost.
++ *- ENOTCONN
++ * - The endpoint is not connected.
++ */
++int scif_put_pages(
++ struct scif_range *pages);
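
A kernel-mode sketch of the get/put pairing; the exact element types of the scif_range members beyond the names given above are not shown here, so the field accesses and casts are illustrative:

    /* Sketch: look up the physical pages behind one page of the peer's
     * window at `roff`, inspect the first entry, then release the
     * references so the peer may eventually unregister the window. */
    static int peek_peer_page(scif_epd_t epd, off_t roff)
    {
            struct scif_range *range;
            int err = scif_get_pages(epd, roff, PAGE_SIZE, &range);

            if (err)
                    return err;

            pr_info("peer window backed by %d page(s), first at %#llx\n",
                    (int)range->nr_pages,
                    (unsigned long long)range->phys_addr[0]);

            return scif_put_pages(range);
    }
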
++
++/**
++ * scif_poll - Wait for some event on an endpoint
++ * \param epds Array of endpoint descriptors
++ * \param nepds Length of epds
++ * \param timeout Upper limit on time for which scif_poll() will
++ * block
++ *
++ * scif_poll() waits for one of a set of endpoints to become ready to perform
++ * an I/O operation. scif_poll() exposes a subset of the functionality of the
++ * POSIX standard poll() function.
++ *
++ * The epds argument specifies the endpoint descriptors to be examined and the
++ * events of interest for each endpoint descriptor. epds is a pointer to an
++ * array with one member for each open endpoint descriptor of interest.
++ *
++ * The number of items in the epds array is specified in nepds. The epd field
++ * of scif_pollepd is an endpoint descriptor of an open endpoint. The field
++ * events is a bitmask specifying the events which the application is
++ * interested in. The field revents is an output parameter, filled by the
++ * kernel with the events that actually occurred. The bits returned in revents
++ * can include any of those specified in events, or one of the values
++ * SCIF_POLLERR, SCIF_POLLHUP, or SCIF_POLLNVAL. (These three bits are
++ * meaningless in the events field, and will be set in the revents field
++ * whenever the corresponding condition is true.)
++ *
++ * If none of the events requested (and no error) has occurred for any of the
++ * endpoint descriptors, then scif_poll() blocks until one of the events occurs.
++ *
++ * The timeout argument specifies an upper limit on the time for which
++ * scif_poll() will block, in milliseconds. Specifying a negative value in
++ * timeout means an infinite timeout.
++ *
++ * The following bits may be set in events and returned in revents:
++ *- SCIF_POLLIN: Data may be received without blocking. For a connected
++ * endpoint, this means that scif_recv() may be called without blocking. For a
++ * listening endpoint, this means that scif_accept() may be called without
++ * blocking.
++ *- SCIF_POLLOUT: Data may be sent without blocking. For a connected endpoint,
++ * this means that scif_send() may be called without blocking. This bit value
++ * has no meaning for a listening endpoint and is ignored if specified.
++ *
++ * The following bits are only returned in revents, and are ignored if set in
++ * events:
++ *- SCIF_POLLERR: An error occurred on the endpoint
++ *- SCIF_POLLHUP: The connection to the peer endpoint was disconnected
++ *- SCIF_POLLNVAL: The specified endpoint descriptor is invalid.
++ *
++ *\return
++ * Upon successful completion, scif_poll() returns a non-negative value. A
++ * positive value indicates the total number of endpoint descriptors that have
++ * been selected (that is, endpoint descriptors for which the revents member is
++ * non-zero). A value of 0 indicates that the call timed out and no endpoint
++ * descriptors have been selected. Otherwise: in user mode -1 is returned and
++ * errno is set to indicate the error; in kernel mode the negative of one of
++ * the following errors is returned.
++ *
++ *\par Errors:
++ *- EFAULT
++ * - The array given as argument was not contained in the calling program's
++ * address space.
++ *- EINTR
++ * - A signal occurred before any requested event.
++ *- EINVAL
++ * - The nepds argument is greater than {OPEN_MAX}
++ *- ENOMEM
++ * - There was no space to allocate file descriptor tables.
++*/
++int
++scif_poll(
++ struct scif_pollepd *epds,
++ unsigned int nepds,
++ long timeout);
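
A user-mode sketch waiting on two connected endpoints at once (one-second timeout; the field names follow the scif_pollepd description above, and the return convention is illustrative):

    /* Sketch: block for up to 1000 ms until either endpoint has data that
     * scif_recv() could read without blocking. */
    static int wait_for_input(scif_epd_t a, scif_epd_t b)
    {
            struct scif_pollepd epds[2] = {
                    { .epd = a, .events = SCIF_POLLIN },
                    { .epd = b, .events = SCIF_POLLIN },
            };
            int n = scif_poll(epds, 2, 1000);

            if (n <= 0)
                    return -1;              /* 0 = timeout, -1 = error (errno set) */
            return (epds[0].revents & SCIF_POLLIN) ? 0 : 1;
    }
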
++
++/**
++ * scif_event_register - Register an event handler
++ * \param handler Event handler to be registered
++ *
++ * scif_event_register() registers a routine, handler, to be called when some
++ * event occurs. The event parameter to handler indicates the type of event
++ * which has occurred, and the corresponding component of the data parameter to
++ * handler provides additional data about the event.
++ *
++ * The following events are defined:
++ *- SCIF_NODE_ADDED: A node has been added to the SCIF network. The
++ * scif_node_added component of the data parameter to handler identifies the
++ * node. This event is informational. There are no requirements on the event
++ * handler.
++ *- SCIF_NODE_REMOVED: A node is being removed from the SCIF network. The
++ * scif_node_removed component of the data parameter to handler identifies the
++ * node. Upon being called, and before returning, the event handler must
++ * return, using scif_put_pages(), all structures obtained using
++ * scif_get_pages() against an endpoint connected to the lost node. It is
++ * recommended and expected that the handler will also scif_close() all
++ * endpoints connected to the lost node.
++ *
++ *\return
++ * Upon successful completion scif_event_register() returns 0.
++ *
++ *\par Errors:
++ *- ENOMEM
++ * - There was no space to allocate file descriptor tables.
++*/
++
++int
++scif_event_register(
++ scif_callback_t handler);
++
++/**
++ * scif_event_unregister - Unregister event handler
++ * \param handler Event handler to be unregistered
++ *
++ * scif_event_unregister() unregisters the handler which was registered
++ * previously by using scif_event_register().
++ *
++ * WARNING: scif_event_unregister() must be called, for every handler that is
++ * registered, before the module that registered the handler exits.
++ * Failure to do so will result in a crash of the scif module.
++ *
++ *\return
++ * Upon successful completion scif_event_unregister() returns 0.
++ *\par Errors:
++ *- EINVAL
++ * - The event handler was not found (it was never registered).
++*/
++int
++scif_event_unregister(
++ scif_callback_t handler);
++
++/*
++ * Note: The caller can use pci_resource_start(dev, index) and
++ * pci_resource_len(dev, index) to obtain the PCI resource starting
++ * physical address and length for indexes whose va entry is non-NULL.
++ * MMIO BARs will not have IORESOURCE_PREFETCH set in the
++ * flags obtained from pci_resource_flags(dev, index). va[index]
++ * will be set to NULL for invalid resources.
++ */
++struct scif_pci_info {
++ /* pci_dev pointer associated with a node */
++ struct pci_dev *pdev;
++ /* Ioremapped virtual address base for every valid PCIe resource */
++ void __iomem *va[PCI_NUM_RESOURCES];
++};
++
++/**
++ * scif_pci_info - Populate the scif_pci_info structure for a node.
++ * \param node The node to query
++ * \param dev The scif_pci_info structure to populate.
++ *
++ * scif_pci_info() populates the provided scif_pci_info structure
++ * associated with a node. The requested node ID cannot be the same as
++ * the current node. This routine will only return success when called from
++ * the host.
++ *
++ *\return
++ * Upon successful completion, scif_pci_info() returns 0; otherwise the
++ * negative of one of the following errors is returned.
++ *
++ *\par Errors:
++ *- EINVAL
++ * - The requested node is not valid.
++ * - Called on MIC instead of the host.
++ *- ENODEV
++ * - No pci_dev association exists for the node.
++ */
++int
++scif_pci_info(
++ uint16_t node,
++ struct scif_pci_info *dev);
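
A host-side sketch based on the note above; only kernel helpers the note itself mentions are used, and the printout is illustrative:

    /* Sketch: query the PCI resources of remote node `node` and report the
     * length of BAR 0 if it was mapped. Only succeeds when run on the host. */
    static int report_node_bar0(uint16_t node)
    {
            struct scif_pci_info info;
            int err = scif_pci_info(node, &info);

            if (err)
                    return err;

            if (info.va[0])
                    pr_info("node %u BAR0 length %llu\n", node,
                            (unsigned long long)pci_resource_len(info.pdev, 0));
            return 0;
    }
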
++
++
++#ifdef __cplusplus
++} /* extern "C" */
++#endif
++
++#endif /* __SCIF_H__ */
--- /dev/null
+From a6d3fc7a6f6d3b3b621dfbd71babbff5ae58d1dd Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Wed, 28 May 2014 15:50:26 -0700
+Subject: [PATCH 07/13] Add CCL-Direct (ibp) drivers to Infiniband
+
+This includes the base ibp server module as well as
+the server modules for sa and cm
+
+Signed-off-by: Phil Cayton <phil.cayton@intel.com>
+---
+diff -urN a6/drivers/infiniband/ibp/cm/cm_ibp_abi.h a7/drivers/infiniband/ibp/cm/cm_ibp_abi.h
+--- a6/drivers/infiniband/ibp/cm/cm_ibp_abi.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/cm_ibp_abi.h 2015-02-23 10:01:30.289769309 -0800
+@@ -0,0 +1,399 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef CM_IBP_ABI_H
++#define CM_IBP_ABI_H
++
++#include <linux/types.h>
++#include <rdma/ib_verbs.h>
++#include <rdma/ib_cm.h>
++
++/* Increment this value if any changes break compatibility. */
++#define IBP_CM_ABI_VERSION 1
++
++/*
++ * Make sure that all structs defined in this file are laid out to pack
++ * the same way on different architectures to avoid incompatibility.
++ *
++ * Specifically:
++ * - Do not use pointer types -- pass pointers in a u64 instead.
++ * - Make sure that any structure larger than 4 bytes is padded
++ * to a multiple of 8 bytes; otherwise the structure size may
++ * be different between architectures.
++ */
++
++struct ibp_event_msg {
++ struct ibp_msg_header header;
++ u64 length;
++ u8 event[0];
++};
++
++
++struct ibp_sa_path_rec {
++ __be64 service_id;
++ u64 dgid_prefix;
++ u64 dgid_id;
++ u64 sgid_prefix;
++ u64 sgid_id;
++ __be16 dlid;
++ __be16 slid;
++ u32 raw_traffic;
++ __be32 flow_label;
++ u8 hop_limit;
++ u8 traffic_class;
++ u32 reversible;
++ u8 numb_path;
++ __be16 pkey;
++ __be16 qos_class;
++ u8 sl;
++ u8 mtu_selector;
++ u8 mtu;
++ u8 rate_selector;
++ u8 rate;
++ u8 packet_life_time_selector;
++ u8 packet_life_time;
++ u8 preference;
++};
++
++struct ibp_create_cm_id_cmd {
++ struct ibp_msg_header header;
++ u64 device;
++};
++
++struct ibp_create_cm_id_resp {
++ u64 ibp_cm_id;
++ __be64 service_id;
++ __be64 service_mask;
++ __be32 local_id;
++ __be32 remote_id;
++ u32 remote_cm_qpn;
++ u32 filler;
++};
++
++struct ibp_destroy_cm_id_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++};
++
++struct ibp_cm_listen_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ __be64 service_id;
++ __be64 service_mask;
++ u64 null_comp_data;
++ struct ib_cm_compare_data compare_data;
++};
++
++struct ibp_send_cm_req_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ struct ibp_sa_path_rec primary_path;
++ struct ibp_sa_path_rec alternate_path;
++ __be64 service_id;
++ u32 qp_num;
++ enum ib_qp_type qp_type;
++ u32 starting_psn;
++ u8 peer_to_peer;
++ u8 responder_resources;
++ u8 initiator_depth;
++ u8 remote_cm_response_timeout;
++ u8 flow_control;
++ u8 local_cm_response_timeout;
++ u8 retry_count;
++ u8 rnr_retry_count;
++ u8 max_cm_retries;
++ u8 srq;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_rep_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u32 qp_num;
++ u32 starting_psn;
++ u8 responder_resources;
++ u8 initiator_depth;
++ u8 failover_accepted;
++ u8 flow_control;
++ u8 rnr_retry_count;
++ u8 srq;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_rtu_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_dreq_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_drep_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_rej_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u64 reason;
++ u8 private_data_len;
++ u8 ari_length;
++ char data[0];
++};
++
++struct ibp_send_cm_mra_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u8 service_timeout;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_lap_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ struct ibp_sa_path_rec alternate_path;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_apr_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u64 status;
++ u8 private_data_len;
++ u8 info_length;
++ char data[0];
++};
++
++struct ibp_send_cm_sidr_req_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ struct ibp_sa_path_rec path;
++ __be64 service_id;
++ int timeout_ms;
++ u8 max_cm_retries;
++ u8 private_data_len;
++ char private_data[0];
++};
++
++struct ibp_send_cm_sidr_rep_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u32 qp_num;
++ u32 qkey;
++ u64 status;
++ u8 info_length;
++ u8 private_data_len;
++ char data[0];
++};
++
++struct ibp_cm_notify_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u64 event;
++};
++
++struct ibp_cm_init_qp_attr_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_cm_id;
++ u64 qp_attr_state;
++};
++
++struct ibp_cm_init_qp_attr_resp {
++ u64 qp_attr_mask;
++ u64 qp_access_flags;
++ u64 qp_state;
++ u64 cur_qp_state;
++ u64 path_mtu;
++ u64 path_mig_state;
++ u32 qkey;
++ u32 rq_psn;
++ u32 sq_psn;
++ u64 dest_qp_num;
++
++ u32 cap_max_send_wr;
++ u32 cap_max_recv_wr;
++ u32 cap_max_send_sge;
++ u32 cap_max_recv_sge;
++ u32 cap_max_inline_data;
++
++ u64 ah_attr_grh_dgid_subnet_prefix;
++ u64 ah_attr_grh_dgid_interface_id;
++ u32 ah_attr_grh_flow_label;
++ u8 ah_attr_grh_sgid_index;
++ u8 ah_attr_grh_hop_limit;
++ u8 ah_attr_grh_traffic_class;
++ u16 ah_attr_dlid;
++ u8 ah_attr_sl;
++ u8 ah_attr_src_path_bits;
++ u8 ah_attr_static_rate;
++ u8 ah_attr_ah_flags;
++ u8 ah_attr_port_num;
++
++ u64 alt_attr_grh_dgid_subnet_prefix;
++ u64 alt_attr_grh_dgid_interface_id;
++ u32 alt_attr_grh_flow_label;
++ u8 alt_attr_grh_sgid_index;
++ u8 alt_attr_grh_hop_limit;
++ u8 alt_attr_grh_traffic_class;
++ u16 alt_attr_dlid;
++ u8 alt_attr_sl;
++ u8 alt_attr_src_path_bits;
++ u8 alt_attr_static_rate;
++ u8 alt_attr_ah_flags;
++ u8 alt_attr_port_num;
++
++ u16 pkey_index;
++ u16 alt_pkey_index;
++ u8 en_sqd_async_notify;
++ u8 sq_draining;
++ u8 max_rd_atomic;
++ u8 max_dest_rd_atomic;
++ u8 min_rnr_timer;
++ u8 port_num;
++ u8 timeout;
++ u8 retry_cnt;
++ u8 rnr_retry;
++ u8 alt_port_num;
++ u8 alt_timeout;
++
++};
++
++struct ibp_cm_req_event_resp {
++ struct ibp_sa_path_rec primary_path;
++ struct ibp_sa_path_rec alternate_path;
++ u64 listen_id;
++ __be64 remote_ca_guid;
++ __u32 remote_qkey;
++ __u32 remote_qpn;
++ __u32 qp_type;
++ __u32 starting_psn;
++ __u8 responder_resources;
++ __u8 initiator_depth;
++ __u8 local_cm_response_timeout;
++ __u8 flow_control;
++ __u8 remote_cm_response_timeout;
++ __u8 retry_count;
++ __u8 rnr_retry_count;
++ __u8 srq;
++ __u8 port;
++ __u8 reserved[7];
++};
++
++struct ibp_cm_rep_event_resp {
++ __be64 remote_ca_guid;
++ __u32 remote_qkey;
++ __u32 remote_qpn;
++ __u32 starting_psn;
++ __u8 responder_resources;
++ __u8 initiator_depth;
++ __u8 target_ack_delay;
++ __u8 failover_accepted;
++ __u8 flow_control;
++ __u8 rnr_retry_count;
++ __u8 srq;
++ __u8 reserved[5];
++};
++
++struct ibp_cm_rej_event_resp {
++ __u32 reason;
++};
++
++struct ibp_cm_mra_event_resp {
++ __u8 timeout;
++ __u8 reserved[3];
++};
++
++struct ibp_cm_lap_event_resp {
++ struct ibp_sa_path_rec path;
++};
++
++struct ibp_cm_rtu_event_resp {
++ __u32 status;
++ __be32 local_id;
++ __be32 remote_id;
++};
++
++struct ibp_cm_apr_event_resp {
++ __u32 status;
++};
++
++struct ibp_cm_sidr_req_event_resp {
++ u64 listen_id;
++ __u16 pkey;
++ __u8 port;
++ __u8 reserved;
++};
++
++struct ibp_cm_sidr_rep_event_resp {
++ __u32 status;
++ __u32 qkey;
++ __u32 qpn;
++};
++
++struct ibp_cm_event {
++ enum ib_event_type event_type;
++ union {
++ struct ibp_cm_req_event_resp req_resp;
++ struct ibp_cm_rep_event_resp rep_resp;
++ struct ibp_cm_rej_event_resp rej_resp;
++ struct ibp_cm_rtu_event_resp rtu_resp;
++ struct ibp_cm_mra_event_resp mra_resp;
++ struct ibp_cm_lap_event_resp lap_resp;
++ struct ibp_cm_apr_event_resp apr_resp;
++ struct ibp_cm_sidr_req_event_resp sidr_req_resp;
++ struct ibp_cm_sidr_rep_event_resp sidr_rep_resp;
++
++ __u32 send_status;
++ } u;
++
++ u64 event_cm_id;
++ u64 ibp_cm_id;
++ u64 data_length;
++ u64 info_length;
++
++ u8 data[0];
++};
++
++#endif /* CM_IBP_ABI_H */
+diff -urN a6/drivers/infiniband/ibp/cm/cm_server_msg.c a7/drivers/infiniband/ibp/cm/cm_server_msg.c
+--- a6/drivers/infiniband/ibp/cm/cm_server_msg.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/cm_server_msg.c 2015-02-23 10:18:09.042820508 -0800
+@@ -0,0 +1,1058 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++
++LIST_HEAD(cm_entry_list);
++
++void ibp_copy_sa_path_rec(struct ibp_sa_path_rec *a, struct ib_sa_path_rec *b)
++{
++	/* Copy ibp_sa_path_rec to ib_sa_path_rec */
++ b->service_id = a->service_id;
++ b->dgid.global.subnet_prefix = a->dgid_prefix;
++ b->dgid.global.interface_id = a->dgid_id;
++ b->sgid.global.subnet_prefix = a->sgid_prefix;
++ b->sgid.global.interface_id = a->sgid_id;
++ b->dlid = a->dlid;
++ b->slid = a->slid;
++ b->raw_traffic = a->raw_traffic;
++ b->flow_label = a->flow_label;
++ b->hop_limit = a->hop_limit;
++ b->traffic_class = a->traffic_class;
++ b->reversible = a->reversible;
++ b->numb_path = a->numb_path;
++ b->pkey = a->pkey;
++ b->qos_class = a->qos_class;
++ b->sl = a->sl;
++ b->mtu_selector = a->mtu_selector;
++ b->mtu = a->mtu;
++ b->rate_selector = a->rate_selector;
++ b->rate = a->rate;
++ b->packet_life_time_selector = a->packet_life_time_selector;
++ b->packet_life_time = a->packet_life_time;
++ b->preference = a->preference;
++}
++
++void ib_copy_sa_path_rec(struct ibp_sa_path_rec *a, struct ib_sa_path_rec *b)
++{
++	/* Copy ib_sa_path_rec to ibp_sa_path_rec */
++ a->service_id = b->service_id;
++ a->dgid_prefix = b->dgid.global.subnet_prefix;
++ a->dgid_id = b->dgid.global.interface_id;
++ a->sgid_prefix = b->sgid.global.subnet_prefix;
++ a->sgid_id = b->sgid.global.interface_id;
++ a->dlid = b->dlid;
++ a->slid = b->slid;
++ a->raw_traffic = b->raw_traffic;
++ a->flow_label = b->flow_label;
++ a->hop_limit = b->hop_limit;
++ a->traffic_class = b->traffic_class;
++ a->reversible = b->reversible;
++ a->numb_path = b->numb_path;
++ a->pkey = b->pkey;
++ a->qos_class = b->qos_class;
++ a->sl = b->sl;
++ a->mtu_selector = b->mtu_selector;
++ a->mtu = b->mtu;
++ a->rate_selector = b->rate_selector;
++ a->rate = b->rate;
++ a->packet_life_time_selector = b->packet_life_time_selector;
++ a->packet_life_time = b->packet_life_time;
++ a->preference = b->preference;
++}
++
++void cleanup_cm_entry_list(void)
++{
++ struct cm_entry *entry;
++ struct cm_entry *next;
++
++ down_write(&list_rwsem);
++
++ list_for_each_entry_safe(entry, next, &cm_entry_list, list)
++ kfree(entry);
++
++ up_write(&list_rwsem);
++}
++
++static struct cm_entry *find_cm_entry(struct ib_cm_id *cm_id)
++{
++ struct cm_entry *entry;
++
++ down_read(&list_rwsem);
++
++ list_for_each_entry(entry, &cm_entry_list, list)
++ if (entry->cm_id == cm_id)
++ goto out;
++
++ print_err("Could not find cm id %p\n", cm_id);
++ entry = NULL;
++
++out:
++ up_read(&list_rwsem);
++
++ return entry;
++}
++
++/* find the entry id for the listen cm id so we can add the new cm id
++ * that is being accepted to the list so it can be found on future events
++ */
++static struct cm_entry *find_cm_entry_and_add(struct ib_cm_id *listen_id,
++ struct ib_cm_id *cm_id)
++{
++ struct cm_entry *entry;
++ struct cm_entry *listen_entry;
++
++ listen_entry = find_cm_entry(listen_id);
++ if (!listen_entry) {
++ print_err("Could not find listen id %p\n", listen_id);
++ return NULL;
++ }
++
++ entry = kzalloc(sizeof(struct cm_entry), GFP_KERNEL);
++ if (!entry) {
++ print_err("kzalloc failed\n");
++ return NULL;
++ }
++
++ entry->client = listen_entry->client;
++ entry->cm_id = cm_id;
++
++ down_write(&list_rwsem);
++ list_add(&entry->list, &cm_entry_list);
++ up_write(&list_rwsem);
++
++ return listen_entry;
++}
++
++static void ibp_event_req_get(struct ibp_cm_req_event_resp *proxy_req,
++ struct ib_cm_req_event_param *req)
++
++{
++ proxy_req->listen_id = (u64) req->listen_id;
++ proxy_req->remote_ca_guid = req->remote_ca_guid;
++ proxy_req->remote_qkey = req->remote_qkey;
++ proxy_req->remote_qpn = req->remote_qpn;
++ proxy_req->qp_type = req->qp_type;
++ proxy_req->starting_psn = req->starting_psn;
++ proxy_req->responder_resources = req->responder_resources;
++ proxy_req->initiator_depth = req->initiator_depth;
++ proxy_req->local_cm_response_timeout = req->local_cm_response_timeout;
++ proxy_req->flow_control = req->flow_control;
++ proxy_req->remote_cm_response_timeout = req->remote_cm_response_timeout;
++ proxy_req->retry_count = req->retry_count;
++ proxy_req->rnr_retry_count = req->rnr_retry_count;
++ proxy_req->srq = req->srq;
++ proxy_req->port = req->port;
++ ib_copy_sa_path_rec(&proxy_req->primary_path, req->primary_path);
++ if (req->alternate_path)
++ ib_copy_sa_path_rec(&proxy_req->alternate_path,
++ req->alternate_path);
++}
++
++static void ibp_event_rep_get(struct ibp_cm_rep_event_resp *proxy_rep,
++ struct ib_cm_rep_event_param *rep)
++{
++ proxy_rep->remote_ca_guid = rep->remote_ca_guid;
++ proxy_rep->remote_qkey = rep->remote_qkey;
++ proxy_rep->remote_qpn = rep->remote_qpn;
++ proxy_rep->starting_psn = rep->starting_psn;
++ proxy_rep->responder_resources = rep->responder_resources;
++ proxy_rep->initiator_depth = rep->initiator_depth;
++ proxy_rep->target_ack_delay = rep->target_ack_delay;
++ proxy_rep->failover_accepted = rep->failover_accepted;
++ proxy_rep->flow_control = rep->flow_control;
++ proxy_rep->rnr_retry_count = rep->rnr_retry_count;
++ proxy_rep->srq = rep->srq;
++}
++
++static
++void ibp_event_sidr_rep_get(struct ibp_cm_sidr_rep_event_resp *proxy_resp,
++ struct ib_cm_sidr_rep_event_param *rep)
++{
++ proxy_resp->status = rep->status;
++ proxy_resp->qkey = rep->qkey;
++ proxy_resp->qpn = rep->qpn;
++}
++
++static void ibp_event(struct work_struct *work)
++{
++ struct ibp_event *event_work;
++ struct ibp_event_msg *msg;
++ int msg_len;
++ int event_len;
++
++ print_trace("in\n");
++
++ event_work = (struct ibp_event *) work;
++
++ event_len = event_work->event.data_length +
++ event_work->event.info_length +
++ sizeof(struct ibp_cm_event);
++
++ msg_len = sizeof(struct ibp_event_msg) + event_len;
++
++ msg = kzalloc(msg_len, GFP_KERNEL);
++ if (!msg) {
++		print_err("kzalloc failed\n");
++ goto err;
++ }
++
++ memcpy(msg->event, &(event_work->event), event_len);
++ msg->length = event_len;
++
++ IBP_INIT_MSG(NULL, msg, msg_len, IBP_EVENT);
++
++ ibp_send(event_work->client->ep, msg, msg_len);
++err:
++ kfree(event_work);
++}
++
++static int ibp_event_handler(struct ib_cm_id *cm_id,
++ struct ib_cm_event *ib_cm_event)
++{
++ struct ibp_event *event_work;
++ struct ibp_client *client;
++ struct cm_entry *entry;
++ void *info = NULL;
++ int info_length = 0;
++ int data_length = 0;
++
++ print_trace("in\n");
++
++ switch (ib_cm_event->event) {
++ case IB_CM_REQ_RECEIVED:
++ data_length = IB_CM_REQ_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_REP_RECEIVED:
++ data_length = IB_CM_REP_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_RTU_RECEIVED:
++ data_length = IB_CM_RTU_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_DREQ_RECEIVED:
++ data_length = IB_CM_DREQ_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_DREP_RECEIVED:
++ data_length = IB_CM_DREP_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_MRA_RECEIVED:
++ data_length = IB_CM_MRA_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_REJ_RECEIVED:
++ data_length = IB_CM_REJ_PRIVATE_DATA_SIZE;
++ info_length = ib_cm_event->param.rej_rcvd.ari_length;
++ break;
++ case IB_CM_LAP_RECEIVED:
++ data_length = IB_CM_LAP_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_APR_RECEIVED:
++ data_length = IB_CM_APR_PRIVATE_DATA_SIZE;
++ info_length = ib_cm_event->param.apr_rcvd.info_len;
++ break;
++ case IB_CM_SIDR_REQ_RECEIVED:
++ data_length = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
++ break;
++ case IB_CM_SIDR_REP_RECEIVED:
++ data_length = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
++ info_length = ib_cm_event->param.sidr_rep_rcvd.info_len;
++ break;
++ default:
++ break;
++ }
++ event_work = kzalloc((sizeof(struct ibp_event)) +
++ data_length + info_length, GFP_KERNEL);
++ if (!event_work) {
++ print_err("kzalloc failed\n");
++ return -ENOMEM;
++ }
++
++ if (ib_cm_event->event == IB_CM_REQ_RECEIVED) {
++ struct ib_cm_req_event_param *param;
++ param = &ib_cm_event->param.req_rcvd;
++ entry = find_cm_entry_and_add(param->listen_id, cm_id);
++ } else if (ib_cm_event->event == IB_CM_SIDR_REQ_RECEIVED) {
++ struct ib_cm_sidr_req_event_param *param;
++ param = &ib_cm_event->param.sidr_req_rcvd;
++ entry = find_cm_entry_and_add(param->listen_id, cm_id);
++ } else
++ entry = find_cm_entry(cm_id);
++
++ if (!entry) {
++ kfree(event_work);
++ return -EINVAL;
++ }
++
++ client = entry->client;
++
++ event_work->client = client;
++ event_work->event.ibp_cm_id = (u64) entry->cm_id;
++ event_work->event.event_cm_id = (u64) cm_id;
++ event_work->event.event_type = ib_cm_event->event;
++ event_work->event.data_length = data_length;
++ event_work->event.info_length = info_length;
++
++ /* parse and copy the proper event */
++ switch (ib_cm_event->event) {
++ case IB_CM_REQ_RECEIVED:
++ print_dbg("IB_CM_REQ_RECEIVED (%d)\n", ib_cm_event->event);
++ ibp_event_req_get(&event_work->event.u.req_resp,
++ &ib_cm_event->param.req_rcvd);
++ break;
++ case IB_CM_REP_RECEIVED:
++ print_dbg("IB_CM_REP_RECEIVED (%d)\n", ib_cm_event->event);
++ ibp_event_rep_get(&event_work->event.u.rep_resp,
++ &ib_cm_event->param.rep_rcvd);
++ break;
++ case IB_CM_MRA_RECEIVED:
++ print_dbg("IB_CM_MRA_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.mra_resp.timeout =
++ ib_cm_event->param.mra_rcvd.service_timeout;
++ break;
++ case IB_CM_REJ_RECEIVED:
++ print_dbg("IB_CM_REJ_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.rej_resp.reason =
++ ib_cm_event->param.rej_rcvd.reason;
++ info = ib_cm_event->param.rej_rcvd.ari;
++ break;
++ case IB_CM_RTU_RECEIVED:
++ print_dbg("IB_CM_RTU_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.rtu_resp.status =
++ ib_cm_event->param.send_status;
++ event_work->event.u.rtu_resp.local_id = cm_id->local_id;
++ event_work->event.u.rtu_resp.remote_id = cm_id->remote_id;
++ break;
++ case IB_CM_LAP_RECEIVED:
++ print_dbg("IB_CM_LAP_RECEIVED (%d)\n", ib_cm_event->event);
++ ib_copy_sa_path_rec(&event_work->event.u.lap_resp.path,
++ ib_cm_event->param.lap_rcvd.alternate_path);
++ break;
++ case IB_CM_APR_RECEIVED:
++ print_dbg("IB_CM_APR_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.apr_resp.status =
++ ib_cm_event->param.apr_rcvd.ap_status;
++ info = ib_cm_event->param.apr_rcvd.apr_info;
++ break;
++ case IB_CM_SIDR_REQ_RECEIVED:
++ print_dbg("IB_CM_SIDR_REQ_RECEIVED (%d)\n",
++ ib_cm_event->event);
++ event_work->event.u.sidr_req_resp.listen_id =
++ (u64) ib_cm_event->param.sidr_req_rcvd.listen_id;
++ event_work->event.u.sidr_req_resp.pkey =
++ ib_cm_event->param.sidr_req_rcvd.pkey;
++ event_work->event.u.sidr_req_resp.port =
++ ib_cm_event->param.sidr_req_rcvd.port;
++ break;
++ case IB_CM_SIDR_REP_RECEIVED:
++ print_dbg("IB_CM_SIDR_REP_RECEIVED (%d)\n",
++ ib_cm_event->event);
++ ibp_event_sidr_rep_get(&event_work->event.u.sidr_rep_resp,
++ &ib_cm_event->param.sidr_rep_rcvd);
++ info = ib_cm_event->param.sidr_rep_rcvd.info;
++ break;
++ case IB_CM_TIMEWAIT_EXIT:
++ case IB_CM_REQ_ERROR:
++ case IB_CM_REP_ERROR:
++ case IB_CM_DREQ_ERROR:
++ case IB_CM_LAP_ERROR:
++ case IB_CM_SIDR_REQ_ERROR:
++ print_dbg("IB_CM_..._ERROR (%d)\n", ib_cm_event->event);
++ event_work->event.u.send_status =
++ ib_cm_event->param.send_status;
++ break;
++
++ case IB_CM_USER_ESTABLISHED:
++ print_dbg("IB_CM_USER_ESTABLISHED (%d)\n",
++ ib_cm_event->event);
++ event_work->event.u.send_status =
++ ib_cm_event->param.send_status;
++ break;
++ case IB_CM_DREQ_RECEIVED:
++ print_dbg("IB_CM_DREQ_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.send_status =
++ ib_cm_event->param.send_status;
++ break;
++ case IB_CM_DREP_RECEIVED:
++ print_dbg("IB_CM_DREP_RECEIVED (%d)\n", ib_cm_event->event);
++ event_work->event.u.send_status =
++ ib_cm_event->param.send_status;
++ break;
++ default:
++ print_dbg("event not handled %d\n", ib_cm_event->event);
++ break;
++ }
++
++ if (data_length)
++ memcpy(event_work->event.data, ib_cm_event->private_data,
++ data_length);
++
++ if (info_length)
++ memcpy(event_work->event.data + data_length, info, info_length);
++
++ INIT_WORK(&event_work->work, ibp_event);
++ queue_work(client->workqueue, &event_work->work);
++
++ return 0;
++}
++
++int ibp_cmd_create_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_create_cm_id_cmd *cmd;
++ struct ibp_create_cm_id_resp *resp;
++ struct ib_device *ib_device;
++ struct ib_cm_id *cm_id = NULL;
++ struct cm_entry *entry;
++ size_t len;
++ int status = 0;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_create_cm_id_cmd *) hdr;
++ ib_device = (struct ib_device *) cmd->device;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = kzalloc(sizeof(struct cm_entry), GFP_KERNEL);
++ if (!entry) {
++ print_err("kzalloc failed\n");
++ status = -ENOMEM;
++ goto send_resp;
++ }
++
++ cm_id = ib_create_cm_id(ib_device,
++ (ib_cm_handler) ibp_event_handler,
++ NULL);
++ if (IS_ERR(cm_id)) {
++ status = PTR_ERR(cm_id);
++ print_err("ib_create_cm_id returned %d\n", status);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++
++ resp = (struct ibp_create_cm_id_resp *) msg->data;
++
++ resp->ibp_cm_id = (u64) cm_id;
++ resp->service_id = cm_id->service_id;
++ resp->service_mask = cm_id->service_mask;
++ resp->local_id = cm_id->local_id;
++ resp->remote_id = cm_id->remote_id;
++ resp->remote_cm_qpn = cm_id->remote_cm_qpn;
++
++send_resp:
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, status);
++
++ ret = ibp_send(client->ep, msg, len);
++ if (ret) {
++ kfree(entry);
++ print_err("ibp_send returned %d\n", ret);
++ return ret;
++ }
++ if (status) {
++ kfree(entry);
++ return status;
++ }
++
++ entry->client = client;
++ entry->cm_id = cm_id;
++
++ down_write(&list_rwsem);
++ list_add(&entry->list, &cm_entry_list);
++ up_write(&list_rwsem);
++
++ return 0;
++}
++
++int ibp_cmd_destroy_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_destroy_cm_id_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct cm_entry *entry;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_destroy_cm_id_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = find_cm_entry(cm_id);
++ if (!entry)
++ goto send_resp;
++
++ down_write(&list_rwsem);
++ list_del(&entry->list);
++ up_write(&list_rwsem);
++
++ kfree(entry);
++
++ ib_destroy_cm_id(cm_id);
++
++send_resp:
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_cm_listen(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_cm_listen_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct ib_cm_compare_data *data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_cm_listen_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (!cmd->null_comp_data)
++ data = &(cmd->compare_data);
++
++ ret = ib_cm_listen(cm_id, cmd->service_id, cmd->service_mask, data);
++ if (ret)
++ print_err("ib_cm_listen returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_req(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_req_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct ib_cm_req_param param = {0};
++ struct ib_sa_path_rec primary_path;
++ struct ib_sa_path_rec alternate_path;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_req_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->alternate_path.pkey) {
++ param.alternate_path = &alternate_path;
++ ibp_copy_sa_path_rec(&cmd->alternate_path, &alternate_path);
++ }
++
++ param.primary_path = &primary_path;
++ ibp_copy_sa_path_rec(&cmd->primary_path, &primary_path);
++
++ param.service_id = cmd->service_id;
++ param.qp_num = cmd->qp_num;
++ param.qp_type = cmd->qp_type;
++ param.starting_psn = cmd->starting_psn;
++ param.peer_to_peer = cmd->peer_to_peer;
++ param.responder_resources = cmd->responder_resources;
++ param.initiator_depth = cmd->initiator_depth;
++ param.remote_cm_response_timeout = cmd->remote_cm_response_timeout;
++ param.flow_control = cmd->flow_control;
++ param.local_cm_response_timeout = cmd->local_cm_response_timeout;
++ param.retry_count = cmd->retry_count;
++ param.rnr_retry_count = cmd->rnr_retry_count;
++ param.max_cm_retries = cmd->max_cm_retries;
++ param.srq = cmd->srq;
++ param.private_data_len = cmd->private_data_len;
++
++ if (cmd->private_data_len)
++ param.private_data = cmd->private_data;
++
++ ret = ib_send_cm_req(cm_id, &param);
++
++ if (ret)
++ print_err("send_cm_req returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_rep(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_rep_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct ib_cm_rep_param param = {0};
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_rep_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ param.qp_num = cmd->qp_num;
++ param.starting_psn = cmd->starting_psn;
++ param.responder_resources = cmd->responder_resources;
++ param.initiator_depth = cmd->initiator_depth;
++ param.failover_accepted = cmd->failover_accepted;
++ param.rnr_retry_count = cmd->rnr_retry_count;
++ param.srq = cmd->srq;
++ param.private_data_len = cmd->private_data_len;
++
++ if (cmd->private_data_len)
++ param.private_data = cmd->private_data;
++
++ ret = ib_send_cm_rep(cm_id, &param);
++ if (ret)
++ print_err("send_cm_rep returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_rtu(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_send_cm_rtu_cmd *cmd;
++ struct ibp_response_msg *msg;
++ struct ib_cm_id *cm_id;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_rtu_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->private_data;
++
++ ret = ib_send_cm_rtu(cm_id, private_data, cmd->private_data_len);
++ if (ret)
++ print_err("send_cm_rtu returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_dreq(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_dreq_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_dreq_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->private_data;
++
++ ret = ib_send_cm_dreq(cm_id, private_data, cmd->private_data_len);
++ if (ret)
++ print_dbg("send_cm_dreq returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_drep(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_drep_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_drep_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->private_data;
++
++ ret = ib_send_cm_drep(cm_id, private_data, cmd->private_data_len);
++ if (ret)
++ print_dbg("send_cm_drep returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_rej(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_rej_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ void *ari;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_rej_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->data;
++
++ ari = &(cmd->data[cmd->private_data_len]);
++
++ ret = ib_send_cm_rej(cm_id, cmd->reason, ari, cmd->ari_length,
++ private_data, cmd->private_data_len);
++ if (ret)
++ print_err("send_cm_rej returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_mra(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_mra_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_mra_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->private_data;
++
++ ret = ib_send_cm_mra(cm_id, cmd->service_timeout,
++ private_data, cmd->private_data_len);
++ if (ret)
++ print_err("send_cm_mra returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_lap(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_lap_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct ib_sa_path_rec alt_path;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_lap_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->private_data;
++
++ ibp_copy_sa_path_rec(&cmd->alternate_path, &alt_path);
++
++ ret = ib_send_cm_lap(cm_id, &alt_path,
++ private_data, cmd->private_data_len);
++ if (ret)
++ print_err("send_cm_lap returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_send_cm_apr(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_apr_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ void *info = NULL;
++ void *private_data = NULL;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_apr_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ if (cmd->private_data_len)
++ private_data = cmd->data;
++ if (cmd->info_length)
++ info = &(cmd->data[cmd->private_data_len]);
++
++ ret = ib_send_cm_apr(cm_id, cmd->status, info, cmd->info_length,
++ private_data, cmd->private_data_len);
++ if (ret)
++ print_err("send_cm_apr returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int
++ibp_cmd_send_cm_sidr_req(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_sidr_req_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ struct ib_cm_sidr_req_param param = {0};
++ struct ib_sa_path_rec path;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_sidr_req_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ param.path = &path;
++ ibp_copy_sa_path_rec(&cmd->path, &path);
++
++ param.service_id = cmd->service_id;
++ param.timeout_ms = cmd->timeout_ms;
++ param.max_cm_retries = cmd->max_cm_retries;
++ param.private_data_len = cmd->private_data_len;
++
++ if (cmd->private_data_len)
++ param.private_data = cmd->private_data;
++
++ ret = ib_send_cm_sidr_req(cm_id, &param);
++ if (ret)
++ print_err("send_cm_sidr_req returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int
++ibp_cmd_send_cm_sidr_rep(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_send_cm_sidr_rep_cmd *cmd;
++ struct ib_cm_sidr_rep_param param = {0};
++ struct ib_cm_id *cm_id;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_send_cm_sidr_rep_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ param.qp_num = cmd->qp_num;
++ param.qkey = cmd->qkey;
++ param.status = cmd->status;
++ param.info_length = cmd->info_length;
++ param.private_data_len = cmd->private_data_len;
++
++ if (cmd->private_data_len)
++ param.private_data = cmd->data;
++ if (cmd->info_length)
++ param.info = &(cmd->data[cmd->private_data_len]);
++
++ ret = ib_send_cm_sidr_rep(cm_id, &param);
++ if (ret)
++ print_err("send_cm_sidr_rep returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int ibp_cmd_cm_notify(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_cm_notify_cmd *cmd;
++ struct ib_cm_id *cm_id;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_cm_notify_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_cm_notify(cm_id, cmd->event);
++ if (ret)
++ print_err("cm_notify returned %d\n", ret);
++
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
++
++int
++ibp_cmd_cm_init_qp_attr(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ struct ibp_cm_init_qp_attr_cmd *cmd;
++ struct ibp_cm_init_qp_attr_resp *resp;
++ struct ib_cm_id *cm_id;
++ struct ib_qp_attr qp_attr;
++ int qp_attr_mask;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_cm_init_qp_attr_cmd *) hdr;
++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id;
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ qp_attr.qp_state = cmd->qp_attr_state;
++
++ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
++ if (ret) {
++ print_err("init_qp_attr returned %d\n", ret);
++ goto send_resp;
++ }
++
++ /* Workaround to avoid modify_qp error from Xeon Phi IPoIB connected mode */
++ qp_attr_mask &= ~IB_QP_SMAC;
++
++ len += sizeof(*resp);
++
++ resp = (struct ibp_cm_init_qp_attr_resp *) msg->data;
++
++ resp->qp_attr_mask = qp_attr_mask;
++ resp->qp_access_flags = qp_attr.qp_access_flags;
++ resp->qp_state = qp_attr.qp_state;
++ resp->cur_qp_state = qp_attr.cur_qp_state;
++ resp->path_mtu = qp_attr.path_mtu;
++ resp->path_mig_state = qp_attr.path_mig_state;
++ resp->qkey = qp_attr.qkey;
++ resp->rq_psn = qp_attr.rq_psn;
++ resp->sq_psn = qp_attr.sq_psn;
++ resp->dest_qp_num = qp_attr.dest_qp_num;
++
++ resp->cap_max_send_wr = qp_attr.cap.max_send_wr;
++ resp->cap_max_recv_wr = qp_attr.cap.max_recv_wr;
++ resp->cap_max_send_sge = qp_attr.cap.max_send_sge;
++ resp->cap_max_recv_sge = qp_attr.cap.max_recv_sge;
++ resp->cap_max_inline_data = qp_attr.cap.max_inline_data;
++
++ resp->ah_attr_grh_dgid_subnet_prefix =
++ qp_attr.ah_attr.grh.dgid.global.subnet_prefix;
++ resp->ah_attr_grh_dgid_interface_id =
++ qp_attr.ah_attr.grh.dgid.global.interface_id;
++ resp->ah_attr_grh_flow_label = qp_attr.ah_attr.grh.flow_label;
++ resp->ah_attr_grh_sgid_index = qp_attr.ah_attr.grh.sgid_index;
++ resp->ah_attr_grh_hop_limit = qp_attr.ah_attr.grh.hop_limit;
++ resp->ah_attr_grh_traffic_class = qp_attr.ah_attr.grh.traffic_class;
++ resp->ah_attr_dlid = qp_attr.ah_attr.dlid;
++ resp->ah_attr_sl = qp_attr.ah_attr.sl;
++ resp->ah_attr_src_path_bits = qp_attr.ah_attr.src_path_bits;
++ resp->ah_attr_static_rate = qp_attr.ah_attr.static_rate;
++ resp->ah_attr_ah_flags = qp_attr.ah_attr.ah_flags;
++ resp->ah_attr_port_num = qp_attr.ah_attr.port_num;
++
++ resp->alt_attr_grh_dgid_subnet_prefix =
++ qp_attr.alt_ah_attr.grh.dgid.global.subnet_prefix;
++ resp->alt_attr_grh_dgid_interface_id =
++ qp_attr.alt_ah_attr.grh.dgid.global.interface_id;
++ resp->alt_attr_grh_flow_label = qp_attr.alt_ah_attr.grh.flow_label;
++ resp->alt_attr_grh_sgid_index = qp_attr.alt_ah_attr.grh.sgid_index;
++ resp->alt_attr_grh_hop_limit = qp_attr.alt_ah_attr.grh.hop_limit;
++ resp->alt_attr_grh_traffic_class
++ = qp_attr.alt_ah_attr.grh.traffic_class;
++ resp->alt_attr_dlid = qp_attr.alt_ah_attr.dlid;
++ resp->alt_attr_sl = qp_attr.alt_ah_attr.sl;
++ resp->alt_attr_src_path_bits = qp_attr.alt_ah_attr.src_path_bits;
++ resp->alt_attr_static_rate = qp_attr.alt_ah_attr.static_rate;
++ resp->alt_attr_ah_flags = qp_attr.alt_ah_attr.ah_flags;
++ resp->alt_attr_port_num = qp_attr.alt_ah_attr.port_num;
++
++ resp->pkey_index = qp_attr.pkey_index;
++ resp->alt_pkey_index = qp_attr.alt_pkey_index;
++ resp->en_sqd_async_notify = qp_attr.en_sqd_async_notify;
++ resp->sq_draining = qp_attr.sq_draining;
++ resp->max_rd_atomic = qp_attr.max_rd_atomic;
++ resp->max_dest_rd_atomic = qp_attr.max_dest_rd_atomic;
++ resp->min_rnr_timer = qp_attr.min_rnr_timer;
++ resp->port_num = qp_attr.port_num;
++ resp->timeout = qp_attr.timeout;
++ resp->retry_cnt = qp_attr.retry_cnt;
++ resp->rnr_retry = qp_attr.rnr_retry;
++ resp->alt_port_num = qp_attr.alt_port_num;
++ resp->alt_timeout = qp_attr.alt_timeout;
++
++send_resp:
++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret);
++
++ return ibp_send(client->ep, msg, len);
++}
+diff -urN a6/drivers/infiniband/ibp/cm/common.h a7/drivers/infiniband/ibp/cm/common.h
+--- a6/drivers/infiniband/ibp/cm/common.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/common.h 2015-02-23 10:01:30.289769309 -0800
+@@ -0,0 +1,106 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef COMMON_H
++#define COMMON_H
++
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/poll.h>
++#include <linux/mman.h>
++#include <linux/pci.h>
++#include <linux/net.h>
++#include <rdma/ib_verbs.h>
++#include <modules/scif.h>
++
++#define DRV_DESC "CCL Direct CM " DRV_ROLE
++#define DRV_VERSION "1.0"
++#define DRV_BASE "ibp_cm"
++#define PFX DRV_BASE "_"
++#define DRV_PFX DRV_NAME ": "
++
++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation"
++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n"
++
++#define MODULE_PARAM(name, var, type, value, desc) \
++ type var = value; \
++ module_param_named(name, var, type, 0644); \
++ MODULE_PARM_DESC(name, desc)
++
++#ifdef IBP_DEBUG
++extern int debug_level;
++#endif
++
++enum {
++ IBP_DEBUG_NONE,
++ IBP_DEBUG_TARGETED,
++ IBP_DEBUG_VERBOSE,
++};
++
++#define _PRINTK(l, f, arg...) \
++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg)
++
++#ifdef IBP_DEBUG
++#define PRINTK(dbg, l, f, arg...) \
++ do { \
++ if (debug_level >= dbg) \
++ printk(l DRV_PFX "%s(%d) " f, \
++ __func__, __LINE__, ##arg); \
++ } while (0)
++#else
++#define PRINTK(dbg, l, f, arg...) do { } while (0)
++#endif
++
++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg)
++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#define print_info(f, arg...) pr_info(f, ##arg)
++
++#if 0
++#define FORCED_FUNCTION_TRACING
++#endif
++
++#ifdef FORCED_FUNCTION_TRACING
++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#else
++#define print_trace(f, arg...) PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg)
++#endif
++
++#ifndef IBP_CM_PORT /* unique scif port for this service */
++#define IBP_CM_PORT SCIF_OFED_PORT_3
++#endif
++
++int ibp_send(scif_epd_t ep, void *buf, size_t len);
++int ibp_recv(scif_epd_t ep, void *buf, size_t len);
++
++#endif /* COMMON_H */
+diff -urN a6/drivers/infiniband/ibp/cm/ibp-abi.h a7/drivers/infiniband/ibp/cm/ibp-abi.h
+--- a6/drivers/infiniband/ibp/cm/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/ibp-abi.h 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,94 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_ABI_H
++#define IBP_ABI_H
++
++#include <linux/types.h>
++#include <rdma/ib_verbs.h>
++#include <rdma/ib_cm.h>
++
++/* Increment this value if any changes break compatibility. */
++#define IBP_CM_ABI_VERSION 1
++
++/* Client to server message enums. */
++enum {
++ IBP_CREATE_CM_ID,
++ IBP_DESTROY_CM_ID,
++ IBP_CM_LISTEN,
++ IBP_CM_NOTIFY,
++ IBP_SEND_CM_REQ,
++ IBP_SEND_CM_REP,
++ IBP_SEND_CM_RTU,
++ IBP_SEND_CM_DREQ,
++ IBP_SEND_CM_DREP,
++ IBP_SEND_CM_REJ,
++ IBP_SEND_CM_MRA,
++ IBP_SEND_CM_LAP,
++ IBP_SEND_CM_APR,
++ IBP_SEND_CM_SIDR_REQ,
++ IBP_SEND_CM_SIDR_REP,
++ IBP_CM_INIT_QP_ATTR,
++};
++
++/* Server to client message enums. */
++enum {
++ IBP_IBP_EVENT,
++ IBP_IBP_RESPONSE,
++};
++
++/*
++ * Make sure that all structs defined in this file are laid out to pack
++ * the same way on different architectures to avoid incompatibility.
++ *
++ * Specifically:
++ * - Do not use pointer types -- pass pointers in a u64 instead.
++ * - Make sure that any structure larger than 4 bytes is padded
++ * to a multiple of 8 bytes; otherwise the structure size may
++ * be different between architectures.
++ */
++
++struct ibp_msg_header { /* present in all messages */
++ u32 opcode;
++ u32 length;
++ u32 status;
++ u32 reserved;
++ u64 request;
++ u64 data[0];
++};
++
++struct ibp_response_msg {
++ struct ibp_msg_header header;
++ u64 data[0];
++};
++
++#endif /* IBP_ABI_H */
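
The packing rules in the comment above (no pointer types on the wire, every struct padded to a multiple of 8 bytes) are what keep the host and card sides layout-compatible. A minimal sketch, assuming only the structs shown above, of how a consumer of this ABI might assert those rules at compile time; the helper name below is hypothetical and this code is not part of the patch:

    #include <linux/bug.h>          /* BUILD_BUG_ON */
    #include "ibp-abi.h"

    /* Illustrative only: fails the build if a wire struct stops being a
     * multiple of 8 bytes, which would break layout agreement between
     * architectures. */
    static inline void ibp_cm_abi_layout_checks(void)
    {
            BUILD_BUG_ON(sizeof(struct ibp_msg_header) % 8);
            BUILD_BUG_ON(sizeof(struct ibp_response_msg) % 8);
    }
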
+diff -urN a6/drivers/infiniband/ibp/cm/ibp_exports.h a7/drivers/infiniband/ibp/cm/ibp_exports.h
+--- a6/drivers/infiniband/ibp/cm/ibp_exports.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/ibp_exports.h 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,50 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_EXPORTS_H
++#define IBP_EXPORTS_H
++
++#include <rdma/ib_verbs.h>
++
++/*
++ ibp_resolve_ib_device - Return the host ib_device handle
++ @ibdev: Card IB device
++
++ Upper level drivers may require the host ib_device handle associated
++ with the card ib_device. This routine resolves the card ib_device to
++ the corresponding host ib_device handle. A value of 0 is returned if
++ no match was found.
++*/
++u64 ibp_resolve_ib_device(struct ib_device *ibdev);
++
++
++#endif /* IBP_EXPORTS_H */
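
A minimal usage sketch for the export declared above; the caller and its error handling are hypothetical, and only ibp_resolve_ib_device() itself comes from this header:

    #include <linux/errno.h>
    #include "ibp_exports.h"

    /* Hypothetical upper-level driver helper: translate a card-side
     * ib_device into the host-side handle before placing it in a proxied
     * request. ibp_resolve_ib_device() returns 0 when no host device
     * matches. */
    static int example_lookup_host_device(struct ib_device *card_ibdev,
                                          u64 *host_handle)
    {
            u64 handle = ibp_resolve_ib_device(card_ibdev);

            if (!handle)
                    return -ENODEV;

            *host_handle = handle;
            return 0;
    }
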
+diff -urN a6/drivers/infiniband/ibp/cm/Makefile a7/drivers/infiniband/ibp/cm/Makefile
+--- a6/drivers/infiniband/ibp/cm/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/Makefile 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,21 @@
++KDIR ?= /lib/modules/`uname -r`/build
++
++obj-$(CONFIG_IBP_SERVER) += ibp_cm_server.o
++
++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG
++
++ibp_cm_server-y := server.o \
++ server_msg.o \
++ cm_server_msg.o
++
++default:
++ $(MAKE) -C $(KDIR) M=`pwd`
++
++modules_install:
++ $(MAKE) -C $(KDIR) M=`pwd` modules_install
++
++clean:
++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions
++
++unix:
++ dos2unix *.[ch] Kconfig Makefile
+diff -urN a6/drivers/infiniband/ibp/cm/server.c a7/drivers/infiniband/ibp/cm/server.c
+--- a6/drivers/infiniband/ibp/cm/server.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/server.c 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,221 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++
++MODULE_AUTHOR("Jerrie Coffman");
++MODULE_AUTHOR("Phil Cayton");
++MODULE_AUTHOR("Jay Sternberg");
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_DESCRIPTION(DRV_DESC);
++MODULE_VERSION(DRV_VERSION);
++
++MODULE_PARAM(port, port, int, IBP_CM_PORT, "Connection port");
++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog");
++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds");
++
++#ifdef IBP_DEBUG
++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all");
++#endif
++
++struct rw_semaphore list_rwsem;
++
++LIST_HEAD(client_list);
++
++static struct task_struct *listen_thread;
++
++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node)
++{
++ struct ibp_client *client;
++ int ret = -ENOMEM;
++
++ client = kzalloc(sizeof(*client), GFP_KERNEL);
++ if (!client) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(ret);
++ }
++
++ client->ep = ep;
++
++ client->rx_buf = (void *)__get_free_page(GFP_KERNEL);
++ if (!client->rx_buf) {
++ print_err("__get_free_page rx_buf failed\n");
++ goto err0;
++ }
++
++ client->tx_buf = (void *)__get_free_page(GFP_KERNEL);
++ if (!client->tx_buf) {
++ print_err("__get_free_page tx_buf failed\n");
++ goto err1;
++ }
++
++ client->workqueue = create_singlethread_workqueue(DRV_NAME);
++ if (!client->workqueue) {
++ print_err("create_singlethread_workqueue failed\n");
++ goto err2;
++ }
++
++ down_write(&list_rwsem);
++ list_add(&client->list, &client_list);
++ up_write(&list_rwsem);
++
++ client->ibp_cm_client_thread = kthread_run(ibp_process_recvs,
++ client, DRV_NAME);
++ if (IS_ERR(client->ibp_cm_client_thread)) {
++ print_err("create client thread failed\n");
++ goto err3;
++ }
++
++ return client;
++err3:
++ down_write(&list_rwsem);
++ list_del(&client->list);
++ up_write(&list_rwsem);
++
++ destroy_workqueue(client->workqueue);
++err2:
++ free_page((uintptr_t)client->tx_buf);
++err1:
++ free_page((uintptr_t)client->rx_buf);
++err0:
++ kfree(client);
++ return ERR_PTR(ret);
++}
++
++static int ibp_cm_listen(void *data)
++{
++ struct ibp_client *client;
++ struct scif_pollepd listen;
++ struct scif_portID peer;
++ scif_epd_t ep;
++ int ret;
++
++ listen.epd = scif_open();
++ if (!listen.epd) {
++ print_err("scif_open failed\n");
++ ret = -EIO;
++ goto err0;
++ }
++ listen.events = POLLIN;
++
++ ret = scif_bind(listen.epd, port);
++ if (ret < 0) {
++ print_err("scif_bind returned %d\n", ret);
++ goto err1;
++ }
++
++ ret = scif_listen(listen.epd, backlog);
++ if (ret) {
++ print_err("scif_listen returned %d\n", ret);
++ goto err1;
++ }
++
++ while (!kthread_should_stop()) {
++
++ schedule();
++
++ ret = scif_poll(&listen, 1, timeout);
++ if (ret == 0) /* timeout */
++ continue;
++ if (ret < 0) {
++ print_err("scif_poll revents 0x%x\n", listen.revents);
++ continue;
++ }
++
++ ret = scif_accept(listen.epd, &peer, &ep, 0);
++ if (ret) {
++ print_err("scif_accept returned %d\n", ret);
++ continue;
++ }
++
++ print_dbg("accepted node %d port %d\n", peer.node, peer.port);
++
++ client = ibp_create_client(ep, peer.node);
++ if (IS_ERR(client)) {
++ ret = PTR_ERR(client);
++ print_err("ibp_create_client returned %d\n", ret);
++ scif_close(ep);
++ }
++ }
++err1:
++ scif_close(listen.epd);
++err0:
++ return ret;
++}
++
++static int __init ibp_cm_server_init(void)
++{
++ int ret = 0;
++
++ print_info(DRV_SIGNON);
++
++ init_rwsem(&list_rwsem);
++
++ /* Start a thread for inbound connections. */
++ listen_thread = kthread_run(ibp_cm_listen, NULL, DRV_NAME);
++ if (IS_ERR(listen_thread)) {
++ ret = PTR_ERR(listen_thread);
++ print_err("kthread_run returned %d\n", ret);
++ }
++
++ return ret;
++}
++
++static void __exit ibp_cm_server_exit(void)
++{
++ struct ibp_client *client, *next;
++ struct completion done;
++
++ kthread_stop(listen_thread);
++
++ down_write(&list_rwsem);
++ list_for_each_entry_safe(client, next, &client_list, list) {
++ init_completion(&done);
++ client->done = &done;
++
++ /* Close scif ep to unblock the client thread scif_recv */
++ scif_close(client->ep);
++
++ up_write(&list_rwsem);
++
++ /* Wait for client thread to finish */
++ wait_for_completion(&done);
++
++ down_write(&list_rwsem);
++ }
++ up_write(&list_rwsem);
++
++ print_info(DRV_DESC " unloaded\n");
++}
++
++module_init(ibp_cm_server_init);
++module_exit(ibp_cm_server_exit);
+diff -urN a6/drivers/infiniband/ibp/cm/server.h a7/drivers/infiniband/ibp/cm/server.h
+--- a6/drivers/infiniband/ibp/cm/server.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/server.h 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,128 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef SERVER_H
++#define SERVER_H
++
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/anon_inodes.h>
++#include <rdma/ib_umem.h>
++#include "ibp-abi.h"
++#include "cm_ibp_abi.h"
++#include "common.h"
++
++#define DRV_ROLE "Server"
++#define DRV_NAME "ibp_cm_server"
++
++#define MAX_MSG_SIZE PAGE_SIZE
++
++extern int timeout;
++extern struct rw_semaphore list_rwsem;
++extern struct list_head client_list;
++extern struct list_head cm_entry_list;
++
++struct ibp_client {
++ struct list_head list;
++ scif_epd_t ep;
++ void *rx_buf;
++ void *tx_buf;
++ struct completion *done;
++ struct workqueue_struct *workqueue;
++ struct task_struct *ibp_cm_client_thread;
++};
++
++struct cm_entry {
++ struct list_head list;
++ struct ib_cm_id *cm_id;
++ struct ibp_client *client;
++};
++
++struct ibp_event_get {
++ __u64 response;
++ __u64 data;
++ __u64 info;
++ __u8 data_len;
++ __u8 info_len;
++ __u8 reserved[6];
++};
++
++struct ibp_event {
++ struct work_struct work;
++ struct ibp_client *client;
++ struct ibp_cm_event event;
++};
++
++#define IBP_INIT_MSG(device, msg, size, op) \
++ do { \
++ (msg)->header.opcode = IBP_##op; \
++ (msg)->header.length = (size); \
++ (msg)->header.status = 0; \
++ (msg)->header.reserved = 0; \
++ (msg)->header.request = 0; \
++ } while (0)
++
++#define IBP_INIT_RESP(device, resp, size, op, req, stat) \
++ do { \
++ (resp)->header.opcode = IBP_##op; \
++ (resp)->header.length = (size); \
++ (resp)->header.status = (stat); \
++ (resp)->header.reserved = 0; \
++ (resp)->header.request = (req); \
++ } while (0)
++
++int ibp_process_recvs(void *p);
++void cleanup_cm_entry_list(void);
++
++int ibp_cmd_create_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_destroy_cm_id(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_cm_listen(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_cm_notify(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_req(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_rep(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_rtu(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_dreq(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_drep(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_rej(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_mra(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_lap(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_apr(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_sidr_req(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_send_cm_sidr_rep(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_cm_event(struct ibp_client *client, struct ibp_msg_header *hdr);
++int ibp_cmd_cm_init_qp_attr(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++
++#endif /* SERVER_H */
+diff -urN a6/drivers/infiniband/ibp/cm/server_msg.c a7/drivers/infiniband/ibp/cm/server_msg.c
+--- a6/drivers/infiniband/ibp/cm/server_msg.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/cm/server_msg.c 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,176 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++#include "cm_ibp_abi.h"
++
++int ibp_send(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_send(ep, buf, (uint32_t)len, SCIF_SEND_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_send returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++int ibp_recv(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_recv(ep, buf, (uint32_t)len, SCIF_RECV_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_recv returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++static int
++ibp_cmd_bad_request(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_response_msg *msg;
++ size_t len;
++ int status = -EBADRQC;
++
++ print_dbg("opcode 0x%x\n", hdr->opcode);
++
++ msg = (struct ibp_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ IBP_INIT_RESP(NULL, msg, len, IBP_RESPONSE, hdr->request, status);
++ return ibp_send(client->ep, msg, len);
++}
++
++static void
++ibp_cm_destroy_client(struct ibp_client *client)
++{
++ struct cm_entry *cm, *tmp;
++
++ down_write(&list_rwsem);
++ list_del(&client->list);
++ list_for_each_entry_safe(cm, tmp, &cm_entry_list, list)
++ if (cm->client == client) {
++ ib_destroy_cm_id(cm->cm_id);
++ list_del(&cm->list);
++ kfree(cm);
++ }
++ up_write(&list_rwsem);
++
++ destroy_workqueue(client->workqueue);
++
++ free_page((uintptr_t)client->tx_buf);
++ free_page((uintptr_t)client->rx_buf);
++
++ if (client->done)
++ complete(client->done);
++ else
++ scif_close(client->ep);
++
++ kfree(client);
++}
++
++static int
++(*ibp_msg_table[])(struct ibp_client *c, struct ibp_msg_header *h) = {
++ [IBP_CREATE_CM_ID] = ibp_cmd_create_cm_id,
++ [IBP_DESTROY_CM_ID] = ibp_cmd_destroy_cm_id,
++ [IBP_CM_LISTEN] = ibp_cmd_cm_listen,
++ [IBP_CM_NOTIFY] = ibp_cmd_cm_notify,
++ [IBP_SEND_CM_REQ] = ibp_cmd_send_cm_req,
++ [IBP_SEND_CM_REP] = ibp_cmd_send_cm_rep,
++ [IBP_SEND_CM_RTU] = ibp_cmd_send_cm_rtu,
++ [IBP_SEND_CM_DREQ] = ibp_cmd_send_cm_dreq,
++ [IBP_SEND_CM_DREP] = ibp_cmd_send_cm_drep,
++ [IBP_SEND_CM_REJ] = ibp_cmd_send_cm_rej,
++ [IBP_SEND_CM_MRA] = ibp_cmd_send_cm_mra,
++ [IBP_SEND_CM_LAP] = ibp_cmd_send_cm_lap,
++ [IBP_SEND_CM_APR] = ibp_cmd_send_cm_apr,
++ [IBP_SEND_CM_SIDR_REQ] = ibp_cmd_send_cm_sidr_req,
++ [IBP_SEND_CM_SIDR_REP] = ibp_cmd_send_cm_sidr_rep,
++ [IBP_CM_INIT_QP_ATTR] = ibp_cmd_cm_init_qp_attr,
++};
++
++int ibp_process_recvs(void *p)
++{
++ struct ibp_client *client;
++ struct ibp_msg_header *hdr;
++ int ret;
++
++ client = (struct ibp_client *) p;
++ hdr = (struct ibp_msg_header *) client->rx_buf;
++
++ for (;;) {
++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr));
++ if (ret)
++ break;
++
++ if (hdr->length > MAX_MSG_SIZE) {
++ print_err("message too large, len %u max %lu\n",
++ hdr->length, MAX_MSG_SIZE);
++ ret = -EMSGSIZE;
++ break;
++ }
++
++ if (hdr->length > sizeof(*hdr)) {
++ ret = ibp_recv(client->ep, hdr->data,
++ hdr->length - sizeof(*hdr));
++ if (ret)
++ break;
++ }
++
++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) ||
++ !ibp_msg_table[hdr->opcode]) {
++ ibp_cmd_bad_request(client, hdr);
++ continue;
++ }
++
++ ret = ibp_msg_table[hdr->opcode](client, hdr);
++ if (ret)
++ break;
++ }
++
++ ibp_cm_destroy_client(client);
++
++ return ret;
++}
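
The receive loop above fixes the wire framing: every command begins with struct ibp_msg_header, header.length counts the header plus any trailing payload, and the total must not exceed MAX_MSG_SIZE (one page). A minimal sketch of a conforming sender, assuming a command that carries no payload beyond the header; the function name is illustrative and the client side is not part of this patch:

    #include "server.h"

    /* Illustrative only: frame and send a header-only command. A real
     * caller would pick one of the IBP_* command opcodes and, for commands
     * with parameters, append the command struct and grow header.length
     * accordingly. */
    static int example_send_header_only(scif_epd_t ep, u32 opcode, u64 request)
    {
            struct ibp_msg_header hdr = {
                    .opcode  = opcode,
                    .length  = sizeof(hdr),  /* header only, no payload */
                    .request = request,      /* echoed back in the response */
            };

            return ibp_send(ep, &hdr, sizeof(hdr));
    }
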
+diff -urN a6/drivers/infiniband/ibp/drv/common.h a7/drivers/infiniband/ibp/drv/common.h
+--- a6/drivers/infiniband/ibp/drv/common.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/common.h 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,109 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef COMMON_H
++#define COMMON_H
++
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/poll.h>
++#include <linux/mman.h>
++#include <linux/pci.h>
++#include <linux/net.h>
++#include <rdma/ib_verbs.h>
++#include <modules/scif.h>
++
++#define DRV_DESC "CCL Direct " DRV_ROLE
++#define DRV_VERSION "1.0"
++#define DRV_BASE "ibp"
++#define PFX DRV_BASE "_"
++#define DRV_PFX DRV_NAME ": "
++
++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation"
++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n"
++
++#define MODULE_PARAM(name, var, type, value, desc) \
++ type var = value; \
++ module_param_named(name, var, type, 0644); \
++ MODULE_PARM_DESC(name, desc)
++
++#ifdef IBP_DEBUG
++extern int debug_level;
++#endif
++
++enum {
++ IBP_DEBUG_NONE,
++ IBP_DEBUG_TARGETED,
++ IBP_DEBUG_VERBOSE,
++};
++
++#define _PRINTK(l, f, arg...) \
++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg)
++
++#ifdef IBP_DEBUG
++#define PRINTK(dbg, l, f, arg...) \
++ do { \
++ if (debug_level >= dbg) \
++ printk(l DRV_PFX "%s(%d) " f, \
++ __func__, __LINE__, ##arg); \
++ } while (0)
++#else
++#define PRINTK(dbg, l, f, arg...) do { } while (0)
++#endif
++
++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg)
++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#define print_info(f, arg...) pr_info(f, ##arg)
++
++#if 0
++#define FORCED_FUNCTION_TRACING
++#endif
++
++#ifdef FORCED_FUNCTION_TRACING
++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#else
++#define print_trace(f, arg...) PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg)
++#endif
++
++#ifndef IBP_PORT /* unique scif port for this service */
++#define IBP_PORT SCIF_OFED_PORT_2
++#endif
++
++#define IS_NULL_OR_ERR(p) (!(p) || IS_ERR_VALUE((unsigned long)p))
++
++int ibp_init(void);
++
++void ibp_cleanup(void);
++
++#endif /* COMMON_H */
+diff -urN a6/drivers/infiniband/ibp/drv/ibp-abi.h a7/drivers/infiniband/ibp/drv/ibp-abi.h
+--- a6/drivers/infiniband/ibp/drv/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/ibp-abi.h 2015-02-23 10:01:30.290769309 -0800
+@@ -0,0 +1,649 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_ABI_H
++#define IBP_ABI_H
++
++#include <linux/types.h>
++
++/* Increment this value if any changes break compatibility. */
++#define IBP_ABI_VERSION 2
++
++/* Client to server message enums. */
++enum {
++ IBP_VERB_GET_PROTOCOL_STATS,
++ IBP_VERB_QUERY_DEVICE,
++ IBP_VERB_QUERY_PORT,
++ IBP_VERB_GET_LINK_LAYER,
++ IBP_VERB_QUERY_GID,
++ IBP_VERB_QUERY_PKEY,
++ IBP_VERB_MODIFY_DEVICE,
++ IBP_VERB_MODIFY_PORT,
++ IBP_VERB_ALLOC_UCONTEXT,
++ IBP_VERB_DEALLOC_UCONTEXT,
++ IBP_VERB_REG_BUF,
++ IBP_VERB_DEREG_BUF,
++ IBP_VERB_MMAP,
++ IBP_VERB_UNMMAP,
++ IBP_VERB_ALLOC_PD,
++ IBP_VERB_DEALLOC_PD,
++ IBP_VERB_CREATE_AH,
++ IBP_VERB_MODIFY_AH,
++ IBP_VERB_QUERY_AH,
++ IBP_VERB_DESTROY_AH,
++ IBP_VERB_CREATE_SRQ,
++ IBP_VERB_MODIFY_SRQ,
++ IBP_VERB_QUERY_SRQ,
++ IBP_VERB_DESTROY_SRQ,
++ IBP_VERB_POST_SRQ_RECV,
++ IBP_VERB_CREATE_QP,
++ IBP_VERB_MODIFY_QP,
++ IBP_VERB_QUERY_QP,
++ IBP_VERB_DESTROY_QP,
++ IBP_VERB_POST_SEND,
++ IBP_VERB_POST_RECV,
++ IBP_VERB_CREATE_CQ,
++ IBP_VERB_MODIFY_CQ,
++ IBP_VERB_DESTROY_CQ,
++ IBP_VERB_RESIZE_CQ,
++ IBP_VERB_POLL_CQ,
++ IBP_VERB_PEEK_CQ,
++ IBP_VERB_REQ_NOTIFY_CQ,
++ IBP_VERB_REQ_NCOMP_NOTIF,
++ IBP_VERB_GET_DMA_MR,
++ IBP_VERB_REG_PHYS_MR,
++ IBP_VERB_REG_USER_MR,
++ IBP_VERB_QUERY_MR,
++ IBP_VERB_DEREG_MR,
++ IBP_VERB_ALLOC_FAST_REG_MR,
++ IBP_VERB_ALLOC_FAST_REG_PAGE_LIST,
++ IBP_VERB_FREE_FAST_REG_PAGE_LIST,
++ IBP_VERB_REREG_PHYS_MR,
++ IBP_VERB_ALLOC_MW,
++ IBP_VERB_BIND_MW,
++ IBP_VERB_DEALLOC_MW,
++ IBP_VERB_ALLOC_FMR,
++ IBP_VERB_MAP_PHYS_FMR,
++ IBP_VERB_UNMAP_FMR,
++ IBP_VERB_DEALLOC_FMR,
++ IBP_VERB_ATTACH_MCAST,
++ IBP_VERB_DETACH_MCAST,
++ IBP_VERB_PROCESS_MAD,
++ IBP_VERB_ALLOC_XRCD,
++ IBP_VERB_DEALLOC_XRCD,
++};
++
++/* Server to client message enums. */
++enum {
++ IBP_ADD_DEVICE,
++ IBP_REMOVE_DEVICE,
++ IBP_VERB_RESPONSE,
++ IBP_QUEUED_RESPONSE,
++ IBP_ASYNC_EVENT,
++ IBP_CQ_COMP,
++};
++
++/*
++ * Make sure that all structs defined in this file are laid out to pack
++ * the same way on different architectures to avoid incompatibility.
++ *
++ * Specifically:
++ * - Do not use pointer types -- pass pointers in a u64 instead.
++ * - Make sure that any structure larger than 4 bytes is padded
++ * to a multiple of 8 bytes; otherwise the structure size may
++ * be different between architectures.
++ */
++
++struct ibp_msg_header { /* present in all messages */
++ u32 opcode;
++ u32 length;
++ u32 status;
++ u32 reserved;
++ u64 device;
++ u64 request;
++ u64 data[0];
++};
++
++#define IBP_DEVICE_NAME_MAX 64
++
++struct ibp_add_device {
++ u8 name[IBP_DEVICE_NAME_MAX];
++ u32 vendor_id;
++ u32 device_id;
++ u64 ib_device;
++ u64 device;
++ __be64 node_guid;
++ u64 uverbs_cmd_mask;
++ u32 uverbs_abi_ver;
++ u32 ibp_abi_ver;
++ u32 num_comp_vectors;
++ u8 phys_port_cnt;
++ u8 reserved[7];
++};
++
++struct ibp_add_device_msg {
++ struct ibp_msg_header header;
++ struct ibp_add_device data;
++};
++
++struct ibp_remove_device_msg {
++ struct ibp_msg_header header;
++};
++
++struct ibp_verb_response_msg {
++ struct ibp_msg_header header;
++ u64 data[0];
++};
++
++struct ibp_queued_response_msg {
++ struct ibp_msg_header header;
++ u64 data[0];
++};
++
++struct ibp_async_event {
++ u64 ibdev;
++ u64 context;
++ u32 type;
++ u8 reserved[4];
++};
++
++struct ibp_async_event_msg {
++ struct ibp_msg_header header;
++ struct ibp_async_event data;
++};
++
++struct ibp_cq_comp {
++ u64 cq_context;
++};
++
++struct ibp_cq_comp_msg {
++ struct ibp_msg_header header;
++ struct ibp_cq_comp data;
++};
++
++struct ibp_alloc_ucontext_cmd {
++ struct ibp_msg_header header;
++ u64 ibdev;
++ u64 data[0];
++};
++
++struct ibp_alloc_ucontext_resp {
++ u64 ucontext;
++ u64 data[0];
++};
++
++struct ibp_dealloc_ucontext_cmd {
++ struct ibp_msg_header header;
++ u64 ucontext;
++};
++
++struct ibp_mmap_cmd {
++ struct ibp_msg_header header;
++ u64 len;
++ u64 prot;
++ u64 flags;
++ u64 pgoff;
++ u64 ucontext;
++};
++
++struct ibp_mmap_resp {
++ u64 mmap;
++ u64 scif_addr;
++};
++
++struct ibp_unmmap_cmd {
++ struct ibp_msg_header header;
++ u64 mmap;
++};
++
++struct ibp_reg_buf_cmd {
++ struct ibp_msg_header header;
++ u64 ucontext;
++ u64 virt_addr;
++ u64 scif_addr;
++ u64 length;
++ u32 offset;
++ u32 access;
++};
++
++struct ibp_reg_buf_resp {
++ u64 reg;
++};
++
++struct ibp_dereg_buf_cmd {
++ struct ibp_msg_header header;
++ u64 reg;
++};
++
++struct ibp_query_device_cmd {
++ struct ibp_msg_header header;
++};
++
++struct ibp_query_device_resp {
++ u64 fw_ver;
++ __be64 sys_image_guid;
++ u64 max_mr_size;
++ u64 page_size_cap;
++ u32 vendor_id;
++ u32 vendor_part_id;
++ u32 hw_ver;
++ u32 max_qp;
++ u32 max_qp_wr;
++ u32 device_cap_flags;
++ u32 max_sge;
++ u32 max_sge_rd;
++ u32 max_cq;
++ u32 max_cqe;
++ u32 max_mr;
++ u32 max_pd;
++ u32 max_qp_rd_atom;
++ u32 max_ee_rd_atom;
++ u32 max_res_rd_atom;
++ u32 max_qp_init_rd_atom;
++ u32 max_ee_init_rd_atom;
++ u32 atomic_cap;
++ u32 masked_atomic_cap;
++ u32 max_ee;
++ u32 max_rdd;
++ u32 max_mw;
++ u32 max_raw_ipv6_qp;
++ u32 max_raw_ethy_qp;
++ u32 max_mcast_grp;
++ u32 max_mcast_qp_attach;
++ u32 max_total_mcast_qp_attach;
++ u32 max_ah;
++ u32 max_fmr;
++ u32 max_map_per_fmr;
++ u32 max_srq;
++ u32 max_srq_wr;
++ u32 max_srq_sge;
++ u32 max_fast_reg_page_list_len;
++ u16 max_pkeys;
++ u8 local_ca_ack_delay;
++ u8 reserved[5];
++};
++
++struct ibp_query_port_cmd {
++ struct ibp_msg_header header;
++ u8 port_num;
++ u8 reserved[7];
++};
++
++struct ibp_query_port_resp {
++ u32 port_cap_flags;
++ u32 max_msg_sz;
++ u32 bad_pkey_cntr;
++ u32 qkey_viol_cntr;
++ u32 gid_tbl_len;
++ u16 pkey_tbl_len;
++ u16 lid;
++ u16 sm_lid;
++ u8 state;
++ u8 max_mtu;
++ u8 active_mtu;
++ u8 lmc;
++ u8 max_vl_num;
++ u8 sm_sl;
++ u8 subnet_timeout;
++ u8 init_type_reply;
++ u8 active_width;
++ u8 active_speed;
++ u8 phys_state;
++ u8 link_layer;
++ u8 reserved[2];
++};
++
++struct ibp_query_gid_cmd {
++ struct ibp_msg_header header;
++ u32 index;
++ u8 port_num;
++ u8 reserved[3];
++};
++
++struct ibp_query_gid_resp {
++ __be64 subnet_prefix;
++ __be64 interface_id;
++};
++
++struct ibp_query_pkey_cmd {
++ struct ibp_msg_header header;
++ u32 index;
++ u8 port_num;
++ u8 reserved[3];
++};
++
++struct ibp_query_pkey_resp {
++ u16 pkey;
++ u8 reserved[6];
++};
++
++struct ibp_alloc_pd_cmd {
++ struct ibp_msg_header header;
++ u64 ucontext;
++ u64 data[0];
++};
++
++struct ibp_alloc_pd_resp {
++ u64 pd;
++ u64 data[0];
++};
++
++struct ibp_dealloc_pd_cmd {
++ struct ibp_msg_header header;
++ u64 pd;
++};
++
++struct ibp_global_route {
++ __be64 dgid_subnet_prefix;
++ __be64 dgid_interface_id;
++ u32 flow_label;
++ u8 sgid_index;
++ u8 hop_limit;
++ u8 traffic_class;
++ u8 reserved[1];
++};
++
++struct ibp_ah_attr {
++ struct ibp_global_route grh;
++ u16 dlid;
++ u8 sl;
++ u8 src_path_bits;
++ u8 static_rate;
++ u8 ah_flags;
++ u8 port_num;
++ u8 reserved[1];
++};
++
++struct ibp_create_ah_cmd {
++ struct ibp_msg_header header;
++ u64 pd;
++ struct ibp_ah_attr ah_attr;
++};
++
++struct ibp_create_ah_resp {
++ u64 ah;
++};
++
++struct ibp_query_ah_cmd {
++ struct ibp_msg_header header;
++ u64 ah;
++};
++
++struct ibp_query_ah_resp {
++ struct ibp_ah_attr attr;
++};
++
++struct ibp_destroy_ah_cmd {
++ struct ibp_msg_header header;
++ u64 ah;
++};
++
++struct ibp_srq_attr {
++ u32 max_wr;
++ u32 max_sge;
++ u32 srq_limit;
++ u8 reserved[4];
++};
++
++struct ibp_create_srq_cmd {
++ struct ibp_msg_header header;
++ u64 pd;
++ u64 srq_context;
++ struct ibp_srq_attr attr;
++ u64 data[0];
++};
++
++struct ibp_create_srq_resp {
++ u64 srq;
++ struct ibp_srq_attr attr;
++ u64 data[0];
++};
++
++struct ibp_query_srq_cmd {
++ struct ibp_msg_header header;
++ u64 srq;
++};
++
++struct ibp_query_srq_resp {
++ struct ibp_srq_attr attr;
++};
++
++struct ibp_modify_srq_cmd {
++ struct ibp_msg_header header;
++ u64 srq;
++ struct ibp_srq_attr attr;
++ u32 srq_attr_mask;
++ u8 reserved[4];
++ u64 data[0];
++};
++
++struct ibp_modify_srq_resp {
++ struct ibp_srq_attr attr;
++ u64 data[0];
++};
++
++struct ibp_destroy_srq_cmd {
++ struct ibp_msg_header header;
++ u64 srq;
++};
++
++struct ibp_qp_cap {
++ u32 max_send_wr;
++ u32 max_recv_wr;
++ u32 max_send_sge;
++ u32 max_recv_sge;
++ u32 max_inline_data;
++ u8 reserved[4];
++};
++
++struct ibp_create_qp_cmd {
++ struct ibp_msg_header header;
++ u64 pd;
++ u64 send_cq;
++ u64 recv_cq;
++ u64 srq;
++ u64 xrc_domain;
++ u64 qp_context;
++ struct ibp_qp_cap cap;
++ u8 sq_sig_type;
++ u8 qp_type;
++ u8 create_flags;
++ u8 port_num;
++ u64 data[0];
++};
++
++struct ibp_create_qp_resp {
++ u64 qp;
++ struct ibp_qp_cap cap;
++ u32 qpn;
++ u8 reserved[4];
++ u64 data[0];
++};
++
++struct ibp_query_qp_cmd {
++ struct ibp_msg_header header;
++ u64 qp;
++ u32 qp_attr_mask;
++ u8 reserved[4];
++};
++
++struct ibp_query_qp_resp {
++ u32 qp_state;
++ u32 cur_qp_state;
++ u32 path_mtu;
++ u32 path_mig_state;
++ u32 qkey;
++ u32 rq_psn;
++ u32 sq_psn;
++ u32 dest_qp_num;
++ u32 qp_access_flags;
++ u32 init_create_flags;
++ struct ibp_qp_cap init_cap;
++ struct ibp_qp_cap cap;
++ struct ibp_ah_attr ah;
++ struct ibp_ah_attr alt_ah;
++ u16 pkey_index;
++ u16 alt_pkey_index;
++ u8 en_sqd_async_notify;
++ u8 sq_draining;
++ u8 max_rd_atomic;
++ u8 max_dest_rd_atomic;
++ u8 min_rnr_timer;
++ u8 port_num;
++ u8 timeout;
++ u8 retry_cnt;
++ u8 rnr_retry;
++ u8 alt_port_num;
++ u8 alt_timeout;
++ u8 init_sq_sig_type;
++};
++
++struct ibp_modify_qp_cmd {
++ struct ibp_msg_header header;
++ u64 qp;
++ u32 qp_attr_mask;
++ u32 qp_state;
++ u32 cur_qp_state;
++ u32 path_mtu;
++ u32 path_mig_state;
++ u32 qkey;
++ u32 rq_psn;
++ u32 sq_psn;
++ u32 dest_qp_num;
++ u32 qp_access_flags;
++ struct ibp_qp_cap cap;
++ struct ibp_ah_attr ah;
++ struct ibp_ah_attr alt_ah;
++ u16 pkey_index;
++ u16 alt_pkey_index;
++ u8 en_sqd_async_notify;
++ u8 sq_draining;
++ u8 max_rd_atomic;
++ u8 max_dest_rd_atomic;
++ u8 min_rnr_timer;
++ u8 port_num;
++ u8 timeout;
++ u8 retry_cnt;
++ u8 rnr_retry;
++ u8 alt_port_num;
++ u8 alt_timeout;
++ u8 reserved[1];
++ u64 data[0];
++};
++
++struct ibp_modify_qp_resp {
++ struct ibp_qp_cap cap;
++ u64 data[0];
++};
++
++struct ibp_destroy_qp_cmd {
++ struct ibp_msg_header header;
++ u64 qp;
++};
++
++struct ibp_create_cq_cmd {
++ struct ibp_msg_header header;
++ u64 ucontext;
++ u64 cq_context;
++ u32 cqe;
++ u32 vector;
++ u64 data[0];
++};
++
++struct ibp_create_cq_resp {
++ u64 cq;
++ u32 cqe;
++ u8 reserved[4];
++ u64 data[0];
++};
++
++struct ibp_resize_cq_cmd {
++ struct ibp_msg_header header;
++ u64 cq;
++ u32 cqe;
++ u8 reserved[4];
++ u64 data[0];
++};
++
++struct ibp_resize_cq_resp {
++ u32 cqe;
++ u8 reserved[4];
++ u64 data[0];
++};
++
++struct ibp_destroy_cq_cmd {
++ struct ibp_msg_header header;
++ u64 cq;
++};
++
++struct ibp_reg_user_mr_cmd {
++ struct ibp_msg_header header;
++ u64 pd;
++ u64 hca_va;
++ u64 scif_addr;
++ u64 length;
++ u32 offset;
++ u32 access;
++ u64 data[0];
++};
++
++struct ibp_reg_user_mr_resp {
++ u64 mr;
++ u32 lkey;
++ u32 rkey;
++ u64 data[0];
++};
++
++struct ibp_dereg_mr_cmd {
++ struct ibp_msg_header header;
++ u64 mr;
++};
++
++struct ibp_attach_mcast_cmd {
++ struct ibp_msg_header header;
++ u64 qp;
++ __be64 subnet_prefix;
++ __be64 interface_id;
++ u16 lid;
++ u8 data[6];
++};
++
++struct ibp_detach_mcast_cmd {
++ struct ibp_msg_header header;
++ u64 qp;
++ __be64 subnet_prefix;
++ __be64 interface_id;
++ u16 lid;
++ u8 data[6];
++};
++
++#endif /* IBP_ABI_H */
+diff -urN a6/drivers/infiniband/ibp/drv/ibp.h a7/drivers/infiniband/ibp/drv/ibp.h
+--- a6/drivers/infiniband/ibp/drv/ibp.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/ibp.h 2015-02-23 10:01:30.291769309 -0800
+@@ -0,0 +1,257 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_H
++#define IBP_H
++
++#include <rdma/ib_verbs.h>
++#include "ibp-abi.h"
++
++struct ibp_device {
++ char name[IBP_DEVICE_NAME_MAX];
++ u32 vendor_id;
++ u32 device_id;
++ u64 ib_device;
++ u64 device;
++ __be64 node_guid;
++ u64 uverbs_cmd_mask;
++ u32 uverbs_abi_ver;
++ u32 ibp_abi_ver;
++ struct device *linux_dev;
++ struct list_head list;
++ u64 driver_data;
++ int abi_version;
++ int num_comp_vectors;
++ u8 phys_port_cnt;
++};
++
++struct ibp_id_table {
++ u32 vendor_id;
++ u32 device_id;
++};
++
++struct ibp_driver {
++ const char *name;
++ const struct ibp_id_table *id_table;
++ int (*add)(struct ibp_device *device);
++ void (*remove)(struct ibp_device *device);
++ u64 (*resolve)(struct ib_device *ibdev);
++
++ struct list_head list;
++};
++
++struct ibp_rb {
++ u64 handle;
++};
++
++struct ibp_iomem {
++ void *cookie;
++ void __iomem *addr;
++};
++
++/**
++ * ibp_resolve_ib_device - Return the host ib_device handle
++ * @ibdev:Card IB device
++ *
++ * Upper level drivers may require the host ib_device handle associated
++ * with the card ib_device. This routine resolves the card ib_device to
++ * the corresponding host ib_device handle. A value of 0 is returned if
++ * no match was found.
++ */
++u64 ibp_resolve_ib_device(struct ib_device *ibdev);
++
++/**
++ * ibp_register_driver - Register this driver
++ * @driver:Driver to register
++ *
++ * Lower level drivers use ibp_register_driver to register for callbacks
++ * on IB device addition and removal. Only one low level driver registration
++ * is allowed for each vendor/device id pair. When an IB device is added,
++ * it is compared with each registered driver's vendor and device id. The add
++ * callback routine for the matching driver will be called.
++ */
++int ibp_register_driver(struct ibp_driver *driver);
++
++/**
++ * ibp_unregister_driver - Unregister this driver
++ * @driver:Driver to unregister
++ *
++ * Lower level drivers use ibp_unregister_driver() to remove their
++ * registration. When ibp_unregister_driver() is called, the driver
++ * will receive a remove callback for each IB device with matching vendor
++ * and device ids.
++ */
++void ibp_unregister_driver(struct ibp_driver *driver);
++
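++/*
++ * Illustrative sketch only; the xxx names and id values below are
++ * hypothetical. A lower level driver might register itself roughly as:
++ *
++ *	static const struct ibp_id_table xxx_ids[] = {
++ *		{ .vendor_id = 0x8086, .device_id = 0x0001 },
++ *	};
++ *
++ *	static struct ibp_driver xxx_driver = {
++ *		.name		= "xxx",
++ *		.id_table	= xxx_ids,
++ *		.add		= xxx_add,
++ *		.remove		= xxx_remove,
++ *	};
++ *
++ *	ret = ibp_register_driver(&xxx_driver);
++ *	...
++ *	ibp_unregister_driver(&xxx_driver);
++ */
++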
++static inline void ibp_set_driver_data(struct ibp_device *device, u64 data)
++{
++ device->driver_data = data;
++}
++
++static inline u64 ibp_get_driver_data(struct ibp_device *device)
++{
++ return device->driver_data;
++}
++
++int ibp_cmd_alloc_ucontext(struct ibp_device *device, struct ib_device *ibdev,
++ u64 *ucontext, struct ibp_alloc_ucontext_cmd *cmd,
++ size_t cmd_size,
++ struct ibp_alloc_ucontext_resp *resp,
++ size_t resp_size);
++
++int ibp_cmd_dealloc_ucontext(struct ibp_device *device, u64 ucontext);
++
++/**
++ * ibp_reg_buf - Register a private buffer with this driver
++ * @device: the device on which to register
++ * @ucontext: peer driver ucontext handle
++ * @vaddr: starting virtual address of the buffer
++ * @length: length of the buffer
++ * @access: IB_ACCESS_xxx flags for buffer
++ *
++ * Lower level drivers use ibp_reg_buf() to register private buffers.
++ * Upon success, a pointer to a registered buffer structure is returned
++ * which contains an addr handle. The addr handle can be shared with
++ * a peer driver on the host server for its use with ib_umem_get().
++ * This routine should not be used to register IB memory regions.
++ */
++struct ibp_rb *ibp_reg_buf(struct ibp_device *device, u64 ucontext,
++ unsigned long vaddr, size_t length, int access);
++
++/**
++ * ibp_dereg_buf - Deregister a private buffer through this driver
++ * @device: the device on which to deregister
++ * @rb: pointer to the registered buffer structure; may be ERR or NULL
++ *
++ * Lower level drivers use ibp_dereg_buf() to deregister a private buffer.
++ */
++int ibp_dereg_buf(struct ibp_device *device, struct ibp_rb *rb);
++
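++/*
++ * Illustrative sketch only (buf, len and the access flag are hypothetical):
++ * the two calls above would typically be used in pairs, roughly as
++ *
++ *	struct ibp_rb *rb;
++ *
++ *	rb = ibp_reg_buf(device, ucontext, (unsigned long) buf, len,
++ *			 IB_ACCESS_LOCAL_WRITE);
++ *	if (IS_ERR(rb))
++ *		return PTR_ERR(rb);
++ *	...
++ *	ibp_dereg_buf(device, rb);
++ */
++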
++int ibp_cmd_mmap(struct ibp_device *device, u64 ucontext,
++ struct vm_area_struct *vma);
++
++struct ibp_iomem *ibp_cmd_ioremap(struct ibp_device *device, u64 ucontext,
++ phys_addr_t offset, unsigned long size);
++
++int ibp_cmd_iounmap(struct ibp_iomem *iomem);
++
++int ibp_cmd_query_device(struct ibp_device *device,
++ struct ib_device_attr *device_attr);
++
++int ibp_cmd_query_port(struct ibp_device *device, u8 port_num,
++ struct ib_port_attr *port_attr);
++
++int ibp_cmd_query_gid(struct ibp_device *device, u8 port_num, int index,
++ union ib_gid *gid);
++
++int ibp_cmd_query_pkey(struct ibp_device *device, u8 port_num, int index,
++ u16 *pkey);
++
++int ibp_cmd_alloc_pd(struct ibp_device *device, u64 ucontext, u64 *pd,
++ struct ibp_alloc_pd_cmd *cmd, size_t cmd_size,
++ struct ibp_alloc_pd_resp *resp, size_t resp_size);
++
++int ibp_cmd_dealloc_pd(struct ibp_device *device, u64 pd);
++
++int ibp_cmd_create_ah(struct ibp_device *device, u64 pd,
++ struct ib_ah_attr *ah_attr,
++ u64 *ah);
++
++int ibp_cmd_query_ah(struct ibp_device *device, u64 ah,
++ struct ib_ah_attr *ah_attr);
++
++int ibp_cmd_destroy_ah(struct ibp_device *device, u64 ah);
++
++int ibp_cmd_create_srq(struct ibp_device *device, u64 pd,
++ struct ib_srq_init_attr *init_attr,
++ u64 *srq, struct ib_srq *ibsrq,
++ struct ibp_create_srq_cmd *cmd, size_t cmd_size,
++ struct ibp_create_srq_resp *resp, size_t resp_size);
++
++int ibp_cmd_query_srq(struct ibp_device *device, u64 srq,
++ struct ib_srq_attr *attr);
++
++int ibp_cmd_modify_srq(struct ibp_device *device, u64 srq,
++ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
++ struct ibp_modify_srq_cmd *cmd, size_t cmd_size,
++ struct ibp_modify_srq_resp *resp, size_t resp_size);
++
++int ibp_cmd_destroy_srq(struct ibp_device *device, u64 srq);
++
++int ibp_cmd_create_qp(struct ibp_device *device, u64 pd,
++ u64 send_cq, u64 recv_cq, u64 srq,
++ struct ib_qp_init_attr *init_attr,
++ u64 *qp, struct ib_qp *ibqp,
++ struct ibp_create_qp_cmd *cmd, size_t cmd_size,
++ struct ibp_create_qp_resp *resp, size_t resp_size);
++
++int ibp_cmd_query_qp(struct ibp_device *device, u64 qp,
++ struct ib_qp_attr *attr, int qp_attr_mask,
++ struct ib_qp_init_attr *init_attr);
++
++int ibp_cmd_modify_qp(struct ibp_device *device, u64 qp,
++ struct ib_qp_attr *attr, int qp_attr_mask,
++ struct ibp_modify_qp_cmd *cmd, size_t cmd_size,
++ struct ibp_modify_qp_resp *resp, size_t resp_size);
++
++int ibp_cmd_destroy_qp(struct ibp_device *device, u64 qp);
++
++int ibp_cmd_create_cq(struct ibp_device *device, u64 ucontext,
++ int entries, int vector, u64 *cq, struct ib_cq *ibcq,
++ struct ibp_create_cq_cmd *cmd, size_t cmd_size,
++ struct ibp_create_cq_resp *resp, size_t resp_size);
++
++int ibp_cmd_resize_cq(struct ibp_device *device, u64 cq,
++ int entries, struct ib_cq *ibcq,
++ struct ibp_resize_cq_cmd *cmd, size_t cmd_size,
++ struct ibp_resize_cq_resp *resp, size_t resp_size);
++
++int ibp_cmd_destroy_cq(struct ibp_device *device, u64 cq);
++
++int ibp_cmd_reg_user_mr(struct ibp_device *device, u64 pd, u64 start,
++ u64 length, u64 virt_addr, int access, u64 *mr,
++ u32 *lkey, u32 *rkey,
++ struct ibp_reg_user_mr_cmd *cmd, size_t cmd_size,
++ struct ibp_reg_user_mr_resp *resp, size_t resp_size);
++
++int ibp_cmd_dereg_mr(struct ibp_device *device, u64 mr);
++
++int ibp_cmd_get_dma_mr(struct ibp_device *device, u64 pd, int access,
++ u64 *mr, u32 *lkey, u32 *rkey);
++
++int ibp_cmd_attach_mcast(struct ibp_device *device, u64 qp,
++ union ib_gid *gid, u16 lid);
++
++int ibp_cmd_detach_mcast(struct ibp_device *device, u64 qp,
++ union ib_gid *gid, u16 lid);
++
++#endif /* IBP_H */
+diff -urN a6/drivers/infiniband/ibp/drv/Makefile a7/drivers/infiniband/ibp/drv/Makefile
+--- a6/drivers/infiniband/ibp/drv/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/Makefile 2015-02-23 10:01:30.291769309 -0800
+@@ -0,0 +1,21 @@
++KDIR ?= /lib/modules/`uname -r`/build
++
++obj-$(CONFIG_IBP_SERVER) += ibp_server.o
++
++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG
++
++ibp_server-y := server.o \
++ stack.o \
++ server_msg.o
++
++default:
++ $(MAKE) -C $(KDIR) M=`pwd`
++
++modules_install:
++ $(MAKE) -C $(KDIR) M=`pwd` modules_install
++
++clean:
++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions
++
++unix:
++ dos2unix *.[ch] Kconfig Makefile
+diff -urN a6/drivers/infiniband/ibp/drv/server.c a7/drivers/infiniband/ibp/drv/server.c
+--- a6/drivers/infiniband/ibp/drv/server.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/server.c 2015-02-23 10:01:30.291769309 -0800
+@@ -0,0 +1,548 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++
++MODULE_AUTHOR("Jerrie Coffman");
++MODULE_AUTHOR("Phil Cayton");
++MODULE_AUTHOR("Jay Sternberg");
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_DESCRIPTION(DRV_DESC);
++MODULE_VERSION(DRV_VERSION);
++
++MODULE_PARAM(port, port, int, IBP_PORT, "Connection port");
++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog");
++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds");
++
++#ifdef IBP_DEBUG
++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all");
++#endif
++
++#ifdef MOFED
++void *ibp_peer_mem_handle;
++invalidate_peer_memory ib_invalidate;
++#endif
++
++struct rw_semaphore list_rwsem;
++
++static struct class *ibp_class;
++static struct task_struct *listen_thread;
++
++static LIST_HEAD(device_list);
++static LIST_HEAD(client_list);
++static LIST_HEAD(cdev_list);
++
++static void ibp_add_one(struct ib_device *ib_dev);
++static void ibp_remove_one(struct ib_device *ib_dev);
++
++static struct ib_client ib_client = {
++ .name = DRV_NAME,
++ .add = ibp_add_one,
++ .remove = ibp_remove_one
++};
++
++static int ibp_open(struct inode *inode, struct file *filp);
++static ssize_t ibp_write(struct file *filp, const char __user *buf,
++ size_t count, loff_t *pos);
++static int ibp_close(struct inode *inode, struct file *filp);
++
++static const struct file_operations ibp_fops = {
++ .owner = THIS_MODULE,
++ .open = ibp_open,
++ .write = ibp_write,
++ .release = ibp_close,
++};
++
++static int ibp_create_cdev(struct ibp_client *client, uint16_t node)
++{
++ struct device *device;
++ dev_t devt;
++ int ret;
++
++ ret = alloc_chrdev_region(&devt, 0, 1, DRV_BASE);
++ if (ret) {
++ print_err("alloc_chrdev_region returned %d\n", ret);
++ return ret;
++ }
++
++ cdev_init(&client->cdev, &ibp_fops);
++ client->cdev.owner = THIS_MODULE;
++
++ ret = cdev_add(&client->cdev, devt, 1);
++ if (ret) {
++ print_err("cdev_add returned %d\n", ret);
++ goto err0;
++ }
++
++ device = device_create(ibp_class, NULL, devt,
++ NULL, DRV_BASE "%u", node);
++ if (IS_ERR(device)) {
++ ret = PTR_ERR(device);
++ goto err1;
++ }
++
++ /* Start on the cdev_list (until ibp_register_client). */
++ down_write(&list_rwsem);
++ list_add_tail(&client->list, &cdev_list);
++ up_write(&list_rwsem);
++
++ return 0;
++err1:
++ cdev_del(&client->cdev);
++err0:
++ unregister_chrdev_region(devt, 1);
++ return ret;
++}
++
++static void ibp_destroy_cdev(struct ibp_client *client)
++{
++ device_destroy(ibp_class, client->cdev.dev);
++ cdev_del(&client->cdev);
++ unregister_chrdev_region(client->cdev.dev, 1);
++}
++
++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node)
++{
++ struct ibp_client *client;
++ int ret;
++
++ /* If a reconnect occurs while on the cdev_list just update the ep. */
++ down_read(&list_rwsem);
++ list_for_each_entry(client, &cdev_list, list) {
++ if (client->node == node) {
++ up_read(&list_rwsem);
++ scif_close(client->ep);
++ client->ep = ep;
++ return client;
++ }
++ }
++ up_read(&list_rwsem);
++
++ client = kzalloc(sizeof(*client), GFP_KERNEL);
++ if (!client) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ client->ep = ep;
++ client->node = node;
++ atomic_set(&client->busy, 0);
++ atomic_set(&client->rx_in_process, 0);
++ init_waitqueue_head(&client->rx_wait_queue);
++ mutex_init(&client->ucontext_mutex);
++ INIT_LIST_HEAD(&client->ucontext_list);
++
++ client->workqueue = create_singlethread_workqueue(DRV_NAME);
++ if (!client->workqueue) {
++ print_err("create_singlethread_workqueue failed\n");
++ goto err0;
++ }
++
++ ret = ibp_create_cdev(client, node);
++ if (ret)
++ goto err1;
++
++ return client;
++err1:
++ destroy_workqueue(client->workqueue);
++err0:
++ kfree(client);
++ return ERR_PTR(ret);
++}
++
++static void ibp_destroy_client(struct ibp_client *client)
++{
++ ibp_cleanup_ucontext(&client->ucontext_list);
++ scif_close(client->ep);
++ flush_workqueue(client->workqueue);
++ destroy_workqueue(client->workqueue);
++ ibp_destroy_cdev(client);
++ kfree(client);
++}
++
++static void ibp_register_client(struct ibp_client *client)
++{
++ struct ibp_device *device;
++
++ down_write(&list_rwsem);
++
++ list_move(&client->list, &client_list);
++
++ list_for_each_entry(device, &device_list, list)
++ ibp_send_add(client, device);
++
++ up_write(&list_rwsem);
++}
++
++static void ibp_unregister_client(struct ibp_client *client)
++{
++ struct ibp_device *device;
++
++ flush_workqueue(client->workqueue);
++
++ down_write(&list_rwsem);
++
++ list_del(&client->list);
++
++ list_for_each_entry(device, &device_list, list)
++ ibp_send_remove(client, device);
++
++ up_write(&list_rwsem);
++}
++
++static int ibp_open(struct inode *inode, struct file *filp)
++{
++ struct ibp_client *client;
++
++ client = container_of(inode->i_cdev, struct ibp_client, cdev);
++
++ filp->private_data = client;
++
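++ /* The first open registers the client; later opens just share it. */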
++ if (atomic_add_return(1, &client->busy) == 1)
++ ibp_register_client(client);
++
++ return 0;
++}
++
++static ssize_t ibp_write(struct file *filp, const char __user *buf,
++ size_t count, loff_t *pos)
++{
++ struct ibp_client *client;
++ void *rx_buf;
++ void *tx_buf;
++ int ret = -ENOMEM;
++
++ client = filp->private_data;
++
++ rx_buf = (void *) __get_free_page(GFP_KERNEL);
++ if (!rx_buf) {
++ print_err("__get_free_page rx_buf failed\n");
++ goto err0;
++ }
++
++ tx_buf = (void *) __get_free_page(GFP_KERNEL);
++ if (!tx_buf) {
++ print_err("__get_free_page tx_buf failed\n");
++ goto err1;
++ }
++
++ ret = ibp_process_recvs(client, rx_buf, tx_buf);
++
++ free_page((uintptr_t) tx_buf);
++err1:
++ free_page((uintptr_t) rx_buf);
++err0:
++ return ret;
++}
++
++static int ibp_close(struct inode *inode, struct file *filp)
++{
++ struct ibp_client *client;
++
++ client = filp->private_data;
++
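++ /* The last close unregisters the client and frees its resources. */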
++ if (atomic_sub_and_test(1, &client->busy)) {
++ ibp_unregister_client(client);
++ device_destroy(ibp_class, client->cdev.dev);
++ ibp_destroy_client(client);
++ }
++
++ return 0;
++}
++
++int ibp_get_device(struct ibp_device *device)
++{
++ struct ibp_device *entry;
++
++ down_read(&list_rwsem);
++
++ list_for_each_entry(entry, &device_list, list) {
++ if (entry == device) {
++ kref_get(&device->ref);
++ break;
++ }
++ }
++
++ up_read(&list_rwsem);
++
++ return (entry == device) ? 0 : -ENODEV;
++}
++
++static void ibp_complete_device(struct kref *ref)
++{
++ struct ibp_device *device;
++
++ device = container_of(ref, struct ibp_device, ref);
++ complete(&device->done);
++}
++
++void ibp_put_device(struct ibp_device *device)
++{
++ kref_put(&device->ref, ibp_complete_device);
++}
++
++static struct ibp_device *ibp_create_device(struct ib_device *ib_dev)
++{
++ struct ibp_device *device;
++
++ device = kzalloc(sizeof(*device), GFP_KERNEL);
++ if (!device) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++ device->ib_dev = ib_dev;
++ kref_init(&device->ref);
++ init_completion(&device->done);
++
++ ib_set_client_data(ib_dev, &ib_client, device);
++
++ return device;
++}
++
++static void ibp_destroy_device(struct ibp_device *device)
++{
++ ibp_put_device(device);
++ wait_for_completion(&device->done);
++
++ ib_set_client_data(device->ib_dev, &ib_client, NULL);
++ kfree(device);
++}
++
++static void ibp_register_device(struct ibp_device *device)
++{
++ struct ibp_client *client;
++
++ down_write(&list_rwsem);
++
++ list_add_tail(&device->list, &device_list);
++ list_for_each_entry(client, &client_list, list)
++ ibp_send_add(client, device);
++
++ up_write(&list_rwsem);
++}
++
++static void ibp_unregister_device(struct ibp_device *device)
++{
++ struct ibp_client *client;
++
++ down_write(&list_rwsem);
++
++ list_for_each_entry(client, &client_list, list)
++ ibp_send_remove(client, device);
++
++ list_del(&device->list);
++
++ up_write(&list_rwsem);
++}
++
++static int ibp_ignore_ib_dev(struct ib_device *ib_dev)
++{
++ /*
++ * Only allow PCI-based channel adapters and RNICs.
++ * PCI is required in order to read the vendor id.
++ */
++ return (!ib_dev->dma_device->bus ||
++ !ib_dev->dma_device->bus->name ||
++ strnicmp(ib_dev->dma_device->bus->name, "pci", 3) ||
++ ((ib_dev->node_type != RDMA_NODE_IB_CA) &&
++ (ib_dev->node_type != RDMA_NODE_RNIC))) ? 1 : 0;
++}
++
++static void ibp_add_one(struct ib_device *ib_dev)
++{
++ struct ibp_device *device;
++
++ if (ibp_ignore_ib_dev(ib_dev))
++ return;
++
++ device = ibp_create_device(ib_dev);
++ if (IS_ERR(device))
++ return;
++
++ ibp_register_device(device);
++}
++
++static void ibp_remove_one(struct ib_device *ib_dev)
++{
++ struct ibp_device *device;
++
++ device = ib_get_client_data(ib_dev, &ib_client);
++ if (!device)
++ return;
++
++ ibp_unregister_device(device);
++ ibp_destroy_device(device);
++}
++
++static int ibp_listen(void *data)
++{
++ struct ibp_client *client;
++ struct scif_pollepd listen;
++ struct scif_portID peer;
++ scif_epd_t ep;
++ int ret;
++
++ listen.epd = scif_open();
++ if (!listen.epd) {
++ print_err("scif_open failed\n");
++ ret = -EIO;
++ goto err0;
++ }
++ listen.events = POLLIN;
++
++ ret = scif_bind(listen.epd, port);
++ if (ret < 0) {
++ print_err("scif_bind returned %d\n", ret);
++ goto err1;
++ }
++
++ ret = scif_listen(listen.epd, backlog);
++ if (ret) {
++ print_err("scif_listen returned %d\n", ret);
++ goto err1;
++ }
++
++ while (!kthread_should_stop()) {
++
++ schedule();
++
++ ret = scif_poll(&listen, 1, timeout);
++ if (ret == 0) /* timeout */
++ continue;
++ if (ret < 0) {
++ print_err("scif_poll revents 0x%x\n", listen.revents);
++ continue;
++ }
++
++ ret = scif_accept(listen.epd, &peer, &ep, 0);
++ if (ret) {
++ print_err("scif_accept returned %d\n", ret);
++ continue;
++ }
++
++ print_dbg("accepted node %d port %d\n", peer.node, peer.port);
++
++ client = ibp_create_client(ep, peer.node);
++ if (IS_ERR(client)) {
++ ret = PTR_ERR(client);
++ print_err("ibp_create_client returned %d\n", ret);
++ scif_close(ep);
++ }
++ }
++err1:
++ scif_close(listen.epd);
++err0:
++ return ret;
++}
++
++static int __init ibp_server_init(void)
++{
++ int ret;
++
++ print_info(DRV_SIGNON);
++
++ init_rwsem(&list_rwsem);
++
++ ret = ibp_init();
++ if (ret) {
++ print_err("ibp_init_server returned %d\n", ret);
++ return ret;
++ }
++
++ ibp_class = class_create(THIS_MODULE, "infiniband_proxy");
++ if (IS_ERR(ibp_class)) {
++ ret = PTR_ERR(ibp_class);
++ print_err("class_create returned %d\n", ret);
++ goto err0;
++ }
++
++ ret = ib_register_client(&ib_client);
++ if (ret) {
++ print_err("ib_register_client returned %d\n", ret);
++ goto err1;
++ }
++
++#ifdef MOFED
++ ibp_peer_mem_handle = ib_register_peer_memory_client(&ibp_peer_mem,
++ &ib_invalidate);
++ if (IS_ERR(ibp_peer_mem_handle)) {
++ ret = PTR_ERR(ibp_peer_mem_handle);
++ print_err("ib_register_peer_memory_client returned %d\n", ret);
++ goto err2;
++ }
++#endif
++
++ /* Start a thread for inbound connections. */
++ listen_thread = kthread_run(ibp_listen, NULL, DRV_NAME);
++ if (IS_ERR(listen_thread)) {
++ ret = PTR_ERR(listen_thread);
++ print_err("kthread_run returned %d\n", ret);
++ goto err3;
++ }
++
++ return 0;
++err3:
++#ifdef MOFED
++ ib_unregister_peer_memory_client(ibp_peer_mem_handle);
++err2:
++#endif
++ ib_unregister_client(&ib_client);
++err1:
++ class_destroy(ibp_class);
++err0:
++ ibp_cleanup();
++ return ret;
++}
++
++static void __exit ibp_server_exit(void)
++{
++ struct ibp_client *client;
++ struct ibp_client *next;
++
++ kthread_stop(listen_thread);
++
++ list_for_each_entry_safe(client, next, &cdev_list, list)
++ ibp_destroy_client(client);
++
++#ifdef MOFED
++ ib_unregister_peer_memory_client(ibp_peer_mem_handle);
++#endif
++ ib_unregister_client(&ib_client);
++ class_destroy(ibp_class);
++
++ ibp_cleanup();
++
++ print_info(DRV_DESC " unloaded\n");
++}
++
++module_init(ibp_server_init);
++module_exit(ibp_server_exit);
+diff -urN a6/drivers/infiniband/ibp/drv/server.h a7/drivers/infiniband/ibp/drv/server.h
+--- a6/drivers/infiniband/ibp/drv/server.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/server.h 2015-02-23 10:01:30.291769309 -0800
+@@ -0,0 +1,191 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef SERVER_H
++#define SERVER_H
++
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/anon_inodes.h>
++#include <linux/file.h>
++#include <rdma/ib_user_verbs.h>
++#include <rdma/ib_umem.h>
++#include "ibp-abi.h"
++#include "common.h"
++
++#define DRV_ROLE "Server"
++#define DRV_NAME "ibp_server"
++
++#define MAX_MSG_SIZE PAGE_SIZE
++
++extern int timeout;
++extern struct rw_semaphore list_rwsem;
++
++struct ibp_device {
++ struct list_head list;
++ struct ib_device *ib_dev;
++ struct kref ref;
++ struct completion done;
++};
++
++struct ibp_client {
++ struct list_head list;
++ scif_epd_t ep;
++ struct workqueue_struct *workqueue;
++ struct mutex ucontext_mutex;
++ struct list_head ucontext_list;
++ wait_queue_head_t rx_wait_queue;
++ atomic_t rx_in_process;
++ struct cdev cdev;
++ atomic_t busy;
++ uint16_t node;
++};
++
++struct ibp_queued_response {
++ struct ibp_client *client;
++ struct work_struct work;
++ u64 msg[0];
++};
++
++struct ibp_event {
++ struct ibp_client *client;
++ struct work_struct work;
++ u64 context;
++ u64 ibdev;
++ enum ib_event_type type;
++};
++
++struct ibp_comp {
++ struct ibp_client *client;
++ struct work_struct work;
++ void *cq_context;
++};
++
++struct ibp_ucontext {
++ struct ib_ucontext *ibucontext;
++ struct ibp_client *client;
++ struct ibp_device *device;
++ struct file *filp;
++ struct ib_event_handler event_handler;
++ u64 ibdev;
++ struct mutex mutex;
++ struct list_head list;
++ struct list_head mmap_list;
++ struct rb_root reg_tree;
++};
++
++struct ibp_qp {
++ struct ib_qp *ibqp;
++ struct list_head mcast;
++};
++
++struct ibp_mcast_entry {
++ struct list_head list;
++ union ib_gid gid;
++ u16 lid;
++};
++
++struct ibp_mmap {
++ struct list_head list;
++ struct ibp_ucontext *ucontext;
++ u64 len;
++ u64 prot;
++ u64 vaddr;
++ void __iomem *io_addr;
++ off_t scif_addr;
++};
++
++struct ibp_reg {
++ struct rb_node node;
++ struct scif_range *range;
++ struct ibp_ucontext *ucontext;
++ struct kref ref;
++ u64 virt_addr;
++ u64 length;
++ off_t offset;
++ u32 access;
++};
++
++struct ibp_mr {
++ struct ib_mr *ibmr;
++ struct ibp_reg *reg;
++};
++
++#ifdef MOFED
++#include <rdma/peer_mem.h>
++extern struct peer_memory_client ibp_peer_mem;
++extern void *ibp_peer_mem_handle;
++extern invalidate_peer_memory ib_invalidate;
++#else
++#define IBP_UMEM_MAX_PAGE_CHUNK \
++ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \
++ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \
++ (void *) &((struct ib_umem_chunk *) 0)->page_list[0]))
++#endif
++
++#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
++ do { \
++ (udata)->ops = &ibp_copy; \
++ (udata)->inbuf = (void *)(ibuf); \
++ (udata)->outbuf = (void *)(obuf); \
++ (udata)->inlen = (ilen); \
++ (udata)->outlen = (olen); \
++ } while (0)
++
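++/* Stamp the common ibp_msg_header on an outbound message or response. */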
++#define IBP_INIT_MSG(handle, msg, size, op) \
++ do { \
++ (msg)->header.opcode = IBP_##op; \
++ (msg)->header.length = (size); \
++ (msg)->header.status = 0; \
++ (msg)->header.reserved = 0; \
++ (msg)->header.device = (uintptr_t)(handle); \
++ (msg)->header.request = 0; \
++ } while (0)
++
++#define IBP_INIT_RESP(handle, resp, size, op, req, stat) \
++ do { \
++ (resp)->header.opcode = IBP_##op; \
++ (resp)->header.length = (size); \
++ (resp)->header.status = (stat); \
++ (resp)->header.reserved = 0; \
++ (resp)->header.device = (uintptr_t)(handle); \
++ (resp)->header.request = (req); \
++ } while (0)
++
++int ibp_process_recvs(struct ibp_client *client, void *rx_buf, void *tx_buf);
++void ibp_cleanup_ucontext(struct list_head *ucontext_list);
++int ibp_send_add(struct ibp_client *client, struct ibp_device *device);
++int ibp_send_remove(struct ibp_client *client, struct ibp_device *device);
++int ibp_get_device(struct ibp_device *device);
++void ibp_put_device(struct ibp_device *device);
++
++#endif /* SERVER_H */
+diff -urN a6/drivers/infiniband/ibp/drv/server_msg.c a7/drivers/infiniband/ibp/drv/server_msg.c
+--- a6/drivers/infiniband/ibp/drv/server_msg.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/server_msg.c 2015-02-23 10:01:30.292769309 -0800
+@@ -0,0 +1,3098 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include <linux/delay.h>
++
++#include "server.h"
++#include "stack.h"
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
++ #define MUNMAP(x,y,z) \
++ do { \
++ down_write(&current->mm->mmap_sem); \
++ do_munmap(x,y,z); \
++ up_write(&current->mm->mmap_sem); \
++ } while (0)
++#else
++ #define MUNMAP(x,y,z) \
++ vm_munmap((unsigned long)y,z)
++#endif
++
++static struct ibp_stack *o_stack;
++static struct ibp_stack *a_stack;
++static struct ibp_stack *c_stack;
++
++/*
++ * umem functions
++ */
++static int ibp_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
++{
++ size_t bytes;
++
++ bytes = min(len, udata->inlen);
++
++ memcpy(dest, udata->inbuf, bytes);
++ if (bytes < len) {
++ memset(dest + bytes, 0, len - bytes);
++ return -EFAULT;
++ }
++ return 0;
++}
++
++static int ibp_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
++{
++ size_t bytes;
++
++ bytes = min(len, udata->outlen);
++
++ memcpy(udata->outbuf, src, bytes);
++ udata->outlen -= bytes;
++
++ return (bytes < len) ? -EFAULT : 0;
++}
++
++static struct ib_udata_ops ibp_copy = {
++ .copy_from = ibp_copy_from_udata,
++ .copy_to = ibp_copy_to_udata
++};
++
++#ifdef MOFED
++
++static struct ibp_reg *__ibp_find_reg(struct ibp_ucontext *ucontext,
++ unsigned long virt, size_t size)
++{
++ struct rb_node *node;
++ struct ibp_reg *reg;
++
++ node = ucontext->reg_tree.rb_node;
++
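++ /* Registrations are ordered by virt_addr first, then by length. */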
++ while (node) {
++ reg = rb_entry(node, struct ibp_reg, node);
++
++ if ((virt == reg->virt_addr) &&
++ (size == reg->length))
++ return reg;
++
++ if (virt < reg->virt_addr)
++ node = node->rb_left;
++ else if (virt > reg->virt_addr)
++ node = node->rb_right;
++ else if (size < reg->length)
++ node = node->rb_left;
++ else if (size > reg->length)
++ node = node->rb_right;
++ else
++ node = node->rb_right;
++ }
++
++ return ERR_PTR(-EFAULT);
++}
++
++static struct ibp_reg *ibp_find_reg(struct ibp_ucontext *ucontext,
++ unsigned long virt, size_t size)
++{
++ struct ibp_reg *reg;
++
++ mutex_lock(&ucontext->mutex);
++ reg = __ibp_find_reg(ucontext, virt, size);
++ mutex_unlock(&ucontext->mutex);
++
++ return reg;
++}
++
++/* ibp_peer_acquire return code: 1 mine, 0 not mine */
++static int ibp_peer_acquire(unsigned long addr,
++ size_t size, void* peer_mem_private_data,
++ char* peer_mem_name, void** client_context)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_reg *reg;
++
++ /* Verify private data is ours before ibp_ucontext cast. */
++ if (!peer_mem_name || !peer_mem_private_data ||
++ strncmp(peer_mem_name, ibp_peer_mem.name,
++ sizeof(ibp_peer_mem.name)))
++ return 0;
++
++ ucontext = (struct ibp_ucontext *) peer_mem_private_data;
++
++ reg = ibp_find_reg(ucontext, addr, size);
++ if (IS_ERR(reg)) {
++ print_err("ibp_find_reg returned %d\n", (int)PTR_ERR(reg));
++ return 0;
++ }
++
++ *client_context = (void *) reg;
++
++ return 1;
++}
++
++static int ibp_peer_get_pages(unsigned long addr, size_t size, int write,
++ int force, struct sg_table *sg_head,
++ void* client_context, void* core_context)
++{
++ struct ibp_reg *reg;
++ struct page *page;
++ struct scatterlist *sg;
++ void **va;
++ int npages, off, i, ret;
++
++ reg = (struct ibp_reg *) client_context;
++
++ off = (addr - reg->virt_addr) + reg->offset;
++ npages = PAGE_ALIGN(size + (off & ~PAGE_MASK)) >> PAGE_SHIFT;
++
++ ret = sg_alloc_table(sg_head, npages, GFP_KERNEL);
++ if (ret)
++ return ret;
++
++ va = reg->range->va;
++
++ for_each_sg(sg_head->sgl, sg, npages, i) {
++ page = vmalloc_to_page(va[i]);
++ if (!page) {
++ print_err("vmalloc_to_page failed\n");
++ ret = -EINVAL;
++ goto err;
++ }
++ sg_set_page(sg, page, PAGE_SIZE, 0);
++ }
++
++ return 0;
++err:
++ sg_free_table(sg_head);
++ return ret;
++}
++
++static int ibp_peer_dma_map(struct sg_table *sg_head, void *client_context,
++ struct device *dma_device, int dmasync, int *nmap)
++{
++ DEFINE_DMA_ATTRS(attrs);
++ int ret = 0;
++
++ if (dmasync)
++ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
++
++ *nmap = dma_map_sg_attrs(dma_device,
++ sg_head->sgl,
++ sg_head->orig_nents,
++ DMA_BIDIRECTIONAL,
++ &attrs);
++
++ if (*nmap > 0)
++ sg_head->nents = *nmap;
++ else
++ ret = -ENOMEM;
++
++ return ret;
++}
++
++static int ibp_peer_dma_umap(struct sg_table *sg_head, void *client_context,
++ struct device *dma_device)
++{
++ dma_unmap_sg(dma_device,
++ sg_head->sgl,
++ sg_head->nents,
++ DMA_BIDIRECTIONAL);
++ return 0;
++}
++
++static void ibp_peer_put_pages(struct sg_table *sg_head, void *client_context)
++{
++ sg_free_table(sg_head);
++}
++
++static unsigned long ibp_peer_get_page_size(void *client_context)
++{
++ return PAGE_SIZE;
++}
++
++struct peer_memory_client ibp_peer_mem = {
++ .name = DRV_NAME,
++ .version = DRV_VERSION,
++ .acquire = &ibp_peer_acquire,
++ .get_pages = &ibp_peer_get_pages,
++ .dma_map = &ibp_peer_dma_map,
++ .dma_unmap = &ibp_peer_dma_umap,
++ .put_pages = &ibp_peer_put_pages,
++ .get_page_size = &ibp_peer_get_page_size,
++};
++
++#else /* MOFED */
++
++static struct ibp_reg *__ibp_find_reg(struct ibp_ucontext *ucontext,
++ unsigned long virt, size_t size,
++ int access)
++{
++ struct rb_node *node;
++ struct ibp_reg *reg;
++
++ node = ucontext->reg_tree.rb_node;
++
++ while (node) {
++ reg = rb_entry(node, struct ibp_reg, node);
++
++ if ((virt == reg->virt_addr) &&
++ (size == reg->length) &&
++ (access == reg->access))
++ return reg;
++
++ if (virt < reg->virt_addr)
++ node = node->rb_left;
++ else if (virt > reg->virt_addr)
++ node = node->rb_right;
++ else if (size < reg->length)
++ node = node->rb_left;
++ else if (size > reg->length)
++ node = node->rb_right;
++ else if (access < reg->access)
++ node = node->rb_left;
++ else
++ node = node->rb_right;
++ }
++
++ return ERR_PTR(-EFAULT);
++}
++
++static struct ibp_reg *ibp_find_reg(struct ibp_ucontext *ucontext,
++ unsigned long virt, size_t size,
++ int access)
++{
++ struct ibp_reg *reg;
++
++ mutex_lock(&ucontext->mutex);
++ reg = __ibp_find_reg(ucontext, virt, size, access);
++ mutex_unlock(&ucontext->mutex);
++
++ return reg;
++}
++
++static void __ibp_umem_release(struct ib_device *dev, struct ib_umem *umem,
++ int dirty)
++{
++ struct scatterlist *sg;
++ int i;
++
++ if (umem->nmap > 0)
++ ib_dma_unmap_sg(dev, umem->sg_head.sgl,
++ umem->nmap, DMA_BIDIRECTIONAL);
++
++ if (umem->writable && dirty)
++ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i)
++ set_page_dirty_lock(sg_page(sg));
++
++ sg_free_table(&umem->sg_head);
++}
++
++static struct ib_umem *ibp_umem_get(struct ib_ucontext *ibucontext,
++ unsigned long addr, size_t size,
++ int access, int dmasync)
++{
++ struct ibp_reg *reg;
++ struct ib_umem *umem;
++ struct device *dma_device;
++ struct page *page;
++ struct scatterlist *sg;
++ void **va;
++ dma_addr_t *pa;
++ dma_addr_t daddr;
++ unsigned int dsize;
++ int npages;
++ int off;
++ int i;
++ int ret = 0;
++
++ DEFINE_DMA_ATTRS(attrs);
++
++ reg = ibp_find_reg(ibucontext->umem_private_data, addr, size, access);
++ if (IS_ERR(reg))
++ return ERR_CAST(reg);
++
++ if (dmasync)
++ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
++
++ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
++ if (!umem) {
++ print_err("kalloc failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ umem->length = size;
++ umem->offset = addr & ~PAGE_MASK;
++ umem->page_size = PAGE_SIZE;
++ umem->pid = get_task_pid(current, PIDTYPE_PID);
++ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
++
++ dsize = 0;
++ daddr = 0;
++ va = reg->range->va;
++ pa = reg->range->phys_addr;
++ dma_device = ibucontext->device->dma_device;
++ off = (addr - reg->virt_addr) + reg->offset;
++ npages = PAGE_ALIGN(size + (off & ~PAGE_MASK)) >> PAGE_SHIFT;
++ off >>= PAGE_SHIFT;
++
++ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
++ if (ret) {
++ print_err("sg_alloc_table failed\n");
++ goto err1;
++ }
++
++ /* Assume hugetlb unless proven otherwise. */
++ umem->hugetlb = 1;
++ for (i = 0; i < npages && umem->hugetlb; i++) {
++ if (!dsize) {
++ dsize = PAGE_SIZE;
++ daddr = pa[i + off];
++ /* Page must start on a huge page boundary. */
++ if ((daddr & ~HPAGE_MASK) >= PAGE_SIZE)
++ umem->hugetlb = 0;
++ } else if (daddr + dsize != pa[i + off])
++ /* Pages must be contiguous. */
++ umem->hugetlb = 0;
++ else {
++ dsize += PAGE_SIZE;
++ if (dsize == HPAGE_SIZE)
++ dsize = 0;
++ }
++ }
++ /* Page must end on a huge page boundary. */
++ if (umem->hugetlb && ((daddr + dsize) & ~HPAGE_MASK))
++ umem->hugetlb = 0;
++
++ for_each_sg(umem->sg_head.sgl, sg, npages, i) {
++ page = vmalloc_to_page(va[i]);
++ if (!page) {
++ print_err("vmalloc_to_page failed\n");
++ ret = -EINVAL;
++ goto err2;
++ }
++ sg_set_page(sg, page, PAGE_SIZE, 0);
++ }
++
++ umem->npages = npages;
++
++ umem->nmap = ib_dma_map_sg_attrs(ibucontext->device,
++ umem->sg_head.sgl,
++ umem->npages,
++ DMA_BIDIRECTIONAL,
++ &attrs);
++ if (umem->nmap <= 0) {
++ print_err("map_sg_attrs failed\n");
++ ret = -ENOMEM;
++ goto err2;
++ }
++
++ return umem;
++err2:
++ __ibp_umem_release(ibucontext->device, umem, 0);
++err1:
++ put_pid(umem->pid);
++ kfree(umem);
++ return ERR_PTR(ret);
++}
++
++static void ibp_umem_release(struct ib_umem *umem)
++{
++ struct ib_ucontext *ibucontext;
++
++ ibucontext = umem->context;
++
++ __ibp_umem_release(ibucontext->device, umem, 0);
++
++ put_pid(umem->pid);
++ kfree(umem);
++}
++
++static struct ib_umem_ops ibp_umem = {
++ .get = &ibp_umem_get,
++ .release = &ibp_umem_release,
++};
++
++#endif /* MOFED */
++
++static int ibp_send(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_send(ep, buf, (uint32_t) len, SCIF_SEND_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_send returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++static int ibp_recv(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_recv(ep, buf, (uint32_t) len, SCIF_RECV_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_recv returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++int ibp_send_add(struct ibp_client *client, struct ibp_device *device)
++{
++ struct pci_dev *pdev;
++ struct ibp_add_device_msg msg;
++
++ print_trace("in\n");
++
++ pdev = to_pci_dev(device->ib_dev->dma_device);
++
++ IBP_INIT_MSG(device, &msg, sizeof(msg), ADD_DEVICE);
++
++ strncpy(msg.data.name, device->ib_dev->name, sizeof(msg.data.name));
++ msg.data.vendor_id = pdev->vendor;
++ msg.data.device_id = pdev->device;
++
++ msg.data.ib_device = (uintptr_t) device->ib_dev;
++ msg.data.device = (uintptr_t) device;
++ msg.data.node_guid = device->ib_dev->node_guid;
++ msg.data.uverbs_cmd_mask = device->ib_dev->uverbs_cmd_mask;
++ msg.data.uverbs_abi_ver = device->ib_dev->uverbs_abi_ver;
++ msg.data.ibp_abi_ver = IBP_ABI_VERSION;
++ msg.data.num_comp_vectors = device->ib_dev->num_comp_vectors;
++ msg.data.phys_port_cnt = device->ib_dev->phys_port_cnt;
++
++ return ibp_send(client->ep, &msg, sizeof(msg));
++}
++
++int ibp_send_remove(struct ibp_client *client, struct ibp_device *device)
++{
++ struct ibp_remove_device_msg msg;
++
++ print_trace("in\n");
++
++ IBP_INIT_MSG(device, &msg, sizeof(msg), REMOVE_DEVICE);
++ return ibp_send(client->ep, &msg, sizeof(msg));
++}
++
++static void ibp_send_queued_response(struct work_struct *work)
++{
++ struct ibp_queued_response_msg *msg;
++ struct ibp_queued_response *resp;
++
++ resp = container_of(work, struct ibp_queued_response, work);
++ msg = (struct ibp_queued_response_msg *) resp->msg;
++
++ ibp_send(resp->client->ep, msg, msg->header.length);
++ kfree(resp);
++}
++
++static int ibp_queue_response(struct ibp_client *client,
++ struct ibp_queued_response_msg *msg)
++{
++ struct ibp_queued_response *resp;
++ size_t len;
++
++ len = sizeof(*resp) + msg->header.length;
++
++ resp = kmalloc(len, GFP_ATOMIC);
++ if (!resp) {
++ print_err("kalloc failed\n");
++ return -ENOMEM;
++ }
++
++ resp->client = client;
++ memcpy(&resp->msg, msg, msg->header.length);
++
++ /* Queue to serialize behind any associated events. */
++ INIT_WORK(&resp->work, ibp_send_queued_response);
++ queue_work(client->workqueue, &resp->work);
++
++ return 0;
++}
++
++static int ibp_cmd_error(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf, int ret)
++{
++ struct ibp_verb_response_msg *msg;
++ size_t len;
++
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ IBP_INIT_RESP(hdr->device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_bad_request(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ print_dbg("opcode 0x%x\n", hdr->opcode);
++ return ibp_cmd_error(client, hdr, tx_buf, -EBADRQC);
++}
++
++static int ibp_cmd_not_supported(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ print_dbg("opcode 0x%x\n", hdr->opcode);
++ return ibp_cmd_error(client, hdr, tx_buf, -ENOSYS);
++}
++
++static int ibp_cmd_query_device(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_device_resp *resp;
++ struct ib_device_attr attr;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_device(device->ib_dev, &attr);
++ if (ret) {
++ print_err("ib_query_device returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_device_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->fw_ver = attr.fw_ver;
++ resp->sys_image_guid = attr.sys_image_guid;
++ resp->max_mr_size = attr.max_mr_size;
++ resp->page_size_cap = attr.page_size_cap;
++ resp->vendor_id = attr.vendor_id;
++ resp->vendor_part_id = attr.vendor_part_id;
++ resp->hw_ver = attr.hw_ver;
++ resp->max_qp = attr.max_qp;
++ resp->max_qp_wr = attr.max_qp_wr;
++ resp->device_cap_flags = attr.device_cap_flags;
++ resp->max_sge = attr.max_sge;
++ resp->max_sge_rd = attr.max_sge_rd;
++ resp->max_cq = attr.max_cq;
++ resp->max_cqe = attr.max_cqe;
++ resp->max_mr = attr.max_mr;
++ resp->max_pd = attr.max_pd;
++ resp->max_qp_rd_atom = attr.max_qp_rd_atom;
++ resp->max_ee_rd_atom = attr.max_ee_rd_atom;
++ resp->max_res_rd_atom = attr.max_res_rd_atom;
++ resp->max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
++ resp->max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
++ resp->atomic_cap = attr.atomic_cap;
++ resp->masked_atomic_cap = attr.masked_atomic_cap;
++ resp->max_ee = attr.max_ee;
++ resp->max_rdd = attr.max_rdd;
++ resp->max_mw = attr.max_mw;
++ resp->max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
++ resp->max_raw_ethy_qp = attr.max_raw_ethy_qp;
++ resp->max_mcast_grp = attr.max_mcast_grp;
++ resp->max_mcast_qp_attach = attr.max_mcast_qp_attach;
++ resp->max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
++ resp->max_ah = attr.max_ah;
++ resp->max_fmr = attr.max_fmr;
++ resp->max_map_per_fmr = attr.max_map_per_fmr;
++ resp->max_srq = attr.max_srq;
++ resp->max_srq_wr = attr.max_srq_wr;
++ resp->max_srq_sge = attr.max_srq_sge;
++ resp->max_fast_reg_page_list_len = attr.max_fast_reg_page_list_len;
++ resp->max_pkeys = attr.max_pkeys;
++ resp->local_ca_ack_delay = attr.local_ca_ack_delay;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_port(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_port_cmd *cmd;
++ struct ibp_query_port_resp *resp;
++ struct ib_port_attr attr;
++ size_t len;
++ int ret;
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_port_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_port(device->ib_dev, cmd->port_num, &attr);
++ if (ret) {
++ print_err("ib_query_port returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_port_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->state = attr.state;
++ resp->max_mtu = attr.max_mtu;
++ resp->active_mtu = attr.active_mtu;
++ resp->gid_tbl_len = attr.gid_tbl_len;
++ resp->port_cap_flags = attr.port_cap_flags;
++ resp->max_msg_sz = attr.max_msg_sz;
++ resp->bad_pkey_cntr = attr.bad_pkey_cntr;
++ resp->qkey_viol_cntr = attr.qkey_viol_cntr;
++ resp->pkey_tbl_len = attr.pkey_tbl_len;
++ resp->lid = attr.lid;
++ resp->sm_lid = attr.sm_lid;
++ resp->lmc = attr.lmc;
++ resp->max_vl_num = attr.max_vl_num;
++ resp->sm_sl = attr.sm_sl;
++ resp->subnet_timeout = attr.subnet_timeout;
++ resp->init_type_reply = attr.init_type_reply;
++ resp->active_width = attr.active_width;
++ resp->active_speed = attr.active_speed;
++ resp->phys_state = attr.phys_state;
++ resp->link_layer = rdma_port_get_link_layer(device->ib_dev,
++ cmd->port_num);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_gid(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_gid_cmd *cmd;
++ struct ibp_query_gid_resp *resp;
++ size_t len;
++ union ib_gid gid;
++ int ret;
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_gid_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_gid(device->ib_dev, cmd->port_num, cmd->index, &gid);
++ if (ret) {
++ print_err("ib_query_gid returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_gid_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->subnet_prefix = gid.global.subnet_prefix;
++ resp->interface_id = gid.global.interface_id;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_pkey(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_pkey_cmd *cmd;
++ struct ibp_query_pkey_resp *resp;
++ size_t len;
++ u16 pkey;
++ int ret;
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_pkey_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_pkey(device->ib_dev, cmd->port_num, cmd->index, &pkey);
++ if (ret) {
++ print_err("ib_query_pkey returned %d\n", ret);
++ goto send_resp;
++ }
++ resp = (struct ibp_query_pkey_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->pkey = pkey;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
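++/*
++ * Asynchronous IB events are forwarded to the remote client from a
++ * workqueue: ibp_event_handler queues the work and ibp_async_event
++ * sends the event over the SCIF endpoint, then returns the buffer
++ * to a_stack.
++ */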
++static void ibp_async_event(struct work_struct *work)
++{
++ struct ibp_event *event;
++ struct ibp_async_event_msg msg;
++
++ event = container_of(work, struct ibp_event, work);
++
++ IBP_INIT_MSG(NULL, &msg, sizeof(msg), ASYNC_EVENT);
++
++ msg.data.context = (uintptr_t) event->context;
++ msg.data.type = event->type;
++
++ ibp_send(event->client->ep, &msg, sizeof(msg));
++
++ ibp_add_to_stack(a_stack, (void *) event);
++}
++
++static void ibp_event_handler(struct ib_event_handler *handler,
++ struct ib_event *ibevent)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_client *client;
++ struct ibp_event *event;
++
++ ucontext = container_of(handler, struct ibp_ucontext, event_handler);
++
++ if (ucontext->ibucontext->closing) {
++ print_dbg("ignoring event, connection closing\n");
++ return;
++ }
++
++ event = (struct ibp_event *)
++ ibp_pull_from_stack(a_stack, sizeof(*event), GFP_ATOMIC);
++ if (!event) {
++ print_err("ibp_pull_from_stack failed\n");
++ return;
++ }
++
++ client = ucontext->client;
++
++ event->client = client;
++ event->context = ibevent->element.port_num;
++ event->type = ibevent->event;
++ event->ibdev = ucontext->ibdev;
++
++ INIT_WORK(&event->work, ibp_async_event);
++ queue_work(client->workqueue, &event->work);
++}
++
++static int ibp_mmap(struct file *filp, struct vm_area_struct *vma)
++{
++ struct ibp_ucontext *ucontext;
++ struct ib_ucontext *ibucontext;
++
++ ucontext = filp->private_data;
++ ibucontext = ucontext->ibucontext;
++
++ return (ibucontext->device->mmap) ?
++ ibucontext->device->mmap(ibucontext, vma) : -ENOSYS;
++}
++
++static const struct file_operations ibp_fops = {
++ .mmap = ibp_mmap,
++};
++
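++/*
++ * Allocate a device ucontext on behalf of the remote client.  Client
++ * private data is passed through in ib_udata, and the umem/peer-mem
++ * hooks are pointed back at this module.
++ */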
++static int ibp_cmd_alloc_ucontext(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_alloc_ucontext_cmd *cmd;
++ struct ibp_alloc_ucontext_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_ucontext *ibucontext;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_alloc_ucontext_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_alloc_ucontext_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ /* Workaround for the len check in the mlx5 driver (no impact on other drivers) */
++ len += sizeof(struct ib_uverbs_cmd_hdr);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ ret = ibp_get_device(device);
++ if (ret) {
++ print_err("ibp_get_device returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL);
++ if (!ucontext) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto err1;
++ }
++ ucontext->device = device;
++
++ ibucontext = device->ib_dev->alloc_ucontext(device->ib_dev, &udata);
++ if (IS_ERR(ibucontext)) {
++ ret = PTR_ERR(ibucontext);
++ print_err("Invalid ibucontext %p\n", ibucontext);
++ goto err2;
++ }
++
++#ifdef MOFED
++ ibucontext->peer_mem_name = ibp_peer_mem.name;
++ ibucontext->peer_mem_private_data = ucontext;
++#else
++ ibucontext->umem_ops = &ibp_umem;
++ ibucontext->umem_private_data = ucontext;
++#endif
++
++ ibucontext->device = device->ib_dev;
++ ibucontext->closing = 0;
++
++ INIT_LIST_HEAD(&ibucontext->pd_list);
++ INIT_LIST_HEAD(&ibucontext->mr_list);
++ INIT_LIST_HEAD(&ibucontext->mw_list);
++ INIT_LIST_HEAD(&ibucontext->cq_list);
++ INIT_LIST_HEAD(&ibucontext->qp_list);
++ INIT_LIST_HEAD(&ibucontext->srq_list);
++ INIT_LIST_HEAD(&ibucontext->ah_list);
++ INIT_LIST_HEAD(&ibucontext->xrcd_list);
++
++ ucontext->filp = anon_inode_getfile("["DRV_NAME"]", &ibp_fops,
++ ucontext, O_RDWR);
++ if (IS_ERR(ucontext->filp)) {
++ ret = PTR_ERR(ucontext->filp);
++ print_err("anon_inode_getfile returned %d\n", ret);
++ goto err3;
++ }
++
++ if (cmd->ibdev) {
++ ucontext->ibdev = cmd->ibdev;
++ INIT_IB_EVENT_HANDLER(&ucontext->event_handler, device->ib_dev,
++ ibp_event_handler);
++ ret = ib_register_event_handler(&ucontext->event_handler);
++ if (ret) {
++ print_err("event_handler returned %d\n", ret);
++ goto err4;
++ }
++ }
++
++ ucontext->client = client;
++ ucontext->ibucontext = ibucontext;
++ mutex_init(&ucontext->mutex);
++ INIT_LIST_HEAD(&ucontext->mmap_list);
++ ucontext->reg_tree = RB_ROOT;
++
++ mutex_lock(&client->ucontext_mutex);
++ list_add_tail(&ucontext->list, &client->ucontext_list);
++ mutex_unlock(&client->ucontext_mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->ucontext = (uintptr_t)ucontext;
++
++ goto send_resp;
++
++err4:
++ fput(ucontext->filp);
++err3:
++ device->ib_dev->dealloc_ucontext(ibucontext);
++err2:
++ kfree(ucontext);
++err1:
++ ibp_put_device(device);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_dealloc_ucontext(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_dealloc_ucontext_cmd *cmd;
++ struct ibp_queued_response_msg *msg;
++ struct ibp_ucontext *ucontext;
++ struct ib_ucontext *ibucontext;
++ size_t len;
++ int ret = -EINVAL;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_dealloc_ucontext_cmd *) hdr;
++ ucontext = (struct ibp_ucontext *) cmd->ucontext;
++ msg = (struct ibp_queued_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ if (IS_NULL_OR_ERR(ucontext)) {
++ print_err("Invalid ucontext %p\n", ucontext);
++ goto send_resp;
++ }
++
++ ibucontext = ucontext->ibucontext;
++
++ if (ucontext->ibdev)
++ ib_unregister_event_handler(&ucontext->event_handler);
++
++ fput(ucontext->filp);
++
++ if (device && device->ib_dev) {
++ ret = device->ib_dev->dealloc_ucontext(ibucontext);
++ if (ret) {
++ print_err("ib_dealloc_ucontext returned %d\n", ret);
++ goto send_resp;
++ }
++ }
++
++ mutex_lock(&client->ucontext_mutex);
++ list_del(&ucontext->list);
++ mutex_unlock(&client->ucontext_mutex);
++
++ ibp_put_device(device);
++ kfree(ucontext);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret);
++ return ibp_queue_response(client, msg);
++}
++
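++/*
++ * kref release callback for a registration: unlink it from the
++ * ucontext rb-tree and release its SCIF pages.
++ */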
++static void ibp_dereg_buf(struct kref *ref)
++{
++ struct ibp_reg *reg;
++ struct ibp_ucontext *ucontext;
++
++ reg = container_of(ref, struct ibp_reg, ref);
++ ucontext = reg->ucontext;
++
++ if (!RB_EMPTY_NODE(&reg->node)) {
++ mutex_lock(&ucontext->mutex);
++ rb_erase(&reg->node, &ucontext->reg_tree);
++ mutex_unlock(&ucontext->mutex);
++ }
++
++ if (reg->range)
++ scif_put_pages(reg->range);
++
++ kfree(reg);
++}
++
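++/*
++ * Insert a registration into the ucontext rb-tree, ordered by virtual
++ * address, length (and access flags on non-MOFED builds).  If a
++ * matching entry already exists it is returned instead.
++ */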
++static struct ibp_reg *__ibp_insert_reg_buf(struct ibp_ucontext *ucontext,
++ struct ibp_reg *reg)
++{
++ struct rb_node **link;
++ struct rb_node *parent;
++ struct ibp_reg *cur_reg;
++
++ link = &ucontext->reg_tree.rb_node;
++ parent = NULL;
++
++ while (*link) {
++ parent = *link;
++ cur_reg = rb_entry(parent, struct ibp_reg, node);
++
++#ifdef MOFED
++ if ((reg->virt_addr == cur_reg->virt_addr) &&
++ (reg->length == cur_reg->length))
++ return cur_reg;
++#else
++ if ((reg->virt_addr == cur_reg->virt_addr) &&
++ (reg->length == cur_reg->length) &&
++ (reg->access == cur_reg->access))
++ return cur_reg;
++#endif
++
++ if (reg->virt_addr < cur_reg->virt_addr)
++ link = &(*link)->rb_left;
++ else if (reg->virt_addr > cur_reg->virt_addr)
++ link = &(*link)->rb_right;
++ else if (reg->length < cur_reg->length)
++ link = &(*link)->rb_left;
++ else if (reg->length > cur_reg->length)
++ link = &(*link)->rb_right;
++#ifndef MOFED
++ else if (reg->access < cur_reg->access)
++ link = &(*link)->rb_left;
++#endif
++ else
++ link = &(*link)->rb_right;
++ }
++
++ rb_link_node(&reg->node, parent, link);
++ rb_insert_color(&reg->node, &ucontext->reg_tree);
++
++ return NULL;
++}
++
++static struct ibp_reg *ibp_reg_buf(struct ibp_ucontext *ucontext,
++ u64 virt_addr, u64 scif_addr, u64 length,
++ u64 offset, u32 access)
++{
++ struct ibp_reg *reg;
++ struct ibp_reg *cur_reg;
++ int ret;
++
++ reg = kzalloc(sizeof(*reg), GFP_KERNEL);
++ if (!reg) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ kref_init(&reg->ref);
++ RB_CLEAR_NODE(&reg->node);
++ reg->ucontext = ucontext;
++ reg->virt_addr = virt_addr;
++ reg->length = length;
++ reg->offset = offset;
++ reg->access = access;
++
++ ret = scif_get_pages(ucontext->client->ep, scif_addr,
++ PAGE_ALIGN(reg->length +
++ (reg->virt_addr & ~PAGE_MASK)),
++ &reg->range);
++ if (ret) {
++ print_err("scif_get_pages returned %d\n", ret);
++ kref_put(&reg->ref, ibp_dereg_buf);
++ return ERR_PTR(ret);
++ }
++
++ mutex_lock(&ucontext->mutex);
++
++ cur_reg = __ibp_insert_reg_buf(ucontext, reg);
++ if (cur_reg) {
++ print_dbg("__ibp_insert_reg_buf duplicate entry\n");
++ kref_get(&cur_reg->ref);
++ }
++
++ mutex_unlock(&ucontext->mutex);
++
++ if (cur_reg) {
++ kref_put(&reg->ref, ibp_dereg_buf);
++ reg = cur_reg;
++ }
++
++ return reg;
++}
++
++static int ibp_cmd_reg_buf(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_reg_buf_cmd *cmd;
++ struct ibp_reg_buf_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_reg *reg;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_reg_buf_cmd *) hdr;
++ ucontext = (struct ibp_ucontext *) cmd->ucontext;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ reg = ibp_reg_buf(ucontext, cmd->virt_addr, cmd->scif_addr,
++ cmd->length, cmd->offset, cmd->access);
++ if (IS_ERR(reg)) {
++ ret = PTR_ERR(reg);
++ print_err("ibp_reg_buf returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_reg_buf_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->reg = (uintptr_t)reg;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_dereg_buf(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_dereg_buf_cmd *cmd;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_reg *reg;
++ size_t len;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_dereg_buf_cmd *) hdr;
++ reg = (struct ibp_reg *) cmd->reg;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ kref_put(&reg->ref, ibp_dereg_buf);
++
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, 0);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_convert_prot_flags(unsigned long prot)
++{
++ int prot_flags;
++
++ prot_flags = 0;
++
++ if (prot & PROT_READ)
++ prot_flags |= SCIF_PROT_READ;
++
++ if (prot & PROT_WRITE)
++ prot_flags |= SCIF_PROT_WRITE;
++
++ return prot_flags;
++}
++
++static int ibp_convert_map_flags(unsigned long flags)
++{
++ int map_flags;
++
++ map_flags = SCIF_MAP_KERNEL;
++
++ if (flags & MAP_FIXED)
++ map_flags |= SCIF_MAP_FIXED;
++
++ return map_flags;
++}
++
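++/*
++ * Map the single page backing the client's mmap with ioremap and
++ * expose it to the remote endpoint through scif_register.
++ */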
++static int ibp_scif_register(struct ibp_client *client, struct ibp_mmap *mmap,
++ unsigned long flags)
++{
++ struct vm_area_struct *vma;
++ unsigned long npages;
++ unsigned long pfn;
++ int offset;
++ int ret;
++
++ print_trace("in\n");
++
++ offset = mmap->vaddr & ~PAGE_MASK;
++ npages = PAGE_ALIGN(mmap->len + offset) >> PAGE_SHIFT;
++ if (npages != 1) {
++ print_err("request %lu but only one page supported\n", npages);
++ return -EINVAL;
++ }
++
++ down_write(&current->mm->mmap_sem);
++ vma = find_vma(current->mm, mmap->vaddr);
++ if (!vma) {
++ up_write(&current->mm->mmap_sem);
++ print_err("find_vma failed\n");
++ return -EFAULT;
++ }
++
++ ret = follow_pfn(vma, mmap->vaddr, &pfn);
++
++ up_write(&current->mm->mmap_sem);
++ if (ret) {
++ print_err("follow_pfn returned %d\n", ret);
++ return ret;
++ }
++
++ mmap->io_addr = ioremap(page_to_phys(pfn_to_page(pfn)), mmap->len);
++ if (!mmap->io_addr) {
++ print_err("ioremap failed\n");
++ return -ENOMEM;
++ }
++
++ mmap->scif_addr = scif_register(client->ep, (void *) mmap->io_addr,
++ mmap->len, (off_t) mmap->io_addr,
++ ibp_convert_prot_flags(mmap->prot),
++ ibp_convert_map_flags(flags));
++ if (IS_ERR_VALUE(mmap->scif_addr)) {
++ ret = mmap->scif_addr;
++ print_err("scif_register returned %d\n", ret);
++ goto err0;
++ }
++
++ return 0;
++err0:
++ iounmap(mmap->io_addr);
++ return ret;
++}
++
++static
++void ibp_scif_unregister(struct ibp_client *client, struct ibp_mmap *mmap)
++{
++ int ret;
++
++ print_trace("in\n");
++
++ ret = scif_unregister(client->ep, mmap->scif_addr, mmap->len);
++ if (ret) {
++ if (ret == -ECONNRESET)
++ print_dbg("scif connection reset\n");
++ else
++ print_err("scif_unregister returned %d\n", ret);
++ }
++
++ iounmap(mmap->io_addr);
++}
++
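++/*
++ * Perform the device mmap on the client's behalf via the anonymous
++ * inode file, then register the resulting mapping with SCIF and return
++ * the SCIF address to the client.
++ */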
++static int ibp_cmd_mmap(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_mmap_cmd *cmd;
++ struct ibp_mmap_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_mmap *mmap;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_mmap_cmd *) hdr;
++ ucontext = (struct ibp_ucontext *) cmd->ucontext;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ mmap = kzalloc(sizeof(*mmap), GFP_KERNEL);
++ if (!mmap) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++ mmap->ucontext = ucontext;
++ mmap->len = cmd->len;
++ mmap->prot = cmd->prot;
++
++ /* The mmap syscall ignores these bits; do the same here. */
++ cmd->flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
++ down_write(&current->mm->mmap_sem);
++ mmap->vaddr = do_mmap_pgoff(ucontext->filp, 0, cmd->len,
++ cmd->prot, cmd->flags, cmd->pgoff);
++ up_write(&current->mm->mmap_sem);
++#else
++ mmap->vaddr = vm_mmap(ucontext->filp, 0, cmd->len, cmd->prot,
++ cmd->flags, cmd->pgoff << PAGE_SHIFT);
++#endif
++
++ if (mmap->vaddr & ~PAGE_MASK) {
++ ret = mmap->vaddr;
++ print_err("mmap returned %d\n", ret);
++ goto err1;
++ }
++
++ ret = ibp_scif_register(client, mmap, cmd->flags);
++ if (ret) {
++ print_err("ibp_scif_register returned %d\n", ret);
++ goto err2;
++ }
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&mmap->list, &ucontext->mmap_list);
++ mutex_unlock(&ucontext->mutex);
++
++ resp = (struct ibp_mmap_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->scif_addr = mmap->scif_addr;
++ resp->mmap = (uintptr_t)mmap;
++
++ goto send_resp;
++err2:
++ MUNMAP(current->mm, mmap->vaddr, cmd->len);
++err1:
++ kfree(mmap);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_unmmap(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_unmmap_cmd *cmd;
++ struct ibp_mmap *mmap;
++ struct ibp_verb_response_msg *msg;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_unmmap_cmd *) hdr;
++ mmap = (struct ibp_mmap *) cmd->mmap;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ if (IS_NULL_OR_ERR(mmap)) {
++ print_err("Invalid mmap %p\n", mmap);
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ ibp_scif_unregister(client, mmap);
++
++ if (IS_NULL_OR_ERR(current) || IS_NULL_OR_ERR(current->mm)) {
++ print_err("Invalid current mm pointer\n");
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ MUNMAP(current->mm, mmap->vaddr, mmap->len);
++
++ if (mmap->ucontext) {
++ mutex_lock(&mmap->ucontext->mutex);
++ list_del(&mmap->list);
++ mutex_unlock(&mmap->ucontext->mutex);
++ }
++
++ kfree(mmap);
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
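++/*
++ * uobjects are drawn from the preallocated o_stack and only partially
++ * initialized (see below); ibp_destroy_uobj unlinks them from their
++ * list and returns them to the stack.
++ */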
++static struct ib_uobject *ibp_create_uobj(struct ibp_ucontext *ucontext)
++{
++ static struct lock_class_key __key;
++ struct ib_uobject *uobj;
++
++ if (IS_NULL_OR_ERR(ucontext))
++ return ERR_PTR(-EINVAL);
++
++ uobj = (struct ib_uobject *)
++ ibp_pull_from_stack(o_stack, sizeof(*uobj), GFP_ATOMIC);
++ if (!uobj)
++ return ERR_PTR(-ENOMEM);
++
++ /*
++ * Since this is a kernel-to-kernel path, the uobj struct is only
++ * partially set up here; it is not fully initialized as in ib_uverbs.
++ */
++ uobj->context = ucontext->ibucontext;
++ uobj->user_handle = (uintptr_t)ucontext;
++ kref_init(&uobj->ref);
++ init_rwsem(&uobj->mutex);
++ lockdep_set_class(&uobj->mutex, &__key);
++ uobj->live = 1;
++
++ return uobj;
++}
++
++static void ibp_destroy_uobj(struct ib_uobject *uobj)
++{
++ struct ibp_ucontext *ucontext;
++
++ if (!IS_NULL_OR_ERR(uobj)) {
++ ucontext = (struct ibp_ucontext *) uobj->user_handle;
++ if (ucontext) {
++ mutex_lock(&ucontext->mutex);
++ list_del(&uobj->list);
++ mutex_unlock(&ucontext->mutex);
++ }
++
++ ibp_add_to_stack(o_stack, (void *) uobj);
++ }
++}
++
++static int ibp_cmd_alloc_pd(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_alloc_pd_cmd *cmd;
++ struct ibp_alloc_pd_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_udata udata;
++ struct ib_pd *pd;
++ size_t len;
++ size_t outlen;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_alloc_pd_cmd *) hdr;
++ ucontext = (struct ibp_ucontext *) cmd->ucontext;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_alloc_pd_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ goto send_resp;
++ }
++
++ pd = device->ib_dev->alloc_pd(device->ib_dev, ucontext->ibucontext,
++ &udata);
++ if (IS_ERR(pd)) {
++ ret = PTR_ERR(pd);
++ print_err("ib_alloc_pd returned %d\n", ret);
++ /*
++ * Clear uobj's user_handle: ibp_destroy_uobj would otherwise try to
++ * list_del the uobj, which has NOT been added to any list yet.
++ */
++ uobj->user_handle = 0;
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ pd->device = device->ib_dev;
++ atomic_set(&pd->usecnt, 0);
++
++ pd->uobject = uobj;
++ uobj->object = pd;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->pd_list);
++ mutex_unlock(&ucontext->mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->pd = (uintptr_t)pd;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_dealloc_pd(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_dealloc_pd_cmd *cmd;
++ struct ibp_verb_response_msg *msg;
++ struct ib_uobject *uobj;
++ struct ib_pd *pd;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_dealloc_pd_cmd *) hdr;
++ pd = (struct ib_pd *) cmd->pd;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ if (IS_NULL_OR_ERR(pd)) {
++ print_err("Invalid pd %p\n", pd);
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ uobj = pd->uobject;
++
++ ret = ib_dealloc_pd(pd);
++ if (unlikely(ret == -EBUSY)) {
++ msleep(100);
++ ret = ib_dealloc_pd(pd);
++ }
++ if (ret) {
++ print_err("ib_dealloc_pd returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_create_ah(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_create_ah_cmd *cmd;
++ struct ibp_create_ah_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_pd *pd;
++ struct ib_ah *ah;
++ struct ib_ah_attr attr;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_create_ah_cmd *) hdr;
++ pd = (struct ib_pd *) cmd->pd;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle;
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ goto send_resp;
++ }
++
++ memset(&attr, 0, sizeof(attr));
++
++ attr.dlid = cmd->ah_attr.dlid;
++ attr.sl = cmd->ah_attr.sl;
++ attr.src_path_bits = cmd->ah_attr.src_path_bits;
++ attr.static_rate = cmd->ah_attr.static_rate;
++ attr.ah_flags = cmd->ah_attr.ah_flags;
++ attr.port_num = cmd->ah_attr.port_num;
++ attr.grh.dgid.global.subnet_prefix =
++ cmd->ah_attr.grh.dgid_subnet_prefix;
++ attr.grh.dgid.global.interface_id = cmd->ah_attr.grh.dgid_interface_id;
++ attr.grh.flow_label = cmd->ah_attr.grh.flow_label;
++ attr.grh.sgid_index = cmd->ah_attr.grh.sgid_index;
++ attr.grh.hop_limit = cmd->ah_attr.grh.hop_limit;
++ attr.grh.traffic_class = cmd->ah_attr.grh.traffic_class;
++
++ ah = ib_create_ah(pd, &attr);
++ if (IS_ERR(ah)) {
++ ret = PTR_ERR(ah);
++ print_err("ib_create_ah returned %d\n", ret);
++ /*
++ * Clear uobj's user_handle: ibp_destroy_uobj would otherwise try to
++ * list_del the uobj, which has NOT been added to any list yet.
++ */
++ uobj->user_handle = 0;
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ ah->uobject = uobj;
++ uobj->object = ah;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->ah_list);
++ mutex_unlock(&ucontext->mutex);
++
++ resp = (struct ibp_create_ah_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->ah = (uintptr_t) ah;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_ah(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_query_ah_cmd *cmd;
++ struct ibp_query_ah_resp *resp;
++ struct ibp_verb_response_msg *msg;
++ struct ib_ah *ah;
++ struct ib_ah_attr attr;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_ah_cmd *) hdr;
++ ah = (struct ib_ah *) cmd->ah;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_ah(ah, &attr);
++ if (ret) {
++ print_err("ib_query_ah returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_ah_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->attr.dlid = attr.dlid;
++ resp->attr.sl = attr.sl;
++ resp->attr.src_path_bits = attr.src_path_bits;
++ resp->attr.static_rate = attr.static_rate;
++ resp->attr.ah_flags = attr.ah_flags;
++ resp->attr.port_num = attr.port_num;
++ resp->attr.grh.dgid_subnet_prefix = attr.grh.dgid.global.subnet_prefix;
++ resp->attr.grh.dgid_interface_id = attr.grh.dgid.global.interface_id;
++ resp->attr.grh.flow_label = attr.grh.flow_label;
++ resp->attr.grh.sgid_index = attr.grh.sgid_index;
++ resp->attr.grh.hop_limit = attr.grh.hop_limit;
++ resp->attr.grh.traffic_class = attr.grh.traffic_class;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_destroy_ah(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_destroy_ah_cmd *cmd;
++ struct ib_uobject *uobj;
++ struct ib_ah *ah;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_destroy_ah_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ ah = (struct ib_ah *) cmd->ah;
++ len = sizeof(*msg);
++
++ uobj = ah->uobject;
++
++ ret = ib_destroy_ah(ah);
++ if (ret) {
++ print_err("ib_destroy_ah returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static void ibp_ibsrq_event(struct ib_event *ibevent, void *srq_context)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_client *client;
++ struct ibp_event *event;
++ struct ib_uobject *uobj;
++
++ print_trace("in\n");
++
++ event = kmalloc(sizeof(*event), GFP_ATOMIC);
++ if (!event) {
++ print_err("kmalloc failed\n");
++ return;
++ }
++
++ uobj = ibevent->element.srq->uobject;
++ ucontext = (struct ibp_ucontext *) uobj->user_handle;
++ client = ucontext->client;
++
++ event->client = client;
++ event->context = (uintptr_t) srq_context;
++ event->type = ibevent->event;
++ event->ibdev = ucontext->ibdev;
++
++ INIT_WORK(&event->work, ibp_async_event);
++ queue_work(client->workqueue, &event->work);
++}
++
++static int ibp_cmd_create_srq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_create_srq_cmd *cmd;
++ struct ibp_create_srq_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_pd *pd;
++ struct ib_srq *srq;
++ struct ib_srq_init_attr init_attr;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_create_srq_cmd *) hdr;
++ pd = (struct ib_pd *) cmd->pd;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_create_srq_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle;
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ goto send_resp;
++ }
++
++ memset(&init_attr, 0, sizeof(init_attr));
++
++ init_attr.event_handler = ibp_ibsrq_event;
++ init_attr.srq_context = (void *) cmd->srq_context;
++ init_attr.attr.max_wr = cmd->attr.max_wr;
++ init_attr.attr.max_sge = cmd->attr.max_sge;
++ init_attr.attr.srq_limit = cmd->attr.srq_limit;
++
++ srq = device->ib_dev->create_srq(pd, &init_attr, &udata);
++ if (IS_ERR(srq)) {
++ ret = PTR_ERR(srq);
++ print_err("ib_create_srq returned %d\n", ret);
++ /*
++ * Clear uobj's user_handle: ibp_destroy_uobj would otherwise try to
++ * list_del the uobj, which has NOT been added to any list yet.
++ */
++ uobj->user_handle = 0;
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ srq->device = device->ib_dev;
++ srq->pd = pd;
++ srq->event_handler = init_attr.event_handler;
++ srq->srq_context = init_attr.srq_context;
++ srq->srq_type = 0;
++ srq->ext.xrc.cq = NULL;
++ srq->ext.xrc.xrcd = NULL;
++
++ atomic_inc(&pd->usecnt);
++ atomic_set(&srq->usecnt, 0);
++
++ srq->uobject = uobj;
++ uobj->object = srq;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->srq_list);
++ mutex_unlock(&ucontext->mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->srq = (uintptr_t)srq;
++ resp->attr.max_wr = init_attr.attr.max_wr;
++ resp->attr.max_sge = init_attr.attr.max_sge;
++ resp->attr.srq_limit = init_attr.attr.srq_limit;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_modify_srq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_modify_srq_cmd *cmd;
++ struct ibp_modify_srq_resp *resp;
++ struct ib_srq *srq;
++ struct ib_srq_attr attr;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_modify_srq_cmd *) hdr;
++ srq = (struct ib_srq *) cmd->srq;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_modify_srq_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ memset(&attr, 0, sizeof(attr));
++
++ attr.max_wr = cmd->attr.max_wr;
++ attr.max_sge = cmd->attr.max_sge;
++ attr.srq_limit = cmd->attr.srq_limit;
++
++ ret = device->ib_dev->modify_srq(srq, &attr, cmd->srq_attr_mask,
++ &udata);
++ if (ret) {
++ print_err("ib_modify_srq returned %d\n", ret);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->attr.max_wr = attr.max_wr;
++ resp->attr.max_sge = attr.max_sge;
++ resp->attr.srq_limit = attr.srq_limit;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_srq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_srq_cmd *cmd;
++ struct ibp_query_srq_resp *resp;
++ struct ib_srq *srq;
++ struct ib_srq_attr attr;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_srq_cmd *) hdr;
++ srq = (struct ib_srq *) cmd->srq;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_srq(srq, &attr);
++ if (ret) {
++ print_err("ib_query_srq returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_srq_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->attr.max_wr = attr.max_wr;
++ resp->attr.max_sge = attr.max_sge;
++ resp->attr.srq_limit = attr.srq_limit;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_destroy_srq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_queued_response_msg *msg;
++ struct ibp_destroy_srq_cmd *cmd;
++ struct ib_uobject *uobj;
++ struct ib_srq *srq;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_destroy_srq_cmd *) hdr;
++ srq = (struct ib_srq *) cmd->srq;
++ msg = (struct ibp_queued_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ uobj = srq->uobject;
++
++ ret = ib_destroy_srq(srq);
++ if (unlikely(ret == -EBUSY)) {
++ msleep(100);
++ ret = ib_destroy_srq(srq);
++ }
++ if (ret) {
++ print_err("ib_destroy_srq returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret);
++ return ibp_queue_response(client, msg);
++}
++
++static void ibp_ibqp_event(struct ib_event *ibevent, void *qp_context)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_client *client;
++ struct ibp_event *event;
++ struct ib_uobject *uobj;
++
++ event = kmalloc(sizeof(*event), GFP_ATOMIC);
++ if (!event) {
++ print_err("kmalloc failed\n");
++ return;
++ }
++
++ uobj = ibevent->element.qp->uobject;
++ ucontext = (struct ibp_ucontext *) uobj->user_handle;
++ client = ucontext->client;
++
++ event->client = client;
++ event->context = (uintptr_t) qp_context;
++ event->type = ibevent->event;
++ event->ibdev = ucontext->ibdev;
++
++ INIT_WORK(&event->work, ibp_async_event);
++ queue_work(client->workqueue, &event->work);
++}
++
++static int ibp_cmd_create_qp(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_create_qp_cmd *cmd;
++ struct ibp_create_qp_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_pd *pd;
++ struct ibp_qp *qp;
++ struct ib_qp_init_attr init_attr;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_create_qp_cmd *) hdr;
++ pd = (struct ib_pd *) cmd->pd;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_create_qp_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ qp = kzalloc(sizeof *qp, GFP_KERNEL);
++ if (!qp) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++ INIT_LIST_HEAD(&qp->mcast);
++
++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle;
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ goto send_resp;
++ }
++
++ memset(&init_attr, 0, sizeof(init_attr));
++
++ init_attr.send_cq = (struct ib_cq *) cmd->send_cq;
++ init_attr.recv_cq = (struct ib_cq *) cmd->recv_cq;
++ init_attr.srq = (struct ib_srq *) cmd->srq;
++ init_attr.xrcd = (struct ib_xrcd *) cmd->xrc_domain;
++ init_attr.cap.max_send_wr = cmd->cap.max_send_wr;
++ init_attr.cap.max_recv_wr = cmd->cap.max_recv_wr;
++ init_attr.cap.max_send_sge = cmd->cap.max_send_sge;
++ init_attr.cap.max_recv_sge = cmd->cap.max_recv_sge;
++ init_attr.cap.max_inline_data = cmd->cap.max_inline_data;
++ init_attr.sq_sig_type = cmd->sq_sig_type;
++ init_attr.qp_type = cmd->qp_type;
++ init_attr.create_flags = cmd->create_flags;
++ init_attr.port_num = cmd->port_num;
++
++ qp->ibqp = device->ib_dev->create_qp(pd, &init_attr, &udata);
++ if (IS_ERR(qp->ibqp)) {
++ ret = PTR_ERR(qp->ibqp);
++ print_err("ib_create_qp returned %d\n", ret);
++ /*
++ * Clear uobj's user_handle: ibp_destroy_uobj would otherwise try to
++ * list_del the uobj, which has NOT been added to any list yet.
++ */
++ uobj->user_handle = 0;
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ qp->ibqp->device = device->ib_dev;
++ qp->ibqp->pd = pd;
++ qp->ibqp->send_cq = init_attr.send_cq;
++ qp->ibqp->recv_cq = init_attr.recv_cq;
++ qp->ibqp->srq = init_attr.srq;
++ qp->ibqp->event_handler = ibp_ibqp_event;
++ qp->ibqp->qp_context = (void *) cmd->qp_context;
++ qp->ibqp->qp_type = init_attr.qp_type;
++
++ if (qp->ibqp->qp_type == IB_QPT_XRC_TGT) {
++ qp->ibqp->xrcd = init_attr.xrcd;
++ atomic_inc(&qp->ibqp->xrcd->usecnt);
++ } else {
++ qp->ibqp->xrcd = NULL;
++ qp->ibqp->real_qp = qp->ibqp;
++ }
++ atomic_set(&qp->ibqp->usecnt, 0);
++
++ atomic_inc(&pd->usecnt);
++ atomic_inc(&init_attr.send_cq->usecnt);
++ atomic_inc(&init_attr.recv_cq->usecnt);
++
++ if (init_attr.srq)
++ atomic_inc(&init_attr.srq->usecnt);
++
++ qp->ibqp->uobject = uobj;
++ uobj->object = qp;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->qp_list);
++ mutex_unlock(&ucontext->mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->qp = (uintptr_t) qp;
++ resp->qpn = qp->ibqp->qp_num;
++ resp->cap.max_send_wr = init_attr.cap.max_send_wr;
++ resp->cap.max_recv_wr = init_attr.cap.max_recv_wr;
++ resp->cap.max_send_sge = init_attr.cap.max_send_sge;
++ resp->cap.max_recv_sge = init_attr.cap.max_recv_sge;
++ resp->cap.max_inline_data = init_attr.cap.max_inline_data;
++
++send_resp:
++ if (ret)
++ kfree(qp);
++
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_modify_qp(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_modify_qp_cmd *cmd;
++ struct ibp_modify_qp_resp *resp;
++ struct ibp_qp *qp;
++ struct ib_qp_attr attr;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_modify_qp_cmd *) hdr;
++ qp = (struct ibp_qp *) cmd->qp;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_modify_qp_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ memset(&attr, 0, sizeof(attr));
++
++ attr.qp_state = cmd->qp_state;
++ attr.cur_qp_state = cmd->cur_qp_state;
++ attr.path_mtu = cmd->path_mtu;
++ attr.path_mig_state = cmd->path_mig_state;
++ attr.qkey = cmd->qkey;
++ attr.rq_psn = cmd->rq_psn;
++ attr.sq_psn = cmd->sq_psn;
++ attr.dest_qp_num = cmd->dest_qp_num;
++ attr.qp_access_flags = cmd->qp_access_flags;
++ attr.cap.max_send_wr = cmd->cap.max_send_wr;
++ attr.cap.max_recv_wr = cmd->cap.max_recv_wr;
++ attr.cap.max_send_sge = cmd->cap.max_send_sge;
++ attr.cap.max_recv_sge = cmd->cap.max_recv_sge;
++ attr.cap.max_inline_data = cmd->cap.max_inline_data;
++ attr.ah_attr.grh.dgid.global.subnet_prefix =
++ cmd->ah.grh.dgid_subnet_prefix;
++ attr.ah_attr.grh.dgid.global.interface_id =
++ cmd->ah.grh.dgid_interface_id;
++ attr.ah_attr.grh.flow_label = cmd->ah.grh.flow_label;
++ attr.ah_attr.grh.sgid_index = cmd->ah.grh.sgid_index;
++ attr.ah_attr.grh.hop_limit = cmd->ah.grh.hop_limit;
++ attr.ah_attr.grh.traffic_class = cmd->ah.grh.traffic_class;
++ attr.ah_attr.dlid = cmd->ah.dlid;
++ attr.ah_attr.sl = cmd->ah.sl;
++ attr.ah_attr.src_path_bits = cmd->ah.src_path_bits;
++ attr.ah_attr.static_rate = cmd->ah.static_rate;
++ attr.ah_attr.ah_flags = cmd->ah.ah_flags;
++ attr.ah_attr.port_num = cmd->ah.port_num;
++ attr.alt_ah_attr.grh.dgid.global.subnet_prefix =
++ cmd->alt_ah.grh.dgid_subnet_prefix;
++ attr.alt_ah_attr.grh.dgid.global.interface_id =
++ cmd->alt_ah.grh.dgid_interface_id;
++ attr.alt_ah_attr.grh.flow_label = cmd->alt_ah.grh.flow_label;
++ attr.alt_ah_attr.grh.sgid_index = cmd->alt_ah.grh.sgid_index;
++ attr.alt_ah_attr.grh.hop_limit = cmd->alt_ah.grh.hop_limit;
++ attr.alt_ah_attr.grh.traffic_class = cmd->alt_ah.grh.traffic_class;
++ attr.alt_ah_attr.dlid = cmd->alt_ah.dlid;
++ attr.alt_ah_attr.sl = cmd->alt_ah.sl;
++ attr.alt_ah_attr.src_path_bits = cmd->alt_ah.src_path_bits;
++ attr.alt_ah_attr.static_rate = cmd->alt_ah.static_rate;
++ attr.alt_ah_attr.ah_flags = cmd->alt_ah.ah_flags;
++ attr.alt_ah_attr.port_num = cmd->alt_ah.port_num;
++ attr.pkey_index = cmd->pkey_index;
++ attr.alt_pkey_index = cmd->alt_pkey_index;
++ attr.en_sqd_async_notify = cmd->en_sqd_async_notify;
++ attr.sq_draining = cmd->sq_draining;
++ attr.max_rd_atomic = cmd->max_rd_atomic;
++ attr.max_dest_rd_atomic = cmd->max_dest_rd_atomic;
++ attr.min_rnr_timer = cmd->min_rnr_timer;
++ attr.port_num = cmd->port_num;
++ attr.timeout = cmd->timeout;
++ attr.retry_cnt = cmd->retry_cnt;
++ attr.rnr_retry = cmd->rnr_retry;
++ attr.alt_port_num = cmd->alt_port_num;
++ attr.alt_timeout = cmd->alt_timeout;
++
++ ret = device->ib_dev->modify_qp(qp->ibqp, &attr, cmd->qp_attr_mask, &udata);
++ if (ret) {
++ print_err("ib_modify_qp returned %d\n", ret);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->cap.max_send_wr = attr.cap.max_send_wr;
++ resp->cap.max_recv_wr = attr.cap.max_recv_wr;
++ resp->cap.max_send_sge = attr.cap.max_send_sge;
++ resp->cap.max_recv_sge = attr.cap.max_recv_sge;
++ resp->cap.max_inline_data = attr.cap.max_inline_data;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_query_qp(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_query_qp_cmd *cmd;
++ struct ibp_query_qp_resp *resp;
++ struct ibp_qp *qp;
++ struct ib_qp_attr qp_attr;
++ struct ib_qp_init_attr qp_init_attr;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_query_qp_cmd *) hdr;
++ qp = (struct ibp_qp *) cmd->qp;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ret = ib_query_qp(qp->ibqp, &qp_attr, cmd->qp_attr_mask, &qp_init_attr);
++ if (ret) {
++ print_err("ib_query_qp returned %d\n", ret);
++ goto send_resp;
++ }
++
++ resp = (struct ibp_query_qp_resp *) msg->data;
++ len += sizeof(*resp);
++
++ resp->qp_state = qp_attr.qp_state;
++ resp->cur_qp_state = qp_attr.cur_qp_state;
++ resp->path_mtu = qp_attr.path_mtu;
++ resp->path_mig_state = qp_attr.path_mig_state;
++ resp->qkey = qp_attr.qkey;
++ resp->rq_psn = qp_attr.rq_psn;
++ resp->sq_psn = qp_attr.sq_psn;
++ resp->dest_qp_num = qp_attr.dest_qp_num;
++ resp->qp_access_flags = qp_attr.qp_access_flags;
++
++ resp->init_cap.max_send_wr = qp_init_attr.cap.max_send_wr;
++ resp->init_cap.max_recv_wr = qp_init_attr.cap.max_recv_wr;
++ resp->init_cap.max_send_sge = qp_init_attr.cap.max_send_sge;
++ resp->init_cap.max_recv_sge = qp_init_attr.cap.max_recv_sge;
++ resp->init_cap.max_inline_data = qp_init_attr.cap.max_inline_data;
++ resp->init_create_flags = qp_init_attr.create_flags;
++ resp->init_sq_sig_type = qp_init_attr.sq_sig_type;
++
++ resp->cap.max_send_wr = qp_attr.cap.max_send_wr;
++ resp->cap.max_recv_wr = qp_attr.cap.max_recv_wr;
++ resp->cap.max_send_sge = qp_attr.cap.max_send_sge;
++ resp->cap.max_recv_sge = qp_attr.cap.max_recv_sge;
++ resp->cap.max_inline_data = qp_attr.cap.max_inline_data;
++
++ resp->ah.grh.dgid_subnet_prefix =
++ qp_attr.ah_attr.grh.dgid.global.subnet_prefix;
++ resp->ah.grh.dgid_interface_id =
++ qp_attr.ah_attr.grh.dgid.global.interface_id;
++ resp->ah.grh.flow_label = qp_attr.ah_attr.grh.flow_label;
++ resp->ah.grh.sgid_index = qp_attr.ah_attr.grh.sgid_index;
++ resp->ah.grh.hop_limit = qp_attr.ah_attr.grh.hop_limit;
++ resp->ah.grh.traffic_class = qp_attr.ah_attr.grh.traffic_class;
++ resp->ah.dlid = qp_attr.ah_attr.dlid;
++ resp->ah.sl = qp_attr.ah_attr.sl;
++ resp->ah.src_path_bits = qp_attr.ah_attr.src_path_bits;
++ resp->ah.static_rate = qp_attr.ah_attr.static_rate;
++ resp->ah.ah_flags = qp_attr.ah_attr.ah_flags;
++ resp->ah.port_num = qp_attr.ah_attr.port_num;
++
++ resp->alt_ah.grh.dgid_subnet_prefix =
++ qp_attr.alt_ah_attr.grh.dgid.global.subnet_prefix;
++ resp->alt_ah.grh.dgid_interface_id =
++ qp_attr.alt_ah_attr.grh.dgid.global.interface_id;
++ resp->alt_ah.grh.flow_label = qp_attr.alt_ah_attr.grh.flow_label;
++ resp->alt_ah.grh.sgid_index = qp_attr.alt_ah_attr.grh.sgid_index;
++ resp->alt_ah.grh.hop_limit = qp_attr.alt_ah_attr.grh.hop_limit;
++ resp->alt_ah.grh.traffic_class = qp_attr.alt_ah_attr.grh.traffic_class;
++ resp->alt_ah.dlid = qp_attr.alt_ah_attr.dlid;
++ resp->alt_ah.sl = qp_attr.alt_ah_attr.sl;
++ resp->alt_ah.src_path_bits = qp_attr.alt_ah_attr.src_path_bits;
++ resp->alt_ah.static_rate = qp_attr.alt_ah_attr.static_rate;
++ resp->alt_ah.ah_flags = qp_attr.alt_ah_attr.ah_flags;
++ resp->alt_ah.port_num = qp_attr.alt_ah_attr.port_num;
++
++ resp->pkey_index = qp_attr.pkey_index;
++ resp->alt_pkey_index = qp_attr.alt_pkey_index;
++ resp->en_sqd_async_notify = qp_attr.en_sqd_async_notify;
++ resp->sq_draining = qp_attr.sq_draining;
++ resp->max_rd_atomic = qp_attr.max_rd_atomic;
++ resp->max_dest_rd_atomic = qp_attr.max_dest_rd_atomic;
++ resp->min_rnr_timer = qp_attr.min_rnr_timer;
++ resp->port_num = qp_attr.port_num;
++ resp->timeout = qp_attr.timeout;
++ resp->retry_cnt = qp_attr.retry_cnt;
++ resp->rnr_retry = qp_attr.rnr_retry;
++ resp->alt_port_num = qp_attr.alt_port_num;
++ resp->alt_timeout = qp_attr.alt_timeout;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_destroy_qp(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_queued_response_msg *msg;
++ struct ibp_destroy_qp_cmd *cmd;
++ struct ib_uobject *uobj;
++ struct ibp_qp *qp;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_destroy_qp_cmd *) hdr;
++ qp = (struct ibp_qp *) cmd->qp;
++ msg = (struct ibp_queued_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ uobj = qp->ibqp->uobject;
++
++ ret = ib_destroy_qp(qp->ibqp);
++ if (ret) {
++ print_err("ib_destroy_qp returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++ kfree(qp);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret);
++ return ibp_queue_response(client, msg);
++}
++
++static void ibp_ibcq_event(struct ib_event *ibevent, void *cq_context)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_client *client;
++ struct ibp_event *event;
++ struct ib_uobject *uobj;
++
++ event = kmalloc(sizeof(*event), GFP_ATOMIC);
++ if (!event) {
++ print_err("kmalloc failed\n");
++ return;
++ }
++
++ uobj = (struct ib_uobject *) ibevent->element.cq->uobject;
++ ucontext = (void *) uobj->user_handle;
++ client = ucontext->client;
++
++ event->client = client;
++ event->context = (uintptr_t) cq_context;
++ event->type = ibevent->event;
++ event->ibdev = ucontext->ibdev;
++
++ INIT_WORK(&event->work, ibp_async_event);
++ queue_work(client->workqueue, &event->work);
++}
++
++static void ibp_cq_comp(struct work_struct *work)
++{
++ struct ibp_comp *comp;
++ struct ibp_cq_comp_msg msg;
++
++ comp = container_of(work, struct ibp_comp, work);
++
++ IBP_INIT_MSG(NULL, &msg, sizeof(msg), CQ_COMP);
++
++ msg.data.cq_context = (uintptr_t) comp->cq_context;
++
++ ibp_send(comp->client->ep, &msg, sizeof(msg));
++
++ ibp_add_to_stack(c_stack, (void *) comp);
++}
++
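++/*
++ * CQ completion callback: allocation uses GFP_ATOMIC and the
++ * notification is forwarded to the client from the ibp_cq_comp work
++ * item above.
++ */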
++static void ibp_ibcq_comp(struct ib_cq *ibcq, void *cq_context)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_client *client;
++ struct ibp_comp *comp;
++
++ ucontext = (struct ibp_ucontext *) ibcq->uobject->user_handle;
++
++ if (ucontext->ibucontext->closing) {
++ print_dbg("ignoring cq completion, connection closing\n");
++ return;
++ }
++
++ comp = (struct ibp_comp *)
++ ibp_pull_from_stack(c_stack, sizeof(*comp), GFP_ATOMIC);
++ if (!comp) {
++ print_err("ibp_pull_from_stack failed\n");
++ return;
++ }
++
++ client = ucontext->client;
++
++ comp->client = client;
++ comp->cq_context = cq_context;
++
++ INIT_WORK(&comp->work, ibp_cq_comp);
++ queue_work(client->workqueue, &comp->work);
++}
++
++static int ibp_cmd_create_cq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_create_cq_cmd *cmd;
++ struct ibp_create_cq_resp *resp;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_udata udata;
++ struct ib_cq *cq;
++ size_t len;
++ size_t outlen;
++ int ret = 0;
++#ifdef MOFED
++ struct ib_cq_init_attr attr;
++#endif
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_create_cq_cmd *) hdr;
++ ucontext = (struct ibp_ucontext *) cmd->ucontext;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_create_cq_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ goto send_resp;
++ }
++
++#ifdef MOFED
++ memset(&attr, 0, sizeof(attr));
++ attr.cqe = cmd->cqe;
++ attr.comp_vector = cmd->vector;
++
++ cq = device->ib_dev->create_cq(device->ib_dev, &attr,
++ ucontext->ibucontext, &udata);
++#else
++ cq = device->ib_dev->create_cq(device->ib_dev, (int) cmd->cqe,
++ (int) cmd->vector,
++ ucontext->ibucontext, &udata);
++#endif
++ if (IS_ERR(cq)) {
++ ret = PTR_ERR(cq);
++ print_err("ib_create_cq returned %d\n", ret);
++ /*
++ * Clear uobj's user_handle: ibp_destroy_uobj would otherwise try to
++ * list_del the uobj, which has NOT been added to any list yet.
++ */
++ uobj->user_handle = 0;
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ cq->device = device->ib_dev;
++ cq->event_handler = ibp_ibcq_event;
++ cq->comp_handler = ibp_ibcq_comp;
++ cq->cq_context = (void *) cmd->cq_context;
++ atomic_set(&cq->usecnt, 0);
++
++ cq->uobject = uobj;
++ uobj->object = cq;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->cq_list);
++ mutex_unlock(&ucontext->mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->cq = (uintptr_t)cq;
++ resp->cqe = cq->cqe;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_destroy_cq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_queued_response_msg *msg;
++ struct ibp_destroy_cq_cmd *cmd;
++ struct ib_uobject *uobj;
++ struct ib_cq *cq;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_destroy_cq_cmd *) hdr;
++ cq = (struct ib_cq *) cmd->cq;
++ msg = (struct ibp_queued_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ uobj = cq->uobject;
++
++ ret = ib_destroy_cq(cq);
++ if (unlikely(ret == -EBUSY)) {
++ msleep(100);
++ ret = ib_destroy_cq(cq);
++ }
++ if (ret) {
++ print_err("ib_destroy_cq returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret);
++ return ibp_queue_response(client, msg);
++}
++
++static int ibp_cmd_resize_cq(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_resize_cq_cmd *cmd;
++ struct ibp_resize_cq_resp *resp;
++ struct ib_cq *cq;
++ struct ib_udata udata;
++ size_t len;
++ size_t outlen;
++ int ret;
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_resize_cq_cmd *) hdr;
++ cq = (struct ib_cq *) cmd->cq;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_resize_cq_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ ret = device->ib_dev->resize_cq ?
++ device->ib_dev->resize_cq(cq, (int) cmd->cqe, &udata) : -ENOSYS;
++ if (ret) {
++ print_err("ib_resize_cq returned %d\n", ret);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->cqe = cq->cqe;
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_reg_user_mr(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_reg_user_mr_cmd *cmd;
++ struct ibp_reg_user_mr_resp *resp;
++ struct ibp_mr *mr;
++ struct ibp_ucontext *ucontext;
++ struct ib_uobject *uobj;
++ struct ib_udata udata;
++ struct ib_pd *pd;
++ size_t len;
++ size_t outlen;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_reg_user_mr_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ resp = (struct ibp_reg_user_mr_resp *) msg->data;
++ len = hdr->length - sizeof(*cmd);
++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp);
++
++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen);
++
++ len = sizeof(*msg);
++
++ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
++ if (!mr) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++
++ pd = (struct ib_pd *) cmd->pd;
++
++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle;
++
++ mr->reg = ibp_reg_buf(ucontext, cmd->hca_va, cmd->scif_addr,
++ cmd->length, cmd->offset, cmd->access);
++ if (IS_ERR(mr->reg)) {
++ ret = PTR_ERR(mr->reg);
++ print_err("ibp_reg_buf returned %d\n", ret);
++ goto send_resp;
++ }
++
++ uobj = ibp_create_uobj(ucontext);
++ if (IS_ERR(uobj)) {
++ ret = PTR_ERR(uobj);
++ print_err("ibp_create_uobj returned %d\n", ret);
++ kref_put(&mr->reg->ref, ibp_dereg_buf);
++ goto send_resp;
++ }
++
++#ifdef MOFED
++ mr->ibmr = pd->device->reg_user_mr(pd, cmd->hca_va, cmd->length,
++ cmd->hca_va, cmd->access, &udata, 0);
++#else
++ mr->ibmr = pd->device->reg_user_mr(pd, cmd->hca_va, cmd->length,
++ cmd->hca_va, cmd->access, &udata);
++#endif
++ if (IS_ERR(mr->ibmr)) {
++ ret = PTR_ERR(mr->ibmr);
++ print_err("ib_reg_user_mr returned %d\n", ret);
++ kref_put(&mr->reg->ref, ibp_dereg_buf);
++ ibp_destroy_uobj(uobj);
++ goto send_resp;
++ }
++
++ mr->ibmr->pd = pd;
++ mr->ibmr->device = pd->device;
++ atomic_inc(&pd->usecnt);
++ atomic_set(&mr->ibmr->usecnt, 0);
++
++ mr->ibmr->uobject = uobj;
++ uobj->object = mr;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&uobj->list, &ucontext->ibucontext->mr_list);
++ mutex_unlock(&ucontext->mutex);
++
++ len += sizeof(*resp);
++ len += outlen - udata.outlen; /* add driver private data */
++
++ resp->mr = (uintptr_t) mr;
++ resp->lkey = mr->ibmr->lkey;
++ resp->rkey = mr->ibmr->rkey;
++
++send_resp:
++ if (ret)
++ kfree(mr);
++
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_dereg_mr(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_dereg_mr_cmd *cmd;
++ struct ibp_mr *mr;
++ struct ib_uobject *uobj;
++ size_t len;
++ int ret;
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_dereg_mr_cmd *) hdr;
++ mr = (struct ibp_mr *) cmd->mr;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ if (IS_NULL_OR_ERR(mr)) {
++ print_err("Invalid mr %p\n", mr);
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ uobj = mr->ibmr->uobject;
++
++ ret = ib_dereg_mr(mr->ibmr);
++ if (unlikely(ret == -EBUSY)) {
++ msleep(100);
++ ret = ib_dereg_mr(mr->ibmr);
++ }
++ if (ret) {
++ print_err("ib_dereg_mr returned %d\n", ret);
++ goto send_resp;
++ }
++
++ ibp_destroy_uobj(uobj);
++
++ if (mr->reg)
++ kref_put(&mr->reg->ref, ibp_dereg_buf);
++
++ kfree(mr);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_attach_mcast(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_attach_mcast_cmd *cmd;
++ struct ibp_mcast_entry *mcast;
++ struct ibp_ucontext *ucontext;
++ struct ibp_qp *qp;
++ union ib_gid gid;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_attach_mcast_cmd *) hdr;
++ qp = (struct ibp_qp *) cmd->qp;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ucontext = (struct ibp_ucontext *) qp->ibqp->uobject->user_handle;
++
++ mcast = kzalloc(sizeof *mcast, GFP_KERNEL);
++ if (!mcast) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++
++ gid.global.subnet_prefix = cmd->subnet_prefix;
++ gid.global.interface_id = cmd->interface_id;
++
++ ret = ib_attach_mcast(qp->ibqp, &gid, cmd->lid);
++ if (ret) {
++ print_err("ib_attach_mcast returned %d\n", ret);
++ kfree(mcast);
++ goto send_resp;
++ }
++
++ mcast->lid = cmd->lid;
++ mcast->gid.global.subnet_prefix = cmd->subnet_prefix;
++ mcast->gid.global.interface_id = cmd->interface_id;
++
++ mutex_lock(&ucontext->mutex);
++ list_add_tail(&mcast->list, &qp->mcast);
++ mutex_unlock(&ucontext->mutex);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_detach_mcast(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf)
++{
++ struct ibp_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_detach_mcast_cmd *cmd;
++ struct ibp_mcast_entry *mcast;
++ struct ibp_ucontext *ucontext;
++ struct ibp_qp *qp;
++ union ib_gid gid;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ device = (struct ibp_device *) hdr->device;
++ cmd = (struct ibp_detach_mcast_cmd *) hdr;
++ qp = (struct ibp_qp *) cmd->qp;
++ msg = (struct ibp_verb_response_msg *) tx_buf;
++ len = sizeof(*msg);
++
++ ucontext = (struct ibp_ucontext *) qp->ibqp->uobject->user_handle;
++
++ gid.global.subnet_prefix = cmd->subnet_prefix;
++ gid.global.interface_id = cmd->interface_id;
++
++ ret = ib_detach_mcast(qp->ibqp, &gid, cmd->lid);
++ if (ret) {
++ print_err("ib_detach_mcast returned %d\n", ret);
++ goto send_resp;
++ }
++
++ mutex_lock(&ucontext->mutex);
++ list_for_each_entry(mcast, &qp->mcast, list)
++ if (cmd->lid == mcast->lid &&
++		    !memcmp(&gid, mcast->gid.raw, sizeof(mcast->gid.raw))) {
++ list_del(&mcast->list);
++ kfree(mcast);
++ break;
++ }
++ mutex_unlock(&ucontext->mutex);
++
++send_resp:
++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret);
++ return ibp_send(client->ep, msg, len);
++}
++
++static void ibp_detach_mcast(struct ibp_qp *qp)
++{
++ struct ibp_mcast_entry *mcast, *tmp;
++
++ list_for_each_entry_safe(mcast, tmp, &qp->mcast, list) {
++ ib_detach_mcast(qp->ibqp, &mcast->gid, mcast->lid);
++ list_del(&mcast->list);
++ kfree(mcast);
++ }
++}
++
++static void ibp_destroy_ucontext(struct ibp_ucontext *ucontext)
++{
++ struct ib_ucontext *ibuctx;
++ struct ib_uobject *uobj;
++ struct ib_uobject *tmp;
++ struct ibp_mmap *mmap;
++ struct ibp_mmap *tmp_mmap;
++
++ ibuctx = ucontext->ibucontext;
++ if (!ibuctx)
++ goto out;
++
++ ibuctx->closing = 1;
++
++ synchronize_sched();
++
++ down_write(&list_rwsem);
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->ah_list, list) {
++ struct ib_ah *ibah = uobj->object;
++ ib_destroy_ah(ibah);
++ ibp_destroy_uobj(uobj);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->qp_list, list) {
++ struct ibp_qp *qp = uobj->object;
++ ibp_detach_mcast(qp);
++ ib_destroy_qp(qp->ibqp);
++ ibp_destroy_uobj(uobj);
++ kfree(qp);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->cq_list, list) {
++ struct ib_cq *ibcq = uobj->object;
++ ib_destroy_cq(ibcq);
++ ibp_destroy_uobj(uobj);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->srq_list, list) {
++ struct ib_srq *ibsrq = uobj->object;
++ ib_destroy_srq(ibsrq);
++ ibp_destroy_uobj(uobj);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->mr_list, list) {
++ struct ibp_mr *mr = uobj->object;
++ ib_dereg_mr(mr->ibmr);
++ ibp_destroy_uobj(uobj);
++ kref_put(&mr->reg->ref, ibp_dereg_buf);
++ kfree(mr);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->xrcd_list, list) {
++ struct ib_xrcd *ibxrcd = uobj->object;
++ ib_dealloc_xrcd(ibxrcd);
++ ibp_destroy_uobj(uobj);
++ }
++
++ list_for_each_entry_safe(uobj, tmp, &ibuctx->pd_list, list) {
++ struct ib_pd *ibpd = uobj->object;
++ ib_dealloc_pd(ibpd);
++ ibp_destroy_uobj(uobj);
++ }
++
++ up_write(&list_rwsem);
++
++ synchronize_sched();
++
++ ibuctx->device->dealloc_ucontext(ibuctx);
++out:
++ if (ucontext->ibdev)
++ ib_unregister_event_handler(&ucontext->event_handler);
++
++ list_for_each_entry_safe(mmap, tmp_mmap, &ucontext->mmap_list, list) {
++ ibp_scif_unregister(ucontext->client, mmap);
++
++ if (!IS_NULL_OR_ERR(current) && !IS_NULL_OR_ERR(current->mm)) {
++ MUNMAP(current->mm, mmap->vaddr, mmap->len);
++ }
++ kfree(mmap);
++ }
++
++ while (!RB_EMPTY_ROOT(&ucontext->reg_tree)) {
++ struct ibp_reg *reg;
++ reg = rb_entry(ucontext->reg_tree.rb_node, struct ibp_reg,
++ node);
++		kref_put(&reg->ref, ibp_dereg_buf);
++ }
++
++ ibp_put_device(ucontext->device);
++ fput(ucontext->filp);
++ kfree(ucontext);
++}
++
++void ibp_cleanup_ucontext(struct list_head *ucontext_list)
++{
++ struct ibp_ucontext *ucontext;
++ struct ibp_ucontext *next;
++
++ list_for_each_entry_safe(ucontext, next, ucontext_list, list)
++ ibp_destroy_ucontext(ucontext);
++}
++
++static int (*ibp_msg_table[])(struct ibp_client *client,
++ struct ibp_msg_header *hdr, void *tx_buf) = {
++ [IBP_VERB_GET_PROTOCOL_STATS] = ibp_cmd_not_supported,
++ [IBP_VERB_QUERY_DEVICE] = ibp_cmd_query_device,
++ [IBP_VERB_QUERY_PORT] = ibp_cmd_query_port,
++ [IBP_VERB_GET_LINK_LAYER] = ibp_cmd_not_supported,
++ [IBP_VERB_QUERY_GID] = ibp_cmd_query_gid,
++ [IBP_VERB_QUERY_PKEY] = ibp_cmd_query_pkey,
++ [IBP_VERB_MODIFY_DEVICE] = ibp_cmd_not_supported,
++ [IBP_VERB_MODIFY_PORT] = ibp_cmd_not_supported,
++ [IBP_VERB_ALLOC_UCONTEXT] = ibp_cmd_alloc_ucontext,
++ [IBP_VERB_DEALLOC_UCONTEXT] = ibp_cmd_dealloc_ucontext,
++ [IBP_VERB_REG_BUF] = ibp_cmd_reg_buf,
++ [IBP_VERB_DEREG_BUF] = ibp_cmd_dereg_buf,
++ [IBP_VERB_MMAP] = ibp_cmd_mmap,
++ [IBP_VERB_UNMMAP] = ibp_cmd_unmmap,
++ [IBP_VERB_ALLOC_PD] = ibp_cmd_alloc_pd,
++ [IBP_VERB_DEALLOC_PD] = ibp_cmd_dealloc_pd,
++ [IBP_VERB_CREATE_AH] = ibp_cmd_create_ah,
++ [IBP_VERB_MODIFY_AH] = ibp_cmd_not_supported,
++ [IBP_VERB_QUERY_AH] = ibp_cmd_query_ah,
++ [IBP_VERB_DESTROY_AH] = ibp_cmd_destroy_ah,
++ [IBP_VERB_CREATE_SRQ] = ibp_cmd_create_srq,
++ [IBP_VERB_MODIFY_SRQ] = ibp_cmd_modify_srq,
++ [IBP_VERB_QUERY_SRQ] = ibp_cmd_query_srq,
++ [IBP_VERB_DESTROY_SRQ] = ibp_cmd_destroy_srq,
++ [IBP_VERB_POST_SRQ_RECV] = ibp_cmd_not_supported,
++ [IBP_VERB_CREATE_QP] = ibp_cmd_create_qp,
++ [IBP_VERB_MODIFY_QP] = ibp_cmd_modify_qp,
++ [IBP_VERB_QUERY_QP] = ibp_cmd_query_qp,
++ [IBP_VERB_DESTROY_QP] = ibp_cmd_destroy_qp,
++ [IBP_VERB_POST_SEND] = ibp_cmd_not_supported,
++ [IBP_VERB_POST_RECV] = ibp_cmd_not_supported,
++ [IBP_VERB_CREATE_CQ] = ibp_cmd_create_cq,
++ [IBP_VERB_MODIFY_CQ] = ibp_cmd_not_supported,
++ [IBP_VERB_DESTROY_CQ] = ibp_cmd_destroy_cq,
++ [IBP_VERB_RESIZE_CQ] = ibp_cmd_resize_cq,
++ [IBP_VERB_POLL_CQ] = ibp_cmd_not_supported,
++ [IBP_VERB_PEEK_CQ] = ibp_cmd_not_supported,
++ [IBP_VERB_REQ_NOTIFY_CQ] = ibp_cmd_not_supported,
++ [IBP_VERB_REQ_NCOMP_NOTIF] = ibp_cmd_not_supported,
++ [IBP_VERB_GET_DMA_MR] = ibp_cmd_not_supported,
++ [IBP_VERB_REG_PHYS_MR] = ibp_cmd_not_supported,
++ [IBP_VERB_REG_USER_MR] = ibp_cmd_reg_user_mr,
++ [IBP_VERB_QUERY_MR] = ibp_cmd_not_supported,
++ [IBP_VERB_DEREG_MR] = ibp_cmd_dereg_mr,
++ [IBP_VERB_ALLOC_FAST_REG_MR] = ibp_cmd_not_supported,
++ [IBP_VERB_ALLOC_FAST_REG_PAGE_LIST] = ibp_cmd_not_supported,
++ [IBP_VERB_FREE_FAST_REG_PAGE_LIST] = ibp_cmd_not_supported,
++ [IBP_VERB_REREG_PHYS_MR] = ibp_cmd_not_supported,
++ [IBP_VERB_ALLOC_MW] = ibp_cmd_not_supported,
++ [IBP_VERB_BIND_MW] = ibp_cmd_not_supported,
++ [IBP_VERB_DEALLOC_MW] = ibp_cmd_not_supported,
++ [IBP_VERB_ALLOC_FMR] = ibp_cmd_not_supported,
++ [IBP_VERB_MAP_PHYS_FMR] = ibp_cmd_not_supported,
++ [IBP_VERB_UNMAP_FMR] = ibp_cmd_not_supported,
++ [IBP_VERB_DEALLOC_FMR] = ibp_cmd_not_supported,
++ [IBP_VERB_ATTACH_MCAST] = ibp_cmd_attach_mcast,
++ [IBP_VERB_DETACH_MCAST] = ibp_cmd_detach_mcast,
++ [IBP_VERB_PROCESS_MAD] = ibp_cmd_not_supported,
++ [IBP_VERB_ALLOC_XRCD] = ibp_cmd_not_supported,
++ [IBP_VERB_DEALLOC_XRCD] = ibp_cmd_not_supported,
++};
++
++int ibp_init(void)
++{
++ a_stack = ibp_init_stack();
++ c_stack = ibp_init_stack();
++ o_stack = ibp_init_stack();
++
++ if (!a_stack || !c_stack || !o_stack) {
++ print_err("stack allocation failed\n");
++ return -ENOMEM;
++ }
++
++ return 0;
++}
++
++void ibp_cleanup(void)
++{
++ ibp_clear_stack(a_stack);
++ ibp_clear_stack(c_stack);
++ ibp_clear_stack(o_stack);
++}
++
++int ibp_process_recvs(struct ibp_client *client, void *rx_buf, void *tx_buf)
++{
++ struct ibp_msg_header *hdr;
++ int ret;
++
++ hdr = (struct ibp_msg_header *) rx_buf;
++
++ for (;;) {
++ wait_event_interruptible(client->rx_wait_queue,
++ !atomic_xchg(&client->rx_in_process,
++ 1));
++
++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr));
++ if (ret)
++ goto err;
++
++		if (hdr->length < sizeof(*hdr) || hdr->length > MAX_MSG_SIZE) {
++			print_err("invalid message length %u, max %lu\n",
++				  hdr->length, MAX_MSG_SIZE);
++ ret = -EMSGSIZE;
++ goto err;
++ }
++
++ ret = ibp_recv(client->ep, hdr->data,
++ hdr->length - sizeof(*hdr));
++ if (ret)
++ goto err;
++
++ atomic_set(&client->rx_in_process, 0);
++ wake_up_interruptible(&client->rx_wait_queue);
++
++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) ||
++ !ibp_msg_table[hdr->opcode]) {
++ ibp_cmd_bad_request(client, hdr, tx_buf);
++ continue;
++ }
++
++ ret = ibp_msg_table[hdr->opcode](client, hdr, tx_buf);
++ if (ret)
++ goto err;
++ }
++
++ goto out;
++err:
++ atomic_set(&client->rx_in_process, 0);
++ wake_up_interruptible(&client->rx_wait_queue);
++
++out:
++ return ret;
++}
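The receive loop above frames every request as a fixed ibp_msg_header followed by an opcode-specific payload, then dispatches it through a function-pointer table indexed by opcode, rejecting anything out of range or left NULL in the table. A minimal, self-contained sketch of that dispatch pattern (the message layout and handler names here are illustrative, not the driver's wire ABI):

#include <stdio.h>

struct msg {
	unsigned int opcode;
	unsigned int length;
};

static int handle_ping(struct msg *m)  { printf("ping, len %u\n", m->length); return 0; }
static int handle_query(struct msg *m) { printf("query, len %u\n", m->length); return 0; }

/* Sparse table: slots for unsupported opcodes stay NULL and are rejected. */
static int (*handlers[])(struct msg *) = {
	[1] = handle_ping,
	[5] = handle_query,
};

static int dispatch(struct msg *m)
{
	if (m->opcode >= sizeof(handlers) / sizeof(handlers[0]) ||
	    !handlers[m->opcode])
		return -1;	/* corresponds to the bad-request path */
	return handlers[m->opcode](m);
}

int main(void)
{
	struct msg m = { .opcode = 1, .length = 64 };
	return dispatch(&m);
}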
+diff -urN a6/drivers/infiniband/ibp/drv/stack.c a7/drivers/infiniband/ibp/drv/stack.c
+--- a6/drivers/infiniband/ibp/drv/stack.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/stack.c 2015-02-23 10:01:30.292769309 -0800
+@@ -0,0 +1,102 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "common.h"
++#include "stack.h"
++
++static DEFINE_SPINLOCK(stack_lock);
++
++struct ibp_stack *ibp_init_stack(void)
++{
++ struct ibp_stack *s;
++
++ s = kzalloc(sizeof(struct ibp_stack), GFP_KERNEL);
++ if (s)
++ s->top_pointer = &s->base[0];
++
++ return s;
++}
++
++void ibp_clear_stack(struct ibp_stack *s)
++{
++ while (s->top_pointer != s->base) {
++ s->top_pointer--;
++ kfree(*s->top_pointer);
++ }
++ kfree(s);
++}
++
++void ibp_add_to_stack(struct ibp_stack *s, void *p)
++{
++ spin_lock_irq(&stack_lock);
++
++ if (unlikely(++s->sample_cnt == STACK_GC_SAMPLE)) {
++ s->sample_cnt = 0;
++ if (unlikely(++s->gc_cnt == STACK_GC_RATE)) {
++ s->gc_cnt = 0;
++ while (s->current_count > s->high_water_mark) {
++ s->top_pointer--;
++ s->current_count--;
++ kfree(*s->top_pointer);
++ }
++ } else if (s->high_water_mark < s->current_count)
++ s->high_water_mark = s->current_count;
++ }
++
++ if (likely(s->current_count < MAX_STACK)) {
++ *s->top_pointer++ = p;
++ s->current_count++;
++ } else
++ kfree(p);
++
++ spin_unlock_irq(&stack_lock);
++}
++
++void *ibp_pull_from_stack(struct ibp_stack *s, size_t size, int gfp_mask)
++{
++ void *p;
++ unsigned long flag;
++
++	spin_lock_irqsave(&stack_lock, flag);
++
++	if (s->top_pointer == s->base) {
++		/* avoid a possibly sleeping allocation under the spinlock */
++		spin_unlock_irqrestore(&stack_lock, flag);
++		return kmalloc(size, gfp_mask);
++	}
++	s->current_count--;
++	s->top_pointer--;
++	p = *s->top_pointer;
++	spin_unlock_irqrestore(&stack_lock, flag);
++
++ return p;
++}
+diff -urN a6/drivers/infiniband/ibp/drv/stack.h a7/drivers/infiniband/ibp/drv/stack.h
+--- a6/drivers/infiniband/ibp/drv/stack.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/drv/stack.h 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,57 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef _IBP_STACK_H_
++#define _IBP_STACK_H_
++
++#define STACK_GC_SAMPLE 5
++#define STACK_GC_RATE 10
++#define MAX_STACK 128
++
++struct ibp_stack {
++ int current_count;
++ int high_water_mark;
++ int gc_cnt;
++ int sample_cnt;
++ void **top_pointer;
++ void *base[MAX_STACK+1];
++};
++
++struct ibp_stack *ibp_init_stack(void);
++
++void ibp_add_to_stack(struct ibp_stack *s, void *p);
++
++void *ibp_pull_from_stack(struct ibp_stack *s, size_t size, int gfp_mask);
++
++void ibp_clear_stack(struct ibp_stack *s);
++
++#endif /* _IBP_STACK_H_ */
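The ibp_stack declared above is a small free-buffer cache: ibp_pull_from_stack() returns a previously cached buffer when one is available and falls back to kmalloc() otherwise, while ibp_add_to_stack() either caches the buffer for reuse or frees it once MAX_STACK entries are held, with the periodic high-water-mark sweep trimming the pool back down. Because pulled buffers are handed back regardless of the size argument, a given stack should only ever hold buffers of one size. A minimal sketch of the intended call pattern, assuming a module-local stack of MAX_MSG_SIZE buffers (the helper names are illustrative):

/* Sketch only: recycle fixed-size message buffers through one ibp_stack. */
static struct ibp_stack *msg_stack;	/* allocated once via ibp_init_stack() */

static void *get_msg_buf(void)
{
	/* Reuses a cached buffer if present, otherwise kmalloc()s a new one. */
	return ibp_pull_from_stack(msg_stack, MAX_MSG_SIZE, GFP_KERNEL);
}

static void put_msg_buf(void *buf)
{
	/* Caches the buffer for later calls, or kfree()s it if the stack is full. */
	ibp_add_to_stack(msg_stack, buf);
}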
+diff -urN a6/drivers/infiniband/ibp/Kconfig a7/drivers/infiniband/ibp/Kconfig
+--- a6/drivers/infiniband/ibp/Kconfig 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/Kconfig 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,16 @@
++config IBP_SERVER
++ tristate "CCL Direct IB Server drivers"
++ ---help---
++ Server drivers for CCL Direct including server proxies for
++ hw drivers, and core drivers ib_sa and ib_cm.
++	  Also included is a kernel mode test module.
++
++ To compile this driver as a module, choose M here.
++ If unsure, say N.
++
++config IBP_DEBUG
++ bool "CCL Direct debugging"
++ depends on IBP_SERVER
++ default y
++ ---help---
++ This option causes debug code to be compiled into the CCL Direct drivers.
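With both options enabled, the corresponding kernel configuration fragment would typically read as follows (illustrative; the values follow directly from the tristate and bool types above):

CONFIG_IBP_SERVER=m
CONFIG_IBP_DEBUG=y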
+diff -urN a6/drivers/infiniband/ibp/Makefile a7/drivers/infiniband/ibp/Makefile
+--- a6/drivers/infiniband/ibp/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/Makefile 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,3 @@
++obj-$(CONFIG_IBP_SERVER) += drv/
++obj-$(CONFIG_IBP_SERVER) += cm/
++obj-$(CONFIG_IBP_SERVER) += sa/
+diff -urN a6/drivers/infiniband/ibp/sa/common.h a7/drivers/infiniband/ibp/sa/common.h
+--- a6/drivers/infiniband/ibp/sa/common.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/common.h 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,108 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef COMMON_H
++#define COMMON_H
++
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <linux/types.h>
++#include <linux/slab.h>
++#include <linux/poll.h>
++#include <linux/mman.h>
++#include <linux/pci.h>
++#include <linux/net.h>
++#include <rdma/ib_verbs.h>
++#include <modules/scif.h>
++
++#define DRV_DESC "CCL Direct SA " DRV_ROLE
++#define DRV_VERSION "1.0"
++#define DRV_BASE "ibp_sa"
++#define PFX DRV_BASE "_"
++#define DRV_PFX DRV_NAME ": "
++
++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation"
++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n"
++
++#define MODULE_PARAM(name, var, type, value, desc) \
++ type var = value; \
++ module_param_named(name, var, type, 0644); \
++ MODULE_PARM_DESC(name, desc)
++
++#ifdef IBP_DEBUG
++extern int debug_level;
++#endif
++
++enum {
++ IBP_DEBUG_NONE,
++ IBP_DEBUG_TARGETED,
++ IBP_DEBUG_VERBOSE,
++};
++
++#define _PRINTK(l, f, arg...) \
++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg)
++
++#ifdef IBP_DEBUG
++#define PRINTK(dbg, l, f, arg...) \
++ do { \
++ if (debug_level >= dbg) \
++ printk(l DRV_PFX "%s(%d) " f, \
++ __func__, __LINE__, ##arg); \
++ } while (0)
++#else
++#define PRINTK(dbg, l, f, arg...) do { } while (0)
++#endif
++
++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg)
++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#define print_info(f, arg...) pr_info(f, ##arg)
++
++#if 0
++#define FORCED_FUNCTION_TRACING
++#endif
++
++#ifdef FORCED_FUNCTION_TRACING
++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg)
++#else
++#define print_trace(f, arg...) PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg)
++#endif
++
++#ifndef IBP_SA_PORT /* unique scif port for this service */
++#define IBP_SA_PORT SCIF_OFED_PORT_4
++#endif
++
++#define IS_NULL_OR_ERR(p) (!(p) || IS_ERR_VALUE((unsigned long)p))
++
++int ibp_send(scif_epd_t ep, void *buf, size_t len);
++int ibp_recv(scif_epd_t ep, void *buf, size_t len);
++
++#endif /* COMMON_H */
+diff -urN a6/drivers/infiniband/ibp/sa/ibp-abi.h a7/drivers/infiniband/ibp/sa/ibp-abi.h
+--- a6/drivers/infiniband/ibp/sa/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/ibp-abi.h 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,101 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_ABI_H
++#define IBP_ABI_H
++
++#include <linux/types.h>
++#include <rdma/ib_verbs.h>
++#include <rdma/ib_sa.h>
++
++/* Increment this value if any changes break compatibility. */
++#define IBP_CM_ABI_VERSION 1
++#define MAX_MSG_SIZE PAGE_SIZE
++
++/* Client to server message enums. */
++enum {
++ /* have callback */
++ IBP_SA_PATH_REC_GET,
++ IBP_SA_JOIN_MCAST,
++
++ /* no callback */
++ IBP_SA_FREE_MCAST,
++ IBP_SA_GET_MCMEMBER_REC,
++ IBP_SA_REGISTER_CLIENT,
++ IBP_SA_UNREGISTER_CLIENT,
++ IBP_SA_CANCEL_QUERY,
++ IBP_INIT_AH_FROM_PATH,
++ IBP_INIT_AH_FROM_MCMEMBER,
++#if 0
++ /* not used or local to client */
++ IBP_SA_SERVICE_REC_QUERY,
++ IBP_SA_UNPACK_PATH,
++#endif
++};
++
++/* Server to client message enums. */
++enum {
++ IBP_CALLBACK,
++ IBP_RESPONSE,
++};
++
++enum {
++ PATH_REC_GET_CB,
++ JOIN_MCAST_CB,
++};
++
++/*
++ * Make sure that all structs defined in this file are laid out to pack
++ * the same way on different architectures to avoid incompatibility.
++ *
++ * Specifically:
++ * - Do not use pointer types -- pass pointers in a u64 instead.
++ * - Make sure that any structure larger than 4 bytes is padded
++ * to a multiple of 8 bytes; otherwise the structure size may
++ * be different between architectures.
++ */
++
++struct ibp_msg_header { /* present in all messages */
++ u32 opcode;
++ u32 length;
++ u32 status;
++ u32 reserved;
++ u64 request;
++ u64 data[0];
++};
++
++struct ibp_verb_response_msg {
++ struct ibp_msg_header header;
++ u64 data[0];
++};
++
++#endif /* IBP_ABI_H */
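The layout rules spelled out in the comment above -- no raw pointers on the wire, and explicit padding so every structure larger than 4 bytes is a multiple of 8 bytes -- can be illustrated with a hypothetical command structure; this is not one of the driver's real messages:

/* Illustrative only: a wire structure that follows both packing rules. */
struct example_wire_cmd {
	struct ibp_msg_header	header;
	__u64			object;		/* handle passed as a u64, never as a pointer */
	__u32			flags;
	__u8			port_num;
	__u8			reserved[3];	/* explicit pad: 8 + 4 + 1 + 3 = 16 bytes after the header */
};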
+diff -urN a6/drivers/infiniband/ibp/sa/ibp_exports.h a7/drivers/infiniband/ibp/sa/ibp_exports.h
+--- a6/drivers/infiniband/ibp/sa/ibp_exports.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/ibp_exports.h 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,49 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_EXPORTS_H
++#define IBP_EXPORTS_H
++
++#include <rdma/ib_verbs.h>
++
++/*
++ ibp_resolve_ib_device - Return the host ib_device handle
++  @ibdev: Card IB device
++
++ Upper level drivers may require the host ib_device handle associated
++ with the card ib_device. This routine resolves the card ib_device to
++  the corresponding host ib_device handle. A value of 0 is returned if
++ no match was found.
++*/
++u64 ibp_resolve_ib_device(struct ib_device *ibdev);
++
++#endif /* IBP_EXPORTS_H */
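A short sketch of how an upper-level driver might use this export, treating a returned 0 as "no host device behind this card device" (the caller function is hypothetical):

/* Sketch only: map a card-side ib_device to its host-side handle. */
static int example_lookup_host_device(struct ib_device *card_ibdev, u64 *handle)
{
	*handle = ibp_resolve_ib_device(card_ibdev);
	if (!*handle)
		return -ENODEV;	/* no proxied host ib_device matches */
	return 0;
}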
+diff -urN a6/drivers/infiniband/ibp/sa/Makefile a7/drivers/infiniband/ibp/sa/Makefile
+--- a6/drivers/infiniband/ibp/sa/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/Makefile 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,21 @@
++KDIR ?= /lib/modules/`uname -r`/build
++
++obj-$(CONFIG_IBP_SERVER) += ibp_sa_server.o
++
++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG
++
++ibp_sa_server-y := server.o \
++ server_msg.o \
++ sa_server_msg.o
++
++default:
++ $(MAKE) -C $(KDIR) M=`pwd`
++
++modules_install:
++ $(MAKE) -C $(KDIR) M=`pwd` modules_install
++
++clean:
++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions
++
++unix:
++ dos2unix *.[ch] Kconfig Makefile
+diff -urN a6/drivers/infiniband/ibp/sa/sa_ibp_abi.h a7/drivers/infiniband/ibp/sa/sa_ibp_abi.h
+--- a6/drivers/infiniband/ibp/sa/sa_ibp_abi.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/sa_ibp_abi.h 2015-02-23 10:01:30.293769309 -0800
+@@ -0,0 +1,251 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef SA_IBP_ABI_H
++#define SA_IBP_ABI_H
++
++#include <linux/types.h>
++#include <rdma/ib_verbs.h>
++#include <rdma/ib_sa.h>
++
++/* Increment this value if any changes break compatibility. */
++#define IBP_SA_ABI_VERSION 1
++
++/*
++ * Make sure that all structs defined in this file are laid out to pack
++ * the same way on different architectures to avoid incompatibility.
++ *
++ * Specifically:
++ * - Do not use pointer types -- pass pointers in a u64 instead.
++ * - Make sure that any structure larger than 4 bytes is padded
++ * to a multiple of 8 bytes; otherwise the structure size may
++ * be different between architectures.
++ */
++
++struct cb_header {
++ u64 cb_type;
++ u64 status;
++ u64 ibp_client;
++};
++
++struct ibp_sa_path_rec {
++ __be64 service_id;
++ u64 dgid_prefix;
++ u64 dgid_id;
++ u64 sgid_prefix;
++ u64 sgid_id;
++ __be16 dlid;
++ __be16 slid;
++ u32 raw_traffic;
++ __be32 flow_label;
++ u8 hop_limit;
++ u8 traffic_class;
++ u32 reversible;
++ u8 numb_path;
++ __be16 pkey;
++ __be16 qos_class;
++ u8 sl;
++ u8 mtu_selector;
++ u8 mtu;
++ u8 rate_selector;
++ u8 rate;
++ u8 packet_life_time_selector;
++ u8 packet_life_time;
++ u8 preference;
++};
++
++struct path_rec_data {
++ u64 entry;
++ u64 query;
++ struct ibp_sa_path_rec resp;
++ u8 reserved[1];
++};
++
++struct ibp_sa_mcmember_rec {
++ u64 mgid_prefix;
++ u64 mgid_id;
++ u64 port_gid_prefix;
++ u64 port_gid_id;
++ __be32 qkey;
++ __be16 mlid;
++ u8 mtu_selector;
++ u8 mtu;
++ u8 traffic_class;
++ __be16 pkey;
++ u8 rate_selector;
++ u8 rate;
++ u8 packet_life_time_selector;
++ u8 packet_life_time;
++ u8 sl;
++ __be32 flow_label;
++ u8 hop_limit;
++ u8 scope;
++ u8 join_state;
++ u64 proxy_join;
++ u8 reserved[1];
++};
++
++struct mc_join_data {
++ u64 mcentry;
++ u64 ibp_mcast;
++ struct ibp_sa_mcmember_rec rec;
++};
++
++struct callback_msg {
++ struct cb_header header;
++ union {
++ struct path_rec_data path_rec;
++ struct mc_join_data mc_join;
++ } u;
++};
++
++struct ibp_callback_msg {
++ struct ibp_msg_header header;
++ u8 data[0];
++};
++
++struct ibp_sa_path_rec_get_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_client;
++ u64 entry;
++ u64 query;
++ u64 device;
++ u64 port_num;
++ u64 comp_mask;
++ u64 timeout_ms;
++ u64 gfp_mask;
++ struct ibp_sa_path_rec rec;
++};
++
++struct ibp_sa_path_rec_get_resp {
++ u64 sa_query;
++ u64 query_id;
++};
++
++struct ibp_sa_register_client_cmd {
++ struct ibp_msg_header header;
++};
++
++struct ibp_sa_register_client_resp {
++ u64 ibp_client;
++};
++
++struct ibp_sa_unregister_client_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_client;
++};
++
++struct ibp_sa_cancel_query_cmd {
++ struct ibp_msg_header header;
++ u64 id;
++ u64 client;
++};
++
++struct ibp_init_ah_from_path_cmd {
++ struct ibp_msg_header header;
++ u64 device;
++ u8 port_num;
++ struct ibp_sa_path_rec rec;
++};
++
++struct ibp_ah_attr {
++ u64 dgid_prefix;
++ u64 dgid_id;
++ u32 flow_label;
++ u8 sgid_index;
++ u8 hop_limit;
++ u8 traffic_class;
++ u16 dlid;
++ u8 sl;
++ u8 src_path_bits;
++ u8 static_rate;
++ u8 ah_flags;
++ u8 port_num;
++};
++struct ibp_init_ah_from_path_resp {
++ struct ibp_ah_attr attr;
++};
++
++struct ibp_init_ah_from_mcmember_cmd {
++ struct ibp_msg_header header;
++ u64 device;
++ u8 port_num;
++ struct ib_sa_mcmember_rec rec;
++};
++
++struct ibp_init_ah_from_mcmember_resp {
++ struct ibp_ah_attr attr;
++};
++
++struct ibp_sa_join_multicast_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_client;
++ u64 mcentry;
++ u64 device;
++ u8 port_num;
++ u64 comp_mask;
++ u64 gfp_mask;
++ struct ib_sa_mcmember_rec rec;
++};
++
++struct ibp_sa_join_multicast_resp {
++ u64 ibp_mcast;
++};
++
++struct ibp_sa_free_multicast_cmd {
++ struct ibp_msg_header header;
++ u64 ibp_mcast;
++};
++
++struct ibp_sa_get_mcmember_rec_cmd {
++ struct ibp_msg_header header;
++ u64 device;
++ u8 port_num;
++ u64 subnet_prefix;
++ u64 interface_id;
++};
++
++struct ibp_sa_get_mcmember_rec_resp {
++ struct ib_sa_mcmember_rec rec;
++};
++
++struct ibp_sa_event {
++ enum ib_event_type event_type;
++ u64 ibp_client;
++ union {
++ __u32 send_status;
++ } u;
++ u64 data_length;
++ u8 data[0];
++};
++
++#endif /* SA_IBP_ABI_H */
+diff -urN a6/drivers/infiniband/ibp/sa/sa_server_msg.c a7/drivers/infiniband/ibp/sa/sa_server_msg.c
+--- a6/drivers/infiniband/ibp/sa/sa_server_msg.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/sa_server_msg.c 2015-02-23 10:01:30.294769309 -0800
+@@ -0,0 +1,970 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++
++LIST_HEAD(sa_entry_list);
++LIST_HEAD(query_list);
++LIST_HEAD(mcast_list);
++
++static void free_query_list(struct sa_query_entry *entry)
++{
++ if (entry) {
++ down_write(&list_rwsem);
++
++ list_del(&entry->list);
++
++ up_write(&list_rwsem);
++ }
++}
++
++static struct sa_query_entry *add_query_list(struct ibp_client *client)
++{
++ struct sa_query_entry *entry;
++
++ print_trace("in\n");
++
++ entry = kzalloc(sizeof(struct sa_query_entry), GFP_KERNEL);
++ if (!entry) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(-ENOMEM);
++ }
++
++ entry->ibp_client = client;
++
++ down_write(&list_rwsem);
++
++ list_add(&entry->list, &query_list);
++
++ up_write(&list_rwsem);
++
++ return entry;
++}
++
++static struct sa_query_entry *find_query_entry(struct ib_sa_client *client)
++{
++ struct sa_query_entry *query = NULL;
++
++ down_read(&list_rwsem);
++
++ list_for_each_entry(query, &query_list, list)
++ if (query->sa_client == client)
++ goto out;
++
++ print_err("Could not find sa_query_entry\n");
++
++out:
++ up_read(&list_rwsem);
++
++ return query;
++}
++
++static struct sa_entry *find_sa_entry(struct ib_sa_client *ib_client)
++{
++ struct sa_entry *entry = NULL;
++
++ down_read(&list_rwsem);
++
++ list_for_each_entry(entry, &sa_entry_list, list)
++ if (&entry->ib_client == ib_client)
++ goto out;
++
++ print_err("Could not find sa_entry\n");
++
++out:
++ up_read(&list_rwsem);
++
++ return entry;
++}
++
++/* Translate from server side "true" SA client to proxied SA client on the
++ * client
++ */
++static struct ib_sa_client *find_ibp_client(struct ibp_client *ibp_client)
++{
++ struct sa_entry *entry;
++ struct ib_sa_client *client = NULL;
++
++ down_read(&list_rwsem);
++
++ list_for_each_entry(entry, &sa_entry_list, list)
++ if (entry->client == ibp_client) {
++ client = &entry->ib_client;
++ goto out;
++ }
++
++ print_err("Could not find proxied sa_client %p\n", ibp_client);
++
++out:
++ up_read(&list_rwsem);
++
++ return client;
++}
++
++int ibp_cmd_sa_register_client(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct sa_entry *entry;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_sa_register_client_resp *resp;
++ size_t len;
++ int status = 0;
++ int ret;
++
++ print_trace("in\n");
++
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = kzalloc((sizeof(struct sa_entry)), GFP_KERNEL);
++ if (!entry) {
++ print_err("kzalloc failed\n");
++ status = -ENOMEM;
++ goto send_resp;
++ }
++
++ entry->client = ibp_client;
++
++ len += sizeof(*resp);
++
++ resp = (struct ibp_sa_register_client_resp *) msg->data;
++
++ resp->ibp_client = (u64) &entry->ib_client;
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, status);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret) {
++ kfree(entry);
++ print_err("ibp_send returned %d\n", ret);
++ return ret;
++ }
++ if (status)
++ return status;
++
++ ib_sa_register_client(&entry->ib_client);
++
++ down_write(&list_rwsem);
++ list_add(&entry->list, &sa_entry_list);
++ up_write(&list_rwsem);
++
++ return 0;
++}
++
++int ibp_cmd_sa_unregister_client(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct sa_entry *entry;
++ struct ibp_sa_unregister_client_cmd *cmd;
++ struct ibp_verb_response_msg *msg;
++ struct ib_sa_client *client;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_unregister_client_cmd *) hdr;
++ client = (struct ib_sa_client *) cmd->ibp_client;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = find_sa_entry(client);
++ if (!entry) {
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ down_write(&list_rwsem);
++ list_del(&entry->list);
++ up_write(&list_rwsem);
++
++ ib_sa_unregister_client(&entry->ib_client);
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++int ibp_cmd_sa_cancel_query(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct sa_query_entry *entry;
++ struct ibp_sa_cancel_query_cmd *cmd;
++ struct ibp_verb_response_msg *msg;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_cancel_query_cmd *) hdr;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = find_query_entry((struct ib_sa_client *) cmd->client);
++ if (!entry) {
++ ret = -EINVAL;
++ goto send_resp;
++ }
++
++ ib_sa_cancel_query(cmd->id, entry->query);
++
++ free_query_list(entry);
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++int ibp_cmd_init_ah_from_path(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ib_device *device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_init_ah_from_path_cmd *cmd;
++ struct ibp_init_ah_from_path_resp *resp;
++ struct ib_sa_path_rec rec;
++ struct ib_ah_attr attr;
++ size_t len;
++ u8 port_num;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_init_ah_from_path_cmd *) hdr;
++ device = (struct ib_device *) cmd->device;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ port_num = cmd->port_num;
++
++ rec.service_id = cmd->rec.service_id;
++ rec.dgid.global.interface_id
++ = cmd->rec.dgid_id;
++ rec.dgid.global.subnet_prefix
++ = cmd->rec.dgid_prefix;
++ rec.sgid.global.interface_id
++ = cmd->rec.sgid_id;
++ rec.sgid.global.subnet_prefix
++ = cmd->rec.sgid_prefix;
++ rec.dlid = cmd->rec.dlid;
++ rec.slid = cmd->rec.slid;
++ rec.raw_traffic = cmd->rec.raw_traffic;
++ rec.flow_label = cmd->rec.flow_label;
++ rec.hop_limit = cmd->rec.hop_limit;
++ rec.traffic_class = cmd->rec.traffic_class;
++ rec.reversible = cmd->rec.reversible;
++ rec.numb_path = cmd->rec.numb_path;
++ rec.pkey = cmd->rec.pkey;
++ rec.qos_class = cmd->rec.qos_class;
++ rec.sl = cmd->rec.sl;
++ rec.mtu_selector = cmd->rec.mtu_selector;
++ rec.mtu = cmd->rec.mtu;
++ rec.rate_selector = cmd->rec.rate_selector;
++ rec.rate = cmd->rec.rate;
++ rec.packet_life_time_selector
++ = cmd->rec.packet_life_time_selector;
++ rec.packet_life_time = cmd->rec.packet_life_time;
++ rec.preference = cmd->rec.preference;
++
++ ret = ib_init_ah_from_path(device, port_num, &rec, &attr);
++ if (ret)
++ print_err("init_ah_from_path returned %d\n", ret);
++
++ len += sizeof(*resp);
++ resp = (struct ibp_init_ah_from_path_resp *) msg->data;
++
++ resp->attr.dgid_prefix = attr.grh.dgid.global.subnet_prefix;
++ resp->attr.dgid_id = attr.grh.dgid.global.interface_id;
++ resp->attr.flow_label = attr.grh.flow_label;
++ resp->attr.sgid_index = attr.grh.sgid_index;
++ resp->attr.hop_limit = attr.grh.hop_limit;
++ resp->attr.traffic_class
++ = attr.grh.traffic_class;
++ resp->attr.dlid = attr.dlid;
++ resp->attr.sl = attr.sl;
++ resp->attr.src_path_bits
++ = attr.src_path_bits;
++ resp->attr.static_rate = attr.static_rate;
++ resp->attr.ah_flags = attr.ah_flags;
++ resp->attr.port_num = attr.port_num;
++
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++int ibp_cmd_init_ah_from_mcmember(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ib_device *device;
++ struct ibp_init_ah_from_mcmember_cmd *cmd;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_init_ah_from_mcmember_resp *resp;
++ struct ib_sa_mcmember_rec rec;
++ struct ib_ah_attr attr;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_init_ah_from_mcmember_cmd *) hdr;
++ device = (struct ib_device *) cmd->device;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ rec.mgid.global.subnet_prefix = cmd->rec.mgid.global.subnet_prefix;
++ rec.mgid.global.interface_id = cmd->rec.mgid.global.interface_id;
++ rec.port_gid.global.subnet_prefix
++ = cmd->rec.port_gid.global.subnet_prefix;
++ rec.port_gid.global.interface_id
++ = cmd->rec.port_gid.global.interface_id;
++ rec.qkey = cmd->rec.qkey;
++ rec.mlid = cmd->rec.mlid;
++ rec.mtu_selector = cmd->rec.mtu_selector;
++ rec.mtu = cmd->rec.mtu;
++ rec.traffic_class = cmd->rec.traffic_class;
++ rec.pkey = cmd->rec.pkey;
++ rec.rate_selector = cmd->rec.rate_selector;
++ rec.rate = cmd->rec.rate;
++ rec.packet_life_time_selector
++ = cmd->rec.packet_life_time_selector;
++ rec.packet_life_time = cmd->rec.packet_life_time;
++ rec.sl = cmd->rec.sl;
++ rec.flow_label = cmd->rec.flow_label;
++ rec.hop_limit = cmd->rec.hop_limit;
++ rec.scope = cmd->rec.scope;
++ rec.join_state = cmd->rec.join_state;
++ rec.proxy_join = cmd->rec.proxy_join;
++
++ ret = ib_init_ah_from_mcmember(device, cmd->port_num, &rec, &attr);
++ if (ret) {
++ print_err("ib_init_ah_from_mcmember returned %d\n", ret);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++ resp = (struct ibp_init_ah_from_mcmember_resp *) msg->data;
++
++ resp->attr.dgid_prefix = attr.grh.dgid.global.subnet_prefix;
++ resp->attr.dgid_id = attr.grh.dgid.global.interface_id;
++ resp->attr.flow_label = attr.grh.flow_label;
++ resp->attr.sgid_index = attr.grh.sgid_index;
++ resp->attr.hop_limit = attr.grh.hop_limit;
++ resp->attr.traffic_class
++ = attr.grh.traffic_class;
++ resp->attr.dlid = attr.dlid;
++ resp->attr.sl = attr.sl;
++ resp->attr.src_path_bits
++ = attr.src_path_bits;
++ resp->attr.static_rate = attr.static_rate;
++ resp->attr.ah_flags = attr.ah_flags;
++ resp->attr.port_num = attr.port_num;
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++static void ibp_send_callback(struct work_struct *work)
++{
++ struct callback_work *cb_work;
++ struct ibp_callback_msg *msg;
++ struct cb_header *header;
++ struct ibp_client *client;
++ size_t len;
++ int data_length;
++ int cb_type;
++ int ret;
++
++ print_trace("in\n");
++
++ cb_work = (struct callback_work *) work;
++ len = sizeof(*msg);
++
++ if (!cb_work) {
++ print_err("Invalid callback work_struct\n");
++ return;
++ }
++
++ header = &cb_work->msg.header;
++ cb_type = header->cb_type;
++
++ client = cb_work->client;
++ if (!client) {
++ print_err("Invalid callback client\n");
++ goto err;
++ }
++ if (!client->ep) {
++ print_err("Invalid callback client ep\n");
++ goto err;
++ }
++ if (cb_work->data->ret) {
++ print_err("caller failed to send msg to card\n");
++ goto err;
++ }
++
++ data_length = cb_work->length;
++
++ if (cb_type == PATH_REC_GET_CB) {
++ ret = sizeof(struct path_rec_data) + sizeof(struct cb_header);
++ if (data_length != ret) {
++ print_err("Invalid data length %d, expecting %d\n",
++ data_length, ret);
++ goto err;
++ }
++ } else if (cb_type == JOIN_MCAST_CB) {
++ ret = sizeof(struct mc_join_data) + sizeof(struct cb_header);
++ if (data_length != ret) {
++ print_err("Invalid data length %d, expecting %d\n",
++ data_length, ret);
++ goto err;
++ }
++ } else {
++ print_err("Invalid callback type %d\n", cb_type);
++ goto err;
++ }
++
++ len += data_length;
++
++ msg = kzalloc(len, GFP_KERNEL);
++ if (!msg) {
++		print_err("kzalloc failed\n");
++ goto err;
++ }
++ IBP_INIT_MSG(msg, len, CALLBACK);
++
++ memcpy(msg->data, &cb_work->msg, data_length);
++
++ /* wait for host to send message to card before processing cb */
++ mutex_lock(&cb_work->data->lock);
++
++ ret = ibp_send(client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ mutex_unlock(&cb_work->data->lock);
++
++ kfree(msg);
++err:
++ if (cb_type == PATH_REC_GET_CB)
++ kfree(cb_work->data);
++
++ kfree(cb_work);
++}
++
++static void path_rec_get_callback(int status, struct ib_sa_path_rec *resp,
++ void *context)
++{
++ struct path_rec_cb_data *data;
++ struct sa_query_entry *entry;
++ struct ibp_client *client;
++ struct ib_sa_client *ib_client;
++ struct callback_work *cb_work;
++ struct cb_header *header;
++ struct path_rec_data *path_rec;
++
++ print_trace("in\n");
++
++ data = (struct path_rec_cb_data *) context;
++ entry = data->entry;
++ client = entry->ibp_client;
++
++ cb_work = kzalloc(sizeof(struct callback_work), GFP_KERNEL);
++ if (!cb_work) {
++ print_err("kzalloc failed\n");
++ goto err1;
++ }
++
++ ib_client = find_ibp_client(client);
++ if (!ib_client) {
++ print_err("Could not find client for event handler\n");
++ goto err2;
++ }
++
++ if (!entry->query) {
++ print_err("Callback occurred before call returned\n");
++ goto err2;
++ }
++
++ cb_work->data = (struct generic_cb_data *) data;
++ cb_work->client = client;
++ cb_work->length = sizeof(*header) + sizeof(*path_rec);
++
++ header = &cb_work->msg.header;
++ header->cb_type = PATH_REC_GET_CB;
++ header->status = status;
++ header->ibp_client = (u64) ib_client;
++
++ path_rec = &cb_work->msg.u.path_rec;
++ path_rec->entry = data->ibp_entry;
++ path_rec->query = data->ibp_query;
++
++ if (status) {
++ print_err("callback status %d\n", status);
++		/* XXX How is data deallocated in error cases? */
++ goto queue_work;
++ }
++
++ path_rec->resp.service_id = resp->service_id;
++ path_rec->resp.dgid_prefix = resp->dgid.global.subnet_prefix;
++ path_rec->resp.dgid_id = resp->dgid.global.interface_id;
++ path_rec->resp.sgid_prefix = resp->sgid.global.subnet_prefix;
++ path_rec->resp.sgid_id = resp->sgid.global.interface_id;
++ path_rec->resp.dlid = resp->dlid;
++ path_rec->resp.slid = resp->slid;
++ path_rec->resp.raw_traffic = resp->raw_traffic;
++ path_rec->resp.flow_label = resp->flow_label;
++ path_rec->resp.hop_limit = resp->hop_limit;
++ path_rec->resp.traffic_class = resp->traffic_class;
++ path_rec->resp.reversible = resp->reversible;
++ path_rec->resp.numb_path = resp->numb_path;
++ path_rec->resp.pkey = resp->pkey;
++ path_rec->resp.qos_class = resp->qos_class;
++ path_rec->resp.sl = resp->sl;
++ path_rec->resp.mtu_selector = resp->mtu_selector;
++ path_rec->resp.mtu = resp->mtu;
++ path_rec->resp.rate_selector = resp->rate_selector;
++ path_rec->resp.rate = resp->rate;
++ path_rec->resp.packet_life_time_selector
++ = resp->packet_life_time_selector;
++ path_rec->resp.packet_life_time = resp->packet_life_time;
++ path_rec->resp.preference = resp->preference;
++
++queue_work:
++ free_query_list(entry);
++
++ INIT_WORK(&cb_work->work, ibp_send_callback);
++ queue_work(client->workqueue, &cb_work->work);
++ return;
++err2:
++ kfree(cb_work);
++err1:
++ kfree(data);
++ return;
++}
++
++int ibp_cmd_sa_path_rec_get(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ib_device *ib_device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_sa_path_rec_get_cmd *cmd;
++ struct ibp_sa_path_rec_get_resp *resp;
++ struct ib_sa_client *client;
++ struct ib_sa_query *sa_query;
++ struct sa_query_entry *entry;
++ struct path_rec_cb_data *data = NULL;
++ struct ib_sa_path_rec rec;
++ size_t len;
++ int query_id;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_path_rec_get_cmd *) hdr;
++ ib_device = (struct ib_device *) cmd->device;
++ client = (struct ib_sa_client *) cmd->ibp_client;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ entry = add_query_list(ibp_client);
++ if (IS_ERR(entry)) {
++ ret = PTR_ERR(entry);
++ goto send_resp;
++ }
++
++ data = kzalloc(sizeof(*data), GFP_KERNEL);
++ if (!data) {
++ free_query_list(entry);
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++
++ data->entry = entry;
++ data->ibp_entry = cmd->entry;
++ data->ibp_query = cmd->query;
++
++ rec.service_id = cmd->rec.service_id;
++ rec.dgid.global.interface_id
++ = cmd->rec.dgid_id;
++ rec.dgid.global.subnet_prefix
++ = cmd->rec.dgid_prefix;
++ rec.sgid.global.interface_id
++ = cmd->rec.sgid_id;
++ rec.sgid.global.subnet_prefix
++ = cmd->rec.sgid_prefix;
++ rec.dlid = cmd->rec.dlid;
++ rec.slid = cmd->rec.slid;
++ rec.raw_traffic = cmd->rec.raw_traffic;
++ rec.flow_label = cmd->rec.flow_label;
++ rec.hop_limit = cmd->rec.hop_limit;
++ rec.traffic_class = cmd->rec.traffic_class;
++ rec.reversible = cmd->rec.reversible;
++ rec.numb_path = cmd->rec.numb_path;
++ rec.pkey = cmd->rec.pkey;
++ rec.qos_class = cmd->rec.qos_class;
++ rec.sl = cmd->rec.sl;
++ rec.mtu_selector = cmd->rec.mtu_selector;
++ rec.mtu = cmd->rec.mtu;
++ rec.rate_selector = cmd->rec.rate_selector;
++ rec.rate = cmd->rec.rate;
++ rec.packet_life_time_selector
++ = cmd->rec.packet_life_time_selector;
++ rec.packet_life_time = cmd->rec.packet_life_time;
++ rec.preference = cmd->rec.preference;
++
++ mutex_init(&data->lock);
++ mutex_lock(&data->lock);
++
++ query_id = ib_sa_path_rec_get(client, ib_device, cmd->port_num, &rec,
++ cmd->comp_mask, cmd->timeout_ms,
++ GFP_KERNEL, path_rec_get_callback, data,
++ &sa_query);
++ if (query_id < 0) {
++ ret = query_id;
++ print_err("ib_sa_path_rec_get returned %d\n", ret);
++ free_query_list(entry);
++ mutex_unlock(&data->lock);
++ kfree(data);
++ data = NULL;
++ goto send_resp;
++ }
++ entry->query = sa_query;
++ entry->sa_client = client;
++ entry->id = query_id;
++
++ len += sizeof(*resp);
++ resp = (struct ibp_sa_path_rec_get_resp *) msg->data;
++ resp->query_id = query_id;
++ resp->sa_query = (u64)sa_query;
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++
++ if (data) {
++ data->ret = ret;
++ mutex_unlock(&data->lock);
++ }
++
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++static int sa_join_callback(int status, struct ib_sa_multicast *multicast)
++{
++ struct join_mcast_cb_data *data;
++ struct ibp_client *client;
++ struct ib_sa_client *ib_client;
++ struct callback_work *cb_work;
++ struct cb_header *header;
++ struct mc_join_data *mc_join;
++ struct ib_sa_mcmember_rec *ib_rec;
++ struct ibp_sa_mcmember_rec *ibp_rec;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ data = (struct join_mcast_cb_data *) multicast->context;
++
++ if (status == -ENETRESET)
++ goto err1;
++
++ cb_work = kzalloc(sizeof(struct callback_work), GFP_KERNEL);
++ if (!cb_work) {
++ print_err("kzalloc failed\n");
++ ret = -ENOMEM;
++ goto err1;
++ }
++
++ client = data->client;
++
++ ib_client = find_ibp_client(client);
++ if (!ib_client) {
++ print_err("Could not find client for event handler\n");
++ ret = -EINVAL;
++ goto err2;
++ }
++
++ cb_work->data = (struct generic_cb_data *) data;
++ cb_work->client = client;
++ cb_work->length = sizeof(*header) + sizeof(*mc_join);
++
++ header = &cb_work->msg.header;
++ header->cb_type = JOIN_MCAST_CB;
++ header->status = status;
++ header->ibp_client = (u64) ib_client;
++
++ mc_join = &cb_work->msg.u.mc_join;
++ mc_join->ibp_mcast = (u64) multicast;
++ mc_join->mcentry = data->mcentry;
++
++ if (status) {
++ print_err("callback status %d\n", status);
++ goto queue_work;
++ }
++
++ ib_rec = &multicast->rec;
++ ibp_rec = &mc_join->rec;
++
++ ibp_rec->mgid_prefix = ib_rec->mgid.global.subnet_prefix;
++ ibp_rec->mgid_id = ib_rec->mgid.global.interface_id;
++ ibp_rec->port_gid_prefix = ib_rec->port_gid.global.subnet_prefix;
++ ibp_rec->port_gid_id = ib_rec->port_gid.global.interface_id;
++ ibp_rec->qkey = ib_rec->qkey;
++ ibp_rec->mlid = ib_rec->mlid;
++ ibp_rec->mtu_selector = ib_rec->mtu_selector;
++ ibp_rec->mtu = ib_rec->mtu;
++ ibp_rec->traffic_class = ib_rec->traffic_class;
++ ibp_rec->pkey = ib_rec->pkey;
++ ibp_rec->rate_selector = ib_rec->rate_selector;
++ ibp_rec->rate = ib_rec->rate;
++ ibp_rec->packet_life_time_selector
++ = ib_rec->packet_life_time_selector;
++ ibp_rec->packet_life_time = ib_rec->packet_life_time;
++ ibp_rec->sl = ib_rec->sl;
++ ibp_rec->flow_label = ib_rec->flow_label;
++ ibp_rec->hop_limit = ib_rec->hop_limit;
++ ibp_rec->join_state = ib_rec->join_state;
++ ibp_rec->proxy_join = ib_rec->proxy_join;
++
++queue_work:
++ INIT_WORK(&cb_work->work, ibp_send_callback);
++ queue_work(client->workqueue, &cb_work->work);
++ return 0;
++err2:
++ kfree(cb_work);
++err1:
++ return ret;
++}
++
++int ibp_cmd_sa_join_multicast(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ib_device *ib_device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_sa_join_multicast_cmd *cmd;
++ struct ibp_sa_join_multicast_resp *resp;
++ struct ib_sa_client *client;
++ struct ib_sa_multicast *multicast;
++ struct join_mcast_cb_data *data;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_join_multicast_cmd *) hdr;
++ ib_device = (struct ib_device *) cmd->device;
++ client = (struct ib_sa_client *) cmd->ibp_client;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ data = kzalloc(sizeof(*data), GFP_KERNEL);
++ if (!data) {
++ ret = -ENOMEM;
++ goto send_resp;
++ }
++
++ data->client = ibp_client;
++ data->mcentry = cmd->mcentry;
++
++ mutex_init(&data->lock);
++ mutex_lock(&data->lock);
++
++ down_write(&list_rwsem);
++ list_add(&data->list, &mcast_list);
++ up_write(&list_rwsem);
++
++ multicast = ib_sa_join_multicast(client, ib_device,
++ cmd->port_num, &cmd->rec,
++ cmd->comp_mask, GFP_KERNEL,
++ sa_join_callback, data);
++
++ if (IS_ERR(multicast)) {
++ ret = PTR_ERR(multicast);
++ print_err("ib_sa_join_multicast returned %d\n", ret);
++ mutex_unlock(&data->lock);
++ down_write(&list_rwsem);
++ list_del(&data->list);
++ up_write(&list_rwsem);
++ kfree(data);
++ data = NULL;
++ goto send_resp;
++ }
++ data->mcast = multicast;
++
++ len += sizeof(*resp);
++ resp = (struct ibp_sa_join_multicast_resp *) msg->data;
++
++ resp->ibp_mcast = (u64) multicast;
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++
++ if (data) {
++ data->ret = ret;
++ mutex_unlock(&data->lock);
++ }
++
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++int ibp_cmd_sa_free_multicast(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ibp_verb_response_msg *msg;
++ struct ibp_sa_free_multicast_cmd *cmd;
++ struct ib_sa_multicast *multicast;
++ struct join_mcast_cb_data *data;
++ size_t len;
++ int ret = 0;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_free_multicast_cmd *) hdr;
++ multicast = (struct ib_sa_multicast *) cmd->ibp_mcast;
++ data = (struct join_mcast_cb_data *) multicast->context;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ ib_sa_free_multicast(multicast);
++
++ down_write(&list_rwsem);
++ list_del(&data->list);
++ up_write(&list_rwsem);
++
++ kfree(data);
++
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
++
++int ibp_cmd_sa_get_mcmember_rec(struct ibp_client *ibp_client,
++ struct ibp_msg_header *hdr)
++{
++ struct ib_device *ib_device;
++ struct ibp_verb_response_msg *msg;
++ struct ibp_sa_get_mcmember_rec_cmd *cmd;
++ struct ibp_sa_get_mcmember_rec_resp *resp;
++ struct ib_sa_mcmember_rec rec;
++ union ib_gid mgid;
++ size_t len;
++ int ret;
++
++ print_trace("in\n");
++
++ cmd = (struct ibp_sa_get_mcmember_rec_cmd *) hdr;
++ ib_device = (struct ib_device *) cmd->device;
++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf;
++ len = sizeof(*msg);
++
++ mgid.global.subnet_prefix = cmd->subnet_prefix;
++ mgid.global.interface_id = cmd->interface_id;
++
++ ret = ib_sa_get_mcmember_rec(ib_device, cmd->port_num, &mgid, &rec);
++ if (ret) {
++ print_err("ib_sa_get_mcmember_rec returned %d\n", ret);
++ goto send_resp;
++ }
++
++ len += sizeof(*resp);
++ resp = (struct ibp_sa_get_mcmember_rec_resp *) msg->data;
++
++ resp->rec.mgid.global.subnet_prefix
++ = rec.mgid.global.subnet_prefix;
++ resp->rec.mgid.global.interface_id
++ = rec.mgid.global.interface_id;
++ resp->rec.port_gid.global.subnet_prefix
++ = rec.port_gid.global.subnet_prefix;
++ resp->rec.port_gid.global.interface_id
++ = rec.port_gid.global.interface_id;
++ resp->rec.qkey = rec.qkey;
++ resp->rec.mlid = rec.mlid;
++ resp->rec.mtu_selector = rec.mtu_selector;
++ resp->rec.mtu = rec.mtu;
++ resp->rec.traffic_class = rec.traffic_class;
++ resp->rec.pkey = rec.pkey;
++ resp->rec.rate_selector = rec.rate_selector;
++ resp->rec.rate = rec.rate;
++ resp->rec.packet_life_time_selector
++ = rec.packet_life_time_selector;
++ resp->rec.packet_life_time
++ = rec.packet_life_time;
++ resp->rec.sl = rec.sl;
++ resp->rec.flow_label = rec.flow_label;
++ resp->rec.hop_limit = rec.hop_limit;
++ resp->rec.scope = rec.scope;
++ resp->rec.join_state = rec.join_state;
++ resp->rec.proxy_join = rec.proxy_join;
++
++send_resp:
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret);
++
++ ret = ibp_send(ibp_client->ep, msg, len);
++ if (ret)
++ print_err("ibp_send returned %d\n", ret);
++
++ return ret;
++}
+diff -urN a6/drivers/infiniband/ibp/sa/sa_table.h a7/drivers/infiniband/ibp/sa/sa_table.h
+--- a6/drivers/infiniband/ibp/sa/sa_table.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/sa_table.h 2015-02-23 10:01:30.294769309 -0800
+@@ -0,0 +1,131 @@
++/*"
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#define PATH_REC_FIELD(field) \
++ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \
++ .struct_size_bytes = sizeof((struct ib_sa_path_rec *) 0)->field, \
++ .field_name = "sa_path_rec:" #field
++
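++/*
++ * Wire layout of the SA PathRecord attribute: word/bit offset and width
++ * of each ib_sa_path_rec field, in the struct ib_field format used by
++ * the ib_pack()/ib_unpack() helpers.
++ */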
++static const struct ib_field path_rec_table[] = {
++ { PATH_REC_FIELD(service_id),
++ .offset_words = 0,
++ .offset_bits = 0,
++ .size_bits = 64 },
++ { PATH_REC_FIELD(dgid),
++ .offset_words = 2,
++ .offset_bits = 0,
++ .size_bits = 128 },
++ { PATH_REC_FIELD(sgid),
++ .offset_words = 6,
++ .offset_bits = 0,
++ .size_bits = 128 },
++ { PATH_REC_FIELD(dlid),
++ .offset_words = 10,
++ .offset_bits = 0,
++ .size_bits = 16 },
++ { PATH_REC_FIELD(slid),
++ .offset_words = 10,
++ .offset_bits = 16,
++ .size_bits = 16 },
++ { PATH_REC_FIELD(raw_traffic),
++ .offset_words = 11,
++ .offset_bits = 0,
++ .size_bits = 1 },
++ { RESERVED,
++ .offset_words = 11,
++ .offset_bits = 1,
++ .size_bits = 3 },
++ { PATH_REC_FIELD(flow_label),
++ .offset_words = 11,
++ .offset_bits = 4,
++ .size_bits = 20 },
++ { PATH_REC_FIELD(hop_limit),
++ .offset_words = 11,
++ .offset_bits = 24,
++ .size_bits = 8 },
++ { PATH_REC_FIELD(traffic_class),
++ .offset_words = 12,
++ .offset_bits = 0,
++ .size_bits = 8 },
++ { PATH_REC_FIELD(reversible),
++ .offset_words = 12,
++ .offset_bits = 8,
++ .size_bits = 1 },
++ { PATH_REC_FIELD(numb_path),
++ .offset_words = 12,
++ .offset_bits = 9,
++ .size_bits = 7 },
++ { PATH_REC_FIELD(pkey),
++ .offset_words = 12,
++ .offset_bits = 16,
++ .size_bits = 16 },
++ { PATH_REC_FIELD(qos_class),
++ .offset_words = 13,
++ .offset_bits = 0,
++ .size_bits = 12 },
++ { PATH_REC_FIELD(sl),
++ .offset_words = 13,
++ .offset_bits = 12,
++ .size_bits = 4 },
++ { PATH_REC_FIELD(mtu_selector),
++ .offset_words = 13,
++ .offset_bits = 16,
++ .size_bits = 2 },
++ { PATH_REC_FIELD(mtu),
++ .offset_words = 13,
++ .offset_bits = 18,
++ .size_bits = 6 },
++ { PATH_REC_FIELD(rate_selector),
++ .offset_words = 13,
++ .offset_bits = 24,
++ .size_bits = 2 },
++ { PATH_REC_FIELD(rate),
++ .offset_words = 13,
++ .offset_bits = 26,
++ .size_bits = 6 },
++ { PATH_REC_FIELD(packet_life_time_selector),
++ .offset_words = 14,
++ .offset_bits = 0,
++ .size_bits = 2 },
++ { PATH_REC_FIELD(packet_life_time),
++ .offset_words = 14,
++ .offset_bits = 2,
++ .size_bits = 6 },
++ { PATH_REC_FIELD(preference),
++ .offset_words = 14,
++ .offset_bits = 8,
++ .size_bits = 8 },
++ { RESERVED,
++ .offset_words = 14,
++ .offset_bits = 16,
++ .size_bits = 48 },
++};
+diff -urN a6/drivers/infiniband/ibp/sa/server.c a7/drivers/infiniband/ibp/sa/server.c
+--- a6/drivers/infiniband/ibp/sa/server.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/server.c 2015-02-23 10:01:30.294769309 -0800
+@@ -0,0 +1,221 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include "server.h"
++
++MODULE_AUTHOR("Jerrie Coffman");
++MODULE_AUTHOR("Phil Cayton");
++MODULE_AUTHOR("Jay Sternberg");
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_DESCRIPTION(DRV_DESC);
++MODULE_VERSION(DRV_VERSION);
++
++MODULE_PARAM(port, port, int, IBP_SA_PORT, "Connection port");
++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog");
++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds");
++
++#ifdef IBP_DEBUG
++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all");
++#endif
++
++struct rw_semaphore list_rwsem;
++
++LIST_HEAD(client_list);
++
++static struct task_struct *listen_thread;
++
++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node)
++{
++ struct ibp_client *client;
++ int ret = -ENOMEM;
++
++ client = kzalloc(sizeof(*client), GFP_KERNEL);
++ if (!client) {
++ print_err("kzalloc failed\n");
++ return ERR_PTR(ret);
++ }
++
++ client->ep = ep;
++
++ client->rx_buf = (void *)__get_free_page(GFP_KERNEL);
++ if (!client->rx_buf) {
++ print_err("__get_free_page rx_buf failed\n");
++ goto err0;
++ }
++
++ client->tx_buf = (void *)__get_free_page(GFP_KERNEL);
++ if (!client->tx_buf) {
++ print_err("__get_free_page tx_buf failed\n");
++ goto err1;
++ }
++
++ client->workqueue = create_singlethread_workqueue(DRV_NAME);
++ if (!client->workqueue) {
++ print_err("create_singlethread_workqueue failed\n");
++ goto err2;
++ }
++
++ down_write(&list_rwsem);
++ list_add(&client->list, &client_list);
++ up_write(&list_rwsem);
++
++ client->ibp_sa_client_thread = kthread_run(ibp_process_recvs,
++ client, DRV_NAME);
++	if (IS_ERR(client->ibp_sa_client_thread)) {
++		ret = PTR_ERR(client->ibp_sa_client_thread);
++		print_err("create client thread failed\n");
++		goto err3;
++	}
++
++ return client;
++err3:
++ down_write(&list_rwsem);
++ list_del(&client->list);
++ up_write(&list_rwsem);
++
++ destroy_workqueue(client->workqueue);
++err2:
++ free_page((uintptr_t)client->tx_buf);
++err1:
++ free_page((uintptr_t)client->rx_buf);
++err0:
++ kfree(client);
++ return ERR_PTR(ret);
++}
++
++static int ibp_sa_listen(void *data)
++{
++ struct ibp_client *client;
++ struct scif_pollepd listen;
++ struct scif_portID peer;
++ scif_epd_t ep;
++ int ret;
++
++ listen.epd = scif_open();
++ if (IS_NULL_OR_ERR(listen.epd)) {
++ print_err("scif_open failed\n");
++ ret = -EIO;
++ goto err0;
++ }
++ listen.events = POLLIN;
++
++ ret = scif_bind(listen.epd, port);
++ if (ret < 0) {
++ print_err("scif_bind returned %d\n", ret);
++ goto err1;
++ }
++
++ ret = scif_listen(listen.epd, backlog);
++ if (ret) {
++ print_err("scif_listen returned %d\n", ret);
++ goto err1;
++ }
++
++ while (!kthread_should_stop()) {
++
++ schedule();
++
++ ret = scif_poll(&listen, 1, timeout);
++ if (ret == 0) /* timeout */
++ continue;
++ if (ret < 0) {
++ print_err("scif_poll revents 0x%x\n", listen.revents);
++ continue;
++ }
++
++ ret = scif_accept(listen.epd, &peer, &ep, 0);
++ if (ret) {
++ print_err("scif_accept returned %d\n", ret);
++ continue;
++ }
++
++ print_dbg("accepted node %d port %d\n", peer.node, peer.port);
++
++ client = ibp_create_client(ep, peer.node);
++ if (IS_ERR(client)) {
++ ret = PTR_ERR(client);
++ print_err("ibp_create_client returned %d\n", ret);
++ scif_close(ep);
++ }
++ }
++err1:
++ scif_close(listen.epd);
++err0:
++ return ret;
++}
++
++static int __init ibp_sa_server_init(void)
++{
++ int ret = 0;
++
++ print_info(DRV_SIGNON);
++
++ init_rwsem(&list_rwsem);
++
++ /* Start a thread for inbound connections. */
++ listen_thread = kthread_run(ibp_sa_listen, NULL, DRV_NAME);
++ if (IS_NULL_OR_ERR(listen_thread)) {
++ ret = PTR_ERR(listen_thread);
++ print_err("kthread_run returned %d\n", ret);
++ }
++
++ return ret;
++}
++
++static void __exit ibp_sa_server_exit(void)
++{
++ struct ibp_client *client, *next;
++ struct completion done;
++
++ kthread_stop(listen_thread);
++
++ down_write(&list_rwsem);
++ list_for_each_entry_safe(client, next, &client_list, list) {
++ init_completion(&done);
++ client->done = &done;
++
++ /* Close scif ep to unblock the client thread scif_recv */
++ scif_close(client->ep);
++
++ up_write(&list_rwsem);
++
++ /* Wait for client thread to finish */
++ wait_for_completion(&done);
++
++ down_write(&list_rwsem);
++ }
++ up_write(&list_rwsem);
++
++ print_info(DRV_DESC " unloaded\n");
++}
++
++module_init(ibp_sa_server_init);
++module_exit(ibp_sa_server_exit);
+diff -urN a6/drivers/infiniband/ibp/sa/server.h a7/drivers/infiniband/ibp/sa/server.h
+--- a6/drivers/infiniband/ibp/sa/server.h 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/server.h 2015-02-23 10:01:30.294769309 -0800
+@@ -0,0 +1,172 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef SERVER_H
++#define SERVER_H
++
++#include <linux/fs.h>
++#include <linux/cdev.h>
++#include <linux/anon_inodes.h>
++#include <rdma/ib_umem.h>
++#include <rdma/ib_cache.h>
++#include "ibp-abi.h"
++#include "sa_ibp_abi.h"
++#include "common.h"
++
++#define DRV_ROLE "Server"
++#define DRV_NAME "ibp_sa_server"
++
++extern int timeout;
++extern struct rw_semaphore list_rwsem;
++extern struct list_head client_list;
++extern struct list_head sa_entry_list;
++extern struct list_head query_list;
++extern struct list_head mcast_list;
++
++struct ib_sa_sm_ah {
++ struct ib_ah *ah;
++ struct kref ref;
++ u16 pkey_index;
++ u8 src_path_mask;
++};
++
++struct ib_sa_port {
++ struct ib_mad_agent *agent;
++ struct ib_mad_agent *notice_agent;
++ struct ib_sa_sm_ah *sm_ah;
++ struct work_struct update_task;
++ spinlock_t ah_lock;
++ u8 port_num;
++ struct ib_device *device;
++};
++
++struct ib_sa_device {
++ int start_port, end_port;
++ struct ib_event_handler event_handler;
++ struct ib_sa_port port[0];
++};
++
++struct ibp_client {
++ struct list_head list;
++ scif_epd_t ep;
++ void *rx_buf;
++ void *tx_buf;
++ struct completion *done;
++ struct workqueue_struct *workqueue;
++ struct task_struct *ibp_sa_client_thread;
++};
++
++struct sa_entry {
++ struct list_head list;
++ struct ib_sa_client ib_client;
++ struct ibp_client *client;
++};
++
++struct sa_query_entry {
++ struct list_head list;
++ int id;
++ struct ibp_client *ibp_client;
++ struct ib_sa_client *sa_client;
++ struct ib_sa_query *query;
++};
++
++struct path_rec_cb_data {
++ struct mutex lock;
++ int ret;
++ struct sa_query_entry *entry;
++ u64 ibp_entry;
++ u64 ibp_query;
++};
++
++struct join_mcast_cb_data {
++ struct mutex lock;
++ int ret;
++ struct ibp_client *client;
++ struct ib_sa_multicast *mcast;
++ struct list_head list;
++ u64 entry;
++ u64 mcentry;
++};
++
++struct generic_cb_data {
++ struct mutex lock;
++ int ret;
++};
++
++struct callback_work {
++ struct work_struct work;
++ struct ibp_client *client;
++ struct generic_cb_data *data;
++ int length;
++ struct callback_msg msg;
++};
++
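++/* Fill in the common ibp message and response header fields in place. */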
++#define IBP_INIT_MSG(msg, size, op) \
++ do { \
++ (msg)->header.opcode = IBP_##op; \
++ (msg)->header.length = (size); \
++ (msg)->header.status = 0; \
++ (msg)->header.reserved = 0; \
++ (msg)->header.request = 0; \
++ } while (0)
++
++#define IBP_INIT_RESP(resp, size, op, req, stat) \
++ do { \
++ (resp)->header.opcode = IBP_##op; \
++ (resp)->header.length = (size); \
++ (resp)->header.status = (stat); \
++ (resp)->header.reserved = 0; \
++ (resp)->header.request = (req); \
++ } while (0)
++
++int ibp_process_recvs(void *p);
++
++int ibp_cmd_sa_path_rec_get(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_register_client(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_unregister_client(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_cancel_query(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_init_ah_from_path(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_init_ah_from_mcmember(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_join_multicast(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_free_multicast(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++int ibp_cmd_sa_get_mcmember_rec(struct ibp_client *client,
++ struct ibp_msg_header *hdr);
++
++#endif /* SERVER_H */
+diff -urN a6/drivers/infiniband/ibp/sa/server_msg.c a7/drivers/infiniband/ibp/sa/server_msg.c
+--- a6/drivers/infiniband/ibp/sa/server_msg.c 1969-12-31 16:00:00.000000000 -0800
++++ a7/drivers/infiniband/ibp/sa/server_msg.c 2015-02-23 10:01:30.294769309 -0800
+@@ -0,0 +1,185 @@
++/*
++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#include <linux/delay.h>
++
++#include "server.h"
++#include "sa_ibp_abi.h"
++
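++/*
++ * Thin wrappers over scif_send()/scif_recv(): loop until the whole buffer
++ * has been transferred, treating a short transfer as partial progress and
++ * any negative return as a hard error.
++ */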
++int ibp_send(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_send(ep, buf, (uint32_t)len, SCIF_SEND_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_send returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++int ibp_recv(scif_epd_t ep, void *buf, size_t len)
++{
++ int ret;
++
++ while (len) {
++ ret = scif_recv(ep, buf, (uint32_t)len, SCIF_RECV_BLOCK);
++ if (ret < 0) {
++ print_dbg("scif_recv returned %d\n", ret);
++ return ret;
++ }
++ buf += ret;
++ len -= ret;
++ }
++
++ return 0;
++}
++
++static int
++ibp_cmd_bad_request(struct ibp_client *client, struct ibp_msg_header *hdr)
++{
++ struct ibp_verb_response_msg *msg;
++ size_t len;
++ int status = -EBADRQC;
++
++ msg = (struct ibp_verb_response_msg *) client->tx_buf;
++ len = sizeof(*msg);
++
++ print_dbg("opcode 0x%x\n", hdr->opcode);
++
++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, status);
++ return ibp_send(client->ep, msg, len);
++}
++
++static void
++ibp_sa_destroy_client(struct ibp_client *client)
++{
++ struct join_mcast_cb_data *mcast, *next_mcast;
++ struct sa_query_entry *query, *next_query;
++ struct sa_entry *sa, *next_sa;
++
++ down_write(&list_rwsem);
++ list_del(&client->list);
++ list_for_each_entry_safe(mcast, next_mcast, &mcast_list, list)
++ if (mcast->client == client) {
++ ib_sa_free_multicast(mcast->mcast);
++ list_del(&mcast->list);
++ kfree(mcast);
++ }
++ list_for_each_entry_safe(query, next_query, &query_list, list)
++ if (query->ibp_client == client) {
++ ib_sa_cancel_query(query->id, query->query);
++ list_del(&query->list);
++ kfree(query);
++ }
++ list_for_each_entry_safe(sa, next_sa, &sa_entry_list, list)
++ if (sa->client == client) {
++ ib_sa_unregister_client(&sa->ib_client);
++ list_del(&sa->list);
++ kfree(sa);
++ }
++ up_write(&list_rwsem);
++
++ destroy_workqueue(client->workqueue);
++
++ free_page((uintptr_t)client->tx_buf);
++ free_page((uintptr_t)client->rx_buf);
++
++ if (client->done)
++ complete(client->done);
++ else
++ scif_close(client->ep);
++
++ kfree(client);
++}
++
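++/*
++ * Opcode dispatch table for the SA proxy; the receive loop answers
++ * unknown or out-of-range opcodes with ibp_cmd_bad_request().
++ */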
++static int
++(*ibp_msg_table[])(struct ibp_client *c, struct ibp_msg_header *h) = {
++ [IBP_SA_PATH_REC_GET] = ibp_cmd_sa_path_rec_get,
++ [IBP_SA_REGISTER_CLIENT] = ibp_cmd_sa_register_client,
++ [IBP_SA_UNREGISTER_CLIENT] = ibp_cmd_sa_unregister_client,
++ [IBP_SA_CANCEL_QUERY] = ibp_cmd_sa_cancel_query,
++ [IBP_INIT_AH_FROM_PATH] = ibp_cmd_init_ah_from_path,
++ [IBP_INIT_AH_FROM_MCMEMBER] = ibp_cmd_init_ah_from_mcmember,
++ [IBP_SA_JOIN_MCAST] = ibp_cmd_sa_join_multicast,
++ [IBP_SA_FREE_MCAST] = ibp_cmd_sa_free_multicast,
++ [IBP_SA_GET_MCMEMBER_REC] = ibp_cmd_sa_get_mcmember_rec,
++};
++
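++/*
++ * Per-client receive loop: read a message header, then the payload if
++ * any, dispatch through ibp_msg_table, and tear the client down when an
++ * error occurs or the peer closes the endpoint.
++ */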
++int ibp_process_recvs(void *p)
++{
++ struct ibp_client *client;
++ struct ibp_msg_header *hdr;
++ int ret;
++
++ client = (struct ibp_client *) p;
++ hdr = (struct ibp_msg_header *) client->rx_buf;
++
++ for (;;) {
++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr));
++ if (ret)
++ break;
++
++ if (hdr->length > MAX_MSG_SIZE) {
++ print_err("message too large, len %u max %lu\n",
++ hdr->length, MAX_MSG_SIZE);
++ ret = -EMSGSIZE;
++ break;
++ }
++
++ if (hdr->length > sizeof(*hdr)) {
++ ret = ibp_recv(client->ep, hdr->data,
++ hdr->length - sizeof(*hdr));
++ if (ret)
++ break;
++ }
++
++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) ||
++ !ibp_msg_table[hdr->opcode]) {
++ ibp_cmd_bad_request(client, hdr);
++ continue;
++ }
++
++ ret = ibp_msg_table[hdr->opcode](client, hdr);
++ if (ret)
++ break;
++ }
++
++ ibp_sa_destroy_client(client);
++
++ return ret;
++}
--- /dev/null
+From 674c5e41008346a8d68f534d408e240b152dec5e Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Wed, 28 May 2014 15:53:58 -0700
+Subject: [PATCH 08/13] Add ibscif to the InfiniBand HW directory
+
+Signed-off-by: Phil Cayton <phil.cayton@intel.com>
+---
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_ah.c a8/drivers/infiniband/hw/scif/ibscif_ah.c
+--- a7/drivers/infiniband/hw/scif/ibscif_ah.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_ah.c 2015-02-23 10:14:37.482809663 -0800
+@@ -0,0 +1,50 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++struct ib_ah *ibscif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
++{
++ struct ibscif_ah *ah;
++
++ ah = kzalloc(sizeof *ah, GFP_KERNEL);
++ if (!ah)
++ return ERR_PTR(-ENOMEM);
++
++ ah->dlid = cpu_to_be16(attr->dlid);
++
++ return &ah->ibah;
++}
++
++int ibscif_destroy_ah(struct ib_ah *ibah)
++{
++ kfree(to_ah(ibah));
++ return 0;
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_cm.c a8/drivers/infiniband/hw/scif/ibscif_cm.c
+--- a7/drivers/infiniband/hw/scif/ibscif_cm.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_cm.c 2015-02-23 10:14:37.482809663 -0800
+@@ -0,0 +1,515 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static LIST_HEAD(listen_list);
++DEFINE_SPINLOCK(listen_list_lock);
++
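++/*
++ * Address convention: SCIF node N is represented as 192.0.2.(100 + N),
++ * e.g. node 3 <-> 192.0.2.103.  These helpers convert between the
++ * sockaddr_in form used by the iWARP CM and a SCIF node ID.
++ */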
++static int sockaddr_in_to_node_id( struct sockaddr_in addr )
++{
++ u8 *p = (u8 *)&addr.sin_addr.s_addr;
++
++ if (p[0]==192 && p[1]==0 && p[2]==2 && p[3]>=100 && p[3]<100+IBSCIF_MAX_DEVICES)
++ return (int)(p[3]-100);
++
++ else
++ return -EINVAL;
++}
++
++static struct sockaddr_in node_id_to_sockaddr_in( int node_id )
++{
++ struct sockaddr_in addr;
++ u8 *p = (u8 *)&addr.sin_addr.s_addr;
++
++ addr.sin_family = AF_INET;
++ addr.sin_addr.s_addr = 0;
++ addr.sin_port = 0;
++
++ p[0] = 192;
++ p[1] = 0;
++ p[2] = 2;
++ p[3] = 100 + node_id;
++
++ return addr;
++}
++
++void free_cm(struct kref *kref)
++{
++ struct ibscif_cm *cm_ctx;
++ cm_ctx = container_of(kref, struct ibscif_cm, kref);
++ if (cm_ctx->conn)
++ ibscif_put_conn(cm_ctx->conn);
++ kfree(cm_ctx);
++}
++
++static inline void get_cm(struct ibscif_cm *cm_ctx)
++{
++ kref_get(&cm_ctx->kref);
++}
++
++static inline void put_cm(struct ibscif_cm *cm_ctx)
++{
++ kref_put(&cm_ctx->kref, free_cm);
++}
++
++void free_listen(struct kref *kref)
++{
++ struct ibscif_listen *listen;
++ listen = container_of(kref, struct ibscif_listen, kref);
++ kfree(listen);
++}
++
++static inline void get_listen(struct ibscif_listen *listen)
++{
++ kref_get(&listen->kref);
++}
++
++static inline void put_listen(struct ibscif_listen *listen)
++{
++ kref_put(&listen->kref, free_listen);
++}
++
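++/*
++ * Move the QP identified by cm_ctx->qpn directly to RTS using the peer's
++ * QP number and a LID derived from its node ID; on success the QP keeps
++ * a reference to cm_ctx via cm_context.
++ */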
++static int connect_qp(struct ibscif_cm *cm_ctx)
++{
++ struct ibscif_qp *qp;
++ struct ib_qp_attr qp_attr;
++ int qp_attr_mask;
++ int err;
++
++ qp = ibscif_get_qp(cm_ctx->qpn);
++ if (IS_ERR(qp)) {
++ printk(KERN_ERR PFX "%s: invalid QP number: %d\n", __func__, cm_ctx->qpn);
++ return -EINVAL;
++ }
++
++ qp_attr_mask = IB_QP_STATE |
++ IB_QP_AV |
++ IB_QP_DEST_QPN |
++ IB_QP_ACCESS_FLAGS |
++ IB_QP_MAX_QP_RD_ATOMIC |
++ IB_QP_MAX_DEST_RD_ATOMIC;
++
++ qp_attr.ah_attr.ah_flags = 0;
++ qp_attr.ah_attr.dlid = IBSCIF_NODE_ID_TO_LID(cm_ctx->remote_node_id);
++ qp_attr.dest_qp_num = cm_ctx->remote_qpn;
++ qp_attr.qp_state = IB_QPS_RTS;
++ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE |
++ IB_ACCESS_REMOTE_WRITE |
++ IB_ACCESS_REMOTE_READ |
++ IB_ACCESS_REMOTE_ATOMIC;
++ qp_attr.max_rd_atomic = 16; /* 8-bit value, don't use MAX_OR */
++ qp_attr.max_dest_rd_atomic = 16;/* 8-bit value, don't use MAX_IR */
++
++ err = ib_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask);
++
++ if (!err) {
++ qp->cm_context = cm_ctx;
++ get_cm(cm_ctx);
++ }
++
++ ibscif_put_qp(qp);
++
++ return err;
++}
++
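++/*
++ * Helpers that report ibscif connection state changes to the iWARP CM
++ * core as iw_cm_event upcalls.
++ */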
++static void event_connection_close(struct ibscif_cm *cm_ctx)
++{
++ struct iw_cm_event event;
++
++ memset(&event, 0, sizeof(event));
++ event.event = IW_CM_EVENT_CLOSE;
++ event.status = -ECONNRESET;
++ if (cm_ctx->cm_id) {
++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event);
++ cm_ctx->cm_id->rem_ref(cm_ctx->cm_id);
++ cm_ctx->cm_id = NULL;
++ }
++}
++
++static void event_connection_reply(struct ibscif_cm *cm_ctx, int status)
++{
++ struct iw_cm_event event;
++
++ memset(&event, 0, sizeof(event));
++ event.event = IW_CM_EVENT_CONNECT_REPLY;
++ event.status = status;
++ event.local_addr = *(struct sockaddr_storage *) &cm_ctx->local_addr;
++ event.remote_addr = *(struct sockaddr_storage *) &cm_ctx->remote_addr;
++
++ if ((status == 0) || (status == -ECONNREFUSED)) {
++ event.private_data_len = cm_ctx->plen;
++ event.private_data = cm_ctx->pdata;
++ }
++ if (cm_ctx->cm_id) {
++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event);
++ if (status == -ECONNREFUSED) {
++ cm_ctx->cm_id->rem_ref(cm_ctx->cm_id);
++ cm_ctx->cm_id = NULL;
++ }
++ }
++}
++
++static void event_connection_request(struct ibscif_cm *cm_ctx)
++{
++ struct iw_cm_event event;
++
++ memset(&event, 0, sizeof(event));
++ event.event = IW_CM_EVENT_CONNECT_REQUEST;
++ event.local_addr = *(struct sockaddr_storage *) &cm_ctx->local_addr;
++ event.remote_addr = *(struct sockaddr_storage *) &cm_ctx->remote_addr;
++ event.private_data_len = cm_ctx->plen;
++ event.private_data = cm_ctx->pdata;
++ event.provider_data = cm_ctx;
++ event.ird = 16;
++ event.ord = 16;
++
++ if (cm_ctx->listen) {
++ cm_ctx->listen->cm_id->event_handler( cm_ctx->listen->cm_id, &event);
++ put_listen(cm_ctx->listen);
++ cm_ctx->listen = NULL;
++ }
++}
++
++static void event_connection_established( struct ibscif_cm *cm_ctx )
++{
++ struct iw_cm_event event;
++
++ memset(&event, 0, sizeof(event));
++ event.event = IW_CM_EVENT_ESTABLISHED;
++ event.ird = 16;
++ event.ord = 16;
++ if (cm_ctx->cm_id) {
++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event);
++ }
++}
++
++void ibscif_cm_async_callback(void *cm_context)
++{
++ struct ibscif_cm *cm_ctx = cm_context;
++
++ if (cm_ctx) {
++ event_connection_close(cm_ctx);
++ put_cm(cm_ctx);
++ }
++}
++
++int ibscif_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
++{
++ struct ibscif_cm *cm_ctx;
++ struct sockaddr_in *local_addr = (struct sockaddr_in *) &cm_id->local_addr;
++ struct sockaddr_in *remote_addr = (struct sockaddr_in *) &cm_id->remote_addr;
++ int node_id;
++ int remote_node_id;
++ int err = 0;
++
++ cm_ctx = kzalloc(sizeof *cm_ctx, GFP_KERNEL);
++ if (!cm_ctx) {
++ printk(KERN_ALERT PFX "%s: cannot allocate cm_ctx\n", __func__);
++ return -ENOMEM;
++ }
++
++ kref_init(&cm_ctx->kref); /* refcnt <- 1 */
++ spin_lock_init(&cm_ctx->lock);
++
++ node_id = sockaddr_in_to_node_id(*local_addr);
++ remote_node_id = sockaddr_in_to_node_id(*remote_addr);
++ if (node_id<0 || remote_node_id<0) {
++ printk(KERN_ALERT PFX "%s: invalid address, local_addr=%8x, remote_addr=%8x, node_id=%d, remote_node_id=%d\n",
++ __func__, local_addr->sin_addr.s_addr, remote_addr->sin_addr.s_addr,
++ node_id, remote_node_id);
++ err = -EINVAL;
++ goto out_free;
++ }
++
++ cm_ctx->conn = ibscif_get_conn( node_id, remote_node_id, 0 );
++ if (!cm_ctx->conn) {
++ printk(KERN_ALERT PFX "%s: failed to get connection %d-->%d\n", __func__, node_id, remote_node_id);
++ err = -EINVAL;
++ goto out_free;
++ }
++
++ cm_id->add_ref(cm_id);
++ cm_id->provider_data = cm_ctx;
++
++ cm_ctx->cm_id = cm_id;
++ cm_ctx->node_id = node_id;
++ cm_ctx->remote_node_id = remote_node_id;
++ cm_ctx->local_addr = *local_addr;
++ cm_ctx->remote_addr = *remote_addr;
++ cm_ctx->qpn = conn_param->qpn;
++ cm_ctx->plen = conn_param->private_data_len;
++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) {
++ printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n",
++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE);
++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE;
++ }
++ if (cm_ctx->plen)
++ memcpy(cm_ctx->pdata, conn_param->private_data, cm_ctx->plen);
++
++ err = ibscif_send_cm_req( cm_ctx );
++
++ return err;
++
++out_free:
++ kfree(cm_ctx);
++ return err;
++}
++
++int ibscif_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
++{
++ struct ibscif_cm *cm_ctx = cm_id->provider_data;
++ int err = 0;
++
++ cm_id->add_ref(cm_id);
++ cm_ctx->cm_id = cm_id;
++ cm_ctx->qpn = conn_param->qpn;
++ cm_ctx->plen = conn_param->private_data_len;
++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) {
++ printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n",
++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE);
++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE;
++ }
++ if (cm_ctx->plen)
++ memcpy(cm_ctx->pdata, conn_param->private_data, cm_ctx->plen);
++
++ err = connect_qp( cm_ctx );
++ if (err) {
++ printk(KERN_ALERT PFX "%s: failed to modify QP into connected state\n", __func__);
++ goto err_out;
++ }
++
++ err = ibscif_send_cm_rep( cm_ctx );
++ if (err) {
++ printk(KERN_ALERT PFX "%s: failed to send REP\n", __func__);
++ goto err_out;
++ }
++
++ return 0;
++
++err_out:
++ cm_id->rem_ref(cm_id);
++ cm_ctx->cm_id = NULL;
++ put_cm(cm_ctx);
++ return err;
++}
++
++int ibscif_cm_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
++{
++ struct ibscif_cm *cm_ctx = cm_id->provider_data;
++ int err = 0;
++
++ err = ibscif_send_cm_rej( cm_ctx, pdata, pdata_len );
++
++ put_cm(cm_ctx);
++ return err;
++}
++
++int ibscif_cm_create_listen(struct iw_cm_id *cm_id, int backlog)
++{
++ struct ibscif_listen *listen;
++ struct sockaddr_in *local_addr = (struct sockaddr_in *) &cm_id->local_addr;
++
++ listen = kzalloc(sizeof *listen, GFP_KERNEL);
++ if (!listen) {
++ printk(KERN_ALERT PFX "%s: cannot allocate listen object\n", __func__);
++ return -ENOMEM;
++ }
++
++ kref_init(&listen->kref); /* refcnt <- 1 */
++
++ listen->cm_id = cm_id;
++ listen->port = local_addr->sin_port;
++ cm_id->provider_data = listen;
++ cm_id->add_ref(cm_id);
++
++ spin_lock_bh(&listen_list_lock);
++ list_add(&listen->entry, &listen_list);
++ spin_unlock_bh(&listen_list_lock);
++
++ return 0;
++}
++
++int ibscif_cm_destroy_listen(struct iw_cm_id *cm_id)
++{
++ struct ibscif_listen *listen = cm_id->provider_data;
++
++ spin_lock_bh(&listen_list_lock);
++ list_del(&listen->entry);
++ spin_unlock_bh(&listen_list_lock);
++ cm_id->rem_ref(cm_id);
++ put_listen(listen);
++
++ return 0;
++}
++
++/* similar to ibscif_get_qp(), but differs in:
++ * (1) use the "irqsave" version of the lock functions to avoid the
++ * kernel warnings about "local_bh_enable_ip";
++ * (2) don't hold the reference on success;
++ * (3) return NULL instead of error code on failure.
++ */
++struct ib_qp *ibscif_cm_get_qp(struct ib_device *ibdev, int qpn)
++{
++ struct ibscif_qp *qp;
++ unsigned long flags;
++
++ read_lock_irqsave(&wiremap_lock, flags);
++ qp = idr_find(&wiremap, qpn);
++ if (likely(qp) && unlikely(qp->magic != QP_MAGIC))
++ qp = NULL;
++	read_unlock_irqrestore(&wiremap_lock, flags);
++
++ return qp ? &qp->ibqp : NULL;
++}
++
++void ibscif_cm_add_ref(struct ib_qp *ibqp)
++{
++ struct ibscif_qp *qp;
++
++ if (likely(ibqp)) {
++ qp = to_qp(ibqp);
++ kref_get(&qp->ref);
++ }
++}
++
++void ibscif_cm_rem_ref(struct ib_qp *ibqp)
++{
++ struct ibscif_qp *qp;
++
++ if (likely(ibqp)) {
++ qp = to_qp(ibqp);
++ ibscif_put_qp(qp);
++ }
++}
++
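++/*
++ * Demultiplex an incoming CM PDU: REQ creates a new cm_ctx and raises a
++ * connect request on the matching listener, REP/REJ complete an outgoing
++ * connect, and RTU reports the connection as established.
++ */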
++int ibscif_process_cm_skb(struct sk_buff *skb, struct ibscif_conn *conn)
++{
++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data;
++ struct ibscif_cm *cm_ctx;
++ struct ibscif_listen *listen;
++ int cmd, qpn, status, plen, err, port;
++ u64 req_ctx, rep_ctx;
++
++ req_ctx = __be64_to_cpu(pdu->cm.req_ctx);
++ rep_ctx = __be64_to_cpu(pdu->cm.rep_ctx);
++ cmd = __be32_to_cpu(pdu->cm.cmd);
++ port = __be32_to_cpu(pdu->cm.port);
++ qpn = __be32_to_cpu(pdu->cm.qpn);
++ status = __be32_to_cpu(pdu->cm.status);
++ plen = __be32_to_cpu(pdu->cm.plen);
++
++ switch (cmd) {
++ case IBSCIF_CM_REQ:
++ cm_ctx = kzalloc(sizeof *cm_ctx, GFP_KERNEL);
++ if (!cm_ctx) {
++ printk(KERN_ALERT PFX "%s: cannot allocate cm_ctx\n", __func__);
++ return -ENOMEM;
++ }
++ kref_init(&cm_ctx->kref); /* refcnt <- 1 */
++ spin_lock_init(&cm_ctx->lock);
++
++ spin_lock_bh(&listen_list_lock);
++ list_for_each_entry(listen, &listen_list, entry) {
++ if (listen->port == port) {
++ cm_ctx->listen = listen;
++ get_listen(listen);
++ }
++ }
++ spin_unlock_bh(&listen_list_lock);
++
++ if (!cm_ctx->listen) {
++ printk(KERN_ALERT PFX "%s: no matching listener for connection request, port=%d\n", __func__, port);
++ put_cm(cm_ctx);
++ /* fix me: send CM_REJ */
++ return -EINVAL;
++ }
++
++ cm_ctx->cm_id = NULL;
++ cm_ctx->node_id = conn->dev->node_id;
++ cm_ctx->remote_node_id = conn->remote_node_id;
++ cm_ctx->local_addr = node_id_to_sockaddr_in(cm_ctx->node_id);
++ if (cm_ctx->listen)
++ cm_ctx->local_addr.sin_port = cm_ctx->listen->port;
++ cm_ctx->remote_addr = node_id_to_sockaddr_in(cm_ctx->remote_node_id);
++ cm_ctx->remote_qpn = qpn;
++ cm_ctx->plen = plen;
++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) {
++ printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n",
++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE);
++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE;
++ }
++ if (cm_ctx->plen)
++ memcpy(cm_ctx->pdata, pdu->cm.pdata, cm_ctx->plen);
++
++ cm_ctx->peer_context = req_ctx;
++ cm_ctx->conn = conn;
++ atomic_inc(&conn->refcnt);
++
++ event_connection_request(cm_ctx);
++ break;
++
++ case IBSCIF_CM_REP:
++ cm_ctx = (struct ibscif_cm *)req_ctx;
++		cm_ctx->plen = min_t(int, plen, IBSCIF_MAX_PDATA_SIZE);
++		memcpy(cm_ctx->pdata, pdu->cm.pdata, cm_ctx->plen);
++ cm_ctx->remote_qpn = qpn;
++ cm_ctx->peer_context = rep_ctx;
++ err = connect_qp( cm_ctx );
++ if (!err)
++ err = ibscif_send_cm_rtu(cm_ctx);
++ if (err)
++ printk(KERN_ALERT PFX "%s: failed to modify QP into connected state\n", __func__);
++ event_connection_reply(cm_ctx, err);
++ put_cm(cm_ctx);
++ break;
++
++ case IBSCIF_CM_REJ:
++ cm_ctx = (struct ibscif_cm *)req_ctx;
++		cm_ctx->plen = min_t(int, plen, IBSCIF_MAX_PDATA_SIZE);
++		memcpy(cm_ctx->pdata, pdu->cm.pdata, cm_ctx->plen);
++ event_connection_reply(cm_ctx, status);
++ put_cm(cm_ctx);
++ break;
++
++ case IBSCIF_CM_RTU:
++ cm_ctx = (struct ibscif_cm *)rep_ctx;
++ event_connection_established( cm_ctx );
++ put_cm(cm_ctx);
++ break;
++
++ default:
++ printk(KERN_ALERT PFX "%s: invalid CM cmd: %d\n", __func__, pdu->cm.cmd);
++ break;
++ }
++
++ return 0;
++}
++
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_cq.c a8/drivers/infiniband/hw/scif/ibscif_cq.c
+--- a7/drivers/infiniband/hw/scif/ibscif_cq.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_cq.c 2015-02-23 10:14:37.483809663 -0800
+@@ -0,0 +1,313 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static void ibscif_cq_tasklet(unsigned long cq_ptr)
++{
++ struct ibscif_cq *cq = (struct ibscif_cq *)cq_ptr;
++ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
++}
++
++#ifdef MOFED
++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr,
++ struct ib_ucontext *context, struct ib_udata *udata)
++#else
++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
++ struct ib_ucontext *context, struct ib_udata *udata)
++#endif
++{
++ struct ibscif_dev *dev = to_dev(ibdev);
++ struct ibscif_cq *cq;
++ int nbytes, npages;
++ int err;
++#ifdef MOFED
++ int entries = attr->cqe;
++#endif
++
++ if (entries < 1 || entries > MAX_CQ_SIZE)
++ return ERR_PTR(-EINVAL);
++
++ if (!atomic_add_unless(&dev->cq_cnt, 1, MAX_CQS))
++ return ERR_PTR(-EAGAIN);
++
++ cq = kzalloc(sizeof *cq, GFP_KERNEL);
++ if (!cq) {
++ atomic_dec(&dev->cq_cnt);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ spin_lock_init(&cq->lock);
++ tasklet_init(&cq->tasklet, ibscif_cq_tasklet, (unsigned long)cq);
++ cq->state = CQ_READY;
++
++ nbytes = PAGE_ALIGN(entries * sizeof *cq->wc);
++ npages = nbytes >> PAGE_SHIFT;
++
++ err = ibscif_reserve_quota(&npages);
++ if (err)
++ goto out;
++
++ cq->wc = vzalloc(nbytes); /* Consider using vmalloc_user */
++ if (!cq->wc) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ cq->ibcq.cqe = nbytes / sizeof *cq->wc;
++
++ return &cq->ibcq;
++out:
++ ibscif_destroy_cq(&cq->ibcq);
++ return ERR_PTR(err);
++}
++
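++/*
++ * Resize by allocating a new ring, adjusting the page quota by the delta,
++ * and copying any outstanding completions from the old head to the start
++ * of the new ring while holding the CQ lock.
++ */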
++int ibscif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
++{
++ struct ibscif_cq *cq = to_cq(ibcq);
++ struct ibscif_wc *old_wc, *new_wc;
++ int nbytes, old_npages, new_npages, i, err;
++
++ if (cqe < 1 || cqe > MAX_CQ_SIZE)
++ return -EINVAL;
++
++ nbytes = PAGE_ALIGN(cqe * sizeof *cq->wc);
++ new_npages = nbytes >> PAGE_SHIFT;
++ old_npages = PAGE_ALIGN(ibcq->cqe * sizeof *cq->wc) >> PAGE_SHIFT;
++ new_npages -= old_npages;
++
++ if (new_npages == 0)
++ return 0;
++
++ if (new_npages > 0) {
++ err = ibscif_reserve_quota(&new_npages);
++ if (err)
++ return err;
++ }
++
++ new_wc = vzalloc(nbytes); /* Consider using vmalloc_user */
++ if (!new_wc) {
++ err = -ENOMEM;
++ goto out1;
++ }
++ cqe = nbytes / sizeof *cq->wc;
++ old_wc = cq->wc;
++
++ spin_lock_bh(&cq->lock);
++
++ if (cqe < cq->depth) {
++ err = -EBUSY;
++ goto out2;
++ }
++
++ for (i = 0; i < cq->depth; i++) {
++ new_wc[i] = old_wc[cq->head];
++ cq->head = (cq->head + 1) % ibcq->cqe;
++ }
++
++ cq->wc = new_wc;
++ cq->head = 0;
++ cq->tail = cq->depth;
++ ibcq->cqe = cqe;
++
++ spin_unlock_bh(&cq->lock);
++
++ if (old_wc)
++ vfree(old_wc);
++ if (new_npages < 0)
++ ibscif_release_quota(-new_npages);
++
++ return 0;
++out2:
++ spin_unlock_bh(&cq->lock);
++ vfree(new_wc);
++out1:
++ if (new_npages > 0)
++ ibscif_release_quota(new_npages);
++ return err;
++}
++
++int ibscif_destroy_cq(struct ib_cq *ibcq)
++{
++ struct ibscif_dev *dev = to_dev(ibcq->device);
++ struct ibscif_cq *cq = to_cq(ibcq);
++
++ tasklet_kill(&cq->tasklet);
++
++ if (cq->wc)
++ vfree(cq->wc);
++
++ ibscif_release_quota(PAGE_ALIGN(ibcq->cqe * sizeof *cq->wc) >> PAGE_SHIFT);
++
++ atomic_dec(&dev->cq_cnt);
++
++ kfree(cq);
++ return 0;
++}
++
++int ibscif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
++{
++ struct ibscif_cq *cq = to_cq(ibcq);
++ struct ibscif_wq *wq;
++ int i, reap;
++
++ /*
++ * The protocol layer holds WQ lock while processing a packet and acquires
++ * the CQ lock to append a work completion. To avoid a deadly embrace, do
++ * not hold the CQ lock when adjusting the WQ reap count.
++ */
++ for (i = 0; (i < num_entries) && cq->depth; i++) {
++
++ spin_lock_bh(&cq->lock);
++ entry[i] = cq->wc[cq->head].ibwc;
++ reap = cq->wc[cq->head].reap;
++ cq->depth--;
++ wq = cq->wc[cq->head].wq;
++ cq->head = (cq->head + 1) % ibcq->cqe;
++ spin_unlock_bh(&cq->lock);
++
++ /* WQ may no longer exist or has been flushed. */
++ if (wq) {
++ spin_lock_bh(&wq->lock);
++ wq->head = (wq->head + reap) % wq->size;
++ wq->depth -= reap;
++ wq->completions -= reap;
++ spin_unlock_bh(&wq->lock);
++ }
++ }
++
++ return i;
++}
++
++int ibscif_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify)
++{
++ struct ibscif_cq *cq = to_cq(ibcq);
++ int ret;
++
++ spin_lock_bh(&cq->lock);
++
++ cq->arm |= notify & IB_CQ_SOLICITED_MASK;
++
++ if (notify & IB_CQ_SOLICITED)
++ cq->solicited = 0;
++
++ ret = (notify & IB_CQ_REPORT_MISSED_EVENTS) && cq->depth;
++
++ spin_unlock_bh(&cq->lock);
++
++ return ret;
++}
++
++void ibscif_notify_cq(struct ibscif_cq *cq)
++{
++ if (!cq->arm || !cq->depth)
++ return;
++
++ spin_lock_bh(&cq->lock);
++ if ((cq->arm & IB_CQ_NEXT_COMP) || ((cq->arm & IB_CQ_SOLICITED) && cq->solicited)) {
++ cq->arm = 0; /* Disarm the CQ */
++ spin_unlock_bh(&cq->lock);
++ tasklet_hi_schedule(&cq->tasklet);
++ } else
++ spin_unlock_bh(&cq->lock);
++}
++
++void ibscif_clear_cqes(struct ibscif_cq *cq, struct ibscif_wq *wq)
++{
++ struct ibscif_wc *wc;
++ int i, j;
++
++ if (!cq)
++ return;
++
++ /*
++ * Walk the CQ work completions and clear pointers to the
++ * given WQ to prevent retiring WQEs when CQEs are polled.
++ */
++ spin_lock_bh(&cq->lock);
++ j = cq->head;
++ for (i = 0; i < cq->depth; i++) {
++ wc = &cq->wc[j];
++ if (wc->wq == wq)
++ wc->wq = NULL;
++ j = (j + 1) % cq->ibcq.cqe;
++ }
++ spin_unlock_bh(&cq->lock);
++}
++
++/*
++ * Acquire lock and reserve a completion queue entry.
++ * Note that cq->lock is held upon successful completion of this call.
++ * On error, WQs affiliated with this CQ should generate an event and
++ * transition to the error state; refer to IB Spec r1.2 C11-39 and C11-40.
++ */
++int ibscif_reserve_cqe(struct ibscif_cq *cq, struct ibscif_wc **wc)
++{
++ spin_lock_bh(&cq->lock);
++
++ if (cq->state != CQ_READY) {
++ spin_unlock_bh(&cq->lock);
++ return -EIO;
++ }
++ if (!cq->ibcq.cqe) {
++ spin_unlock_bh(&cq->lock);
++ return -ENOSPC;
++ }
++ if (cq->depth == cq->ibcq.cqe) {
++ cq->state = CQ_ERROR;
++ spin_unlock_bh(&cq->lock);
++
++ if (cq->ibcq.event_handler) {
++ struct ib_event record;
++ record.event = IB_EVENT_CQ_ERR;
++ record.device = cq->ibcq.device;
++ record.element.cq = &cq->ibcq;
++ cq->ibcq.event_handler(&record, cq->ibcq.cq_context);
++ }
++ return -ENOBUFS;
++ }
++
++ *wc = &cq->wc[cq->tail];
++
++ return 0;
++}
++
++/*
++ * Append a completion queue entry and release lock.
++ * Note that this function assumes that the cq->lock is currently held.
++ */
++void ibscif_append_cqe(struct ibscif_cq *cq, struct ibscif_wc *wc, int solicited)
++{
++ cq->solicited = !!(solicited || (wc->ibwc.status != IB_WC_SUCCESS));
++ cq->tail = (cq->tail + 1) % cq->ibcq.cqe;
++ cq->depth++;
++
++ spin_unlock_bh(&cq->lock);
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_driver.h a8/drivers/infiniband/hw/scif/ibscif_driver.h
+--- a7/drivers/infiniband/hw/scif/ibscif_driver.h 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_driver.h 2015-02-23 10:14:37.483809663 -0800
+@@ -0,0 +1,787 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef IBSCIF_DRIVER_H
++#define IBSCIF_DRIVER_H
++
++#include <linux/module.h>
++#include <linux/idr.h> /* for idr routines */
++#include <linux/kthread.h> /* for kthread routines */
++#include <linux/highmem.h> /* for kmap_atomic */
++#include <linux/pkt_sched.h> /* for TC_PRIO_CONTROL */
++#include <linux/if_arp.h> /* for ARPHRD_ETHER */
++#include <linux/swap.h> /* for totalram_pages */
++#include <linux/proc_fs.h> /* for proc_mkdir */
++#include <linux/version.h> /* for LINUX_VERSION_CODE */
++#include <linux/poll.h>
++#include <linux/workqueue.h>
++#include <linux/semaphore.h>
++
++/* these macros are defined in "linux/semaphore.h".
++ * however, they may be missing on older systems.
++ */
++#ifndef DECLARE_MUTEX
++#define DECLARE_MUTEX(name) \
++ struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
++#endif
++
++#ifndef init_MUTEX
++#define init_MUTEX(sem) sema_init(sem, 1)
++#endif
++
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,0)
++ #include <linux/interrupt.h>
++
++ #define KMAP_ATOMIC(x,y) kmap_atomic(x)
++ #define KUNMAP_ATOMIC(x,y) kunmap_atomic(x)
++#else
++ #define KMAP_ATOMIC(x,y) kmap_atomic(x, y)
++ #define KUNMAP_ATOMIC(x,y) kunmap_atomic(x, y)
++#endif
++
++#include <rdma/ib_umem.h>
++#include <rdma/ib_verbs.h>
++#include <rdma/ib_user_verbs.h>
++#include <rdma/iw_cm.h>
++
++#include <modules/scif.h>
++#include "ibscif_protocol.h"
++
++#define IBSCIF_MTU 4096
++
++#define IBSCIF_EP_TYPE_LISTEN 0
++#define IBSCIF_EP_TYPE_COMM 1
++
++#define DRV_NAME "ibscif"
++#define PFX DRV_NAME ": "
++#define IBDEV_PFX DRV_NAME ""
++#define DRV_DESC "OpenFabrics IBSCIF Driver"
++#define DRV_VERSION "0.1"
++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION
++#define DRV_BUILD " built " __DATE__ " " __TIME__
++
++#define UVERBS_ABI_VER 6
++#define VENDOR_ID 0x8086 /* Intel Corporation */
++#define DEVICE_ID 0
++#define HW_REV 1
++#define FW_REV IBSCIF_PROTOCOL_VER
++
++/*
++ * Attribute limits.
++ * These limits are imposed on client requests, however, the actual values
++ * returned may be larger than these limits on some objects due to rounding.
++ * The definitions are intended to show the thinking behind the values.
++ * E.g., MAX_PDS defined as MAX_QPS is intended to allow each QP to be
++ * on a separate PD, although that is not a usage requirement.
++ */
++#define MAX_QPS (64 * 1024)
++#define MAX_QP_SIZE (16 * 1024)
++#define MAX_CQS (MAX_QPS * 2) /* x2:send queues + recv queues */
++#define MAX_CQ_SIZE (MAX_QP_SIZE * 4) /* or combined */
++#define MAX_PDS MAX_QPS /* 1 per QP */
++#if 0
++#define MAX_MRS (MAX_QPS * 4) /* x4:local/remote,read/write */
++#else
++#define MAX_MRS 16383 /* limited by IBSCIF_MR_MAX_KEY */
++#endif
++#define MAX_MR_SIZE (2U * 1024 * 1024 * 1024)
++#define MAX_SGES (PAGE_SIZE / sizeof(struct ib_sge))
++#define MAX_OR (MAX_QP_SIZE / 2) /* half outbound reqs */
++#define MAX_IR MAX_OR /* balance inbound with outbound */
++
++extern int window_size;
++#define MIN_WINDOW_SIZE 4 /* Ack every window_size/MIN_WINDOW_SIZE packets */
++
++extern int rma_threshold;
++extern int fast_rdma;
++extern int blocking_send;
++extern int blocking_recv;
++extern int scif_loopback;
++extern int host_proxy;
++extern int new_ib_type;
++extern int verbose;
++extern int check_grh;
++
++extern struct list_head devlist;
++extern struct semaphore devlist_mutex;
++
++extern struct idr wiremap;
++extern rwlock_t wiremap_lock;
++
++extern struct ib_dma_mapping_ops ibscif_dma_mapping_ops;
++
++/* Match IB opcodes for copy in post_send; append driver specific values. */
++enum ibscif_wr_opcode {
++ WR_SEND = IB_WR_SEND,
++ WR_SEND_WITH_IMM = IB_WR_SEND_WITH_IMM,
++ WR_RDMA_WRITE = IB_WR_RDMA_WRITE,
++ WR_RDMA_WRITE_WITH_IMM = IB_WR_RDMA_WRITE_WITH_IMM,
++ WR_RDMA_READ = IB_WR_RDMA_READ,
++ WR_ATOMIC_CMP_AND_SWP = IB_WR_ATOMIC_CMP_AND_SWP,
++ WR_ATOMIC_FETCH_AND_ADD = IB_WR_ATOMIC_FETCH_AND_ADD,
++ WR_RDMA_READ_RSP,
++ WR_ATOMIC_RSP,
++ WR_RMA_RSP,
++ WR_UD,
++ NR_WR_OPCODES /* Must be last (for stats) */
++};
++
++struct ibscif_stats {
++ unsigned long packets_sent;
++ unsigned long packets_rcvd;
++ unsigned long bytes_sent;
++ unsigned long bytes_rcvd;
++ unsigned long duplicates;
++ unsigned long tx_errors;
++ unsigned long sched_exhaust;
++ unsigned long unavailable;
++ unsigned long loopback;
++ unsigned long recv;
++ unsigned long recv_imm;
++ unsigned long wr_opcode[NR_WR_OPCODES];
++ unsigned long fast_rdma_write;
++ unsigned long fast_rdma_read;
++ unsigned long fast_rdma_unavailable;
++ unsigned long fast_rdma_fallback;
++ unsigned long fast_rdma_force_ack;
++ unsigned long fast_rdma_tail_write;
++};
++
++#define DEV_STAT(dev, counter) dev->stats.counter
++
++#define IBSCIF_MAX_DEVICES 16
++#define IBSCIF_NAME_SIZE 12
++
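++/* Synthetic LIDs: node_id + 1000, so even node 0 (the host) maps to a
++ * usable unicast LID rather than the reserved LID 0. */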
++#define IBSCIF_NODE_ID_TO_LID(node_id) (node_id+1000)
++#define IBSCIF_LID_TO_NODE_ID(lid) (lid-1000)
++
++struct ibscif_conn {
++ struct list_head entry;
++ atomic_t refcnt;
++ scif_epd_t ep;
++ unsigned short remote_node_id;
++ union ib_gid remote_gid;
++ struct ibscif_dev *dev;
++ int local_close;
++ int remote_close;
++};
++
++struct ibscif_listen {
++ struct iw_cm_id *cm_id;
++ struct list_head entry;
++ struct kref kref;
++ int port;
++};
++
++#define IBSCIF_MAX_PDATA_SIZE 256
++struct ibscif_cm {
++ struct iw_cm_id *cm_id;
++ struct ibscif_conn *conn;
++ struct ibscif_listen *listen;
++ struct kref kref;
++ spinlock_t lock;
++ struct sockaddr_in local_addr;
++ struct sockaddr_in remote_addr;
++ unsigned short node_id;
++ unsigned short remote_node_id;
++ u32 qpn;
++ u32 remote_qpn;
++ int plen;
++ u8 pdata[IBSCIF_MAX_PDATA_SIZE];
++ u64 peer_context;
++};
++
++struct ibscif_dev {
++ struct ib_device ibdev;
++ struct net_device *netdev; /* for RDMA CM support */
++ struct list_head entry;
++
++ char name[IBSCIF_NAME_SIZE];
++ union ib_gid gid;
++ unsigned short node_id;
++ atomic_t refcnt;
++ scif_epd_t listen_ep;
++ struct list_head conn_list;
++ struct list_head mr_list;
++ struct semaphore mr_list_mutex;
++
++ struct proc_dir_entry *procfs;
++ struct ibscif_stats stats;
++
++ atomic_t pd_cnt;
++ atomic_t cq_cnt;
++ atomic_t qp_cnt;
++ atomic_t mr_cnt;
++
++ atomic_t available;
++ atomic_t was_new;
++
++ spinlock_t atomic_op;
++
++ struct semaphore mutex;
++ struct list_head wq_list; /* List of WQ's on this device */
++};
++
++struct ibscif_pd {
++ struct ib_pd ibpd;
++};
++
++struct ibscif_ah {
++ struct ib_ah ibah;
++ __be16 dlid;
++};
++
++struct ibscif_wc {
++ struct ib_wc ibwc;
++ int reap;
++ struct ibscif_wq *wq;
++};
++
++enum ibscif_cq_state {
++ CQ_READY,
++ CQ_ERROR
++};
++
++struct ibscif_cq {
++ struct ib_cq ibcq;
++ spinlock_t lock;
++ struct tasklet_struct tasklet;
++ enum ibscif_cq_state state;
++ enum ib_cq_notify_flags arm;
++ int solicited;
++ int head;
++ int tail;
++ int depth;
++ struct ibscif_wc *wc;
++};
++
++struct ibscif_ds {
++ struct ibscif_mr *mr;
++ u32 offset;
++ u32 length;
++ u32 lkey;
++ u32 in_use;
++ struct ibscif_mreg_info *current_mreg;
++};
++
++struct ibscif_segmentation {
++ struct ibscif_ds *current_ds;
++ u32 current_page_index;
++ u32 current_page_offset;
++ u32 wr_length_remaining;
++ u32 ds_length_remaining;
++ u32 starting_seq;
++ u32 next_seq;
++ u32 ending_seq;
++};
++
++struct ibscif_reassembly {
++ struct ibscif_ds *current_ds;
++ u32 current_ds_offset;
++ u32 last_packet_seq;
++ u32 last_seen_seq;
++ __be32 immediate_data;
++ int final_length;
++ u16 opcode;
++};
++
++struct ibscif_sar {
++ struct ibscif_segmentation seg;
++ struct ibscif_reassembly rea;
++};
++
++enum ibscif_wr_state {
++ WR_WAITING,
++ WR_STARTED,
++ WR_WAITING_FOR_ACK,
++ WR_WAITING_FOR_RSP,
++ WR_LAST_SEEN,
++ WR_COMPLETED
++};
++
++struct ibscif_wr {
++ u64 id;
++ enum ibscif_wr_opcode opcode;
++ int length;
++ enum ib_send_flags flags;
++
++ u32 msg_id;
++ enum ibscif_wr_state state;
++ struct ibscif_sar sar;
++ u32 use_rma;
++ u32 rma_id;
++
++ union {
++ struct ibscif_send {
++ u32 immediate_data;
++ } send;
++
++ struct ibscif_ud {
++ u16 remote_node_id;
++ u32 remote_qpn;
++ } ud;
++
++ struct ibscif_read {
++ u64 remote_address;
++ int remote_length;
++ u32 rkey;
++ } read;
++
++ struct ibscif_write {
++ u64 remote_address;
++ u32 rkey;
++ u32 immediate_data;
++ } write;
++
++ struct ibscif_cmp_swp {
++ u64 cmp_operand;
++ u64 swp_operand;
++ u64 remote_address;
++ u32 rkey;
++ } cmp_swp;
++
++ struct ibscif_fetch_add {
++ u64 add_operand;
++ u64 remote_address;
++ u32 rkey;
++ } fetch_add;
++
++ struct ibscif_atomic_rsp {
++ u64 orig_data;
++ u16 opcode;
++ } atomic_rsp;
++
++ struct ibscif_rma_rsp {
++ u32 xfer_length;
++ u32 error;
++ } rma_rsp;
++ };
++
++ u32 num_ds;
++ struct ibscif_ds ds_list[0]; /* Must be last */
++};
++
++struct ibscif_tx_state {
++ u32 next_seq;
++ u32 last_ack_seq_recvd;
++ u32 next_msg_id;
++};
++
++struct ibscif_rx_state {
++ u32 last_in_seq;
++ u32 last_seq_acked;
++ int defer_in_process;
++};
++
++struct ibscif_wirestate {
++ struct ibscif_tx_state tx;
++ struct ibscif_rx_state rx;
++};
++
++struct ibscif_wire {
++ struct ibscif_wirestate sq;
++ struct ibscif_wirestate iq;
++};
++
++struct ibscif_wq {
++ struct list_head entry;
++ struct ibscif_qp *qp;
++ spinlock_t lock;
++ struct ibscif_wr *wr;
++ int head;
++ int tail;
++ int depth;
++ int size;
++ int max_sge;
++ int wr_size;
++ int completions;
++ int reap;
++ int next_wr;
++ int next_msg_id;
++ struct ibscif_wirestate *wirestate;
++ int fast_rdma_completions;
++ int ud_msg_id;
++};
++
++enum ibscif_qp_state {
++ QP_IDLE,
++ QP_CONNECTED,
++ QP_DISCONNECT,
++ QP_ERROR,
++ QP_RESET,
++ QP_IGNORE,
++ NR_QP_STATES /* Must be last */
++};
++
++enum ibscif_schedule {
++ SCHEDULE_RESUME = 1 << 0,
++ SCHEDULE_RETRY = 1 << 1,
++ SCHEDULE_TIMEOUT = 1 << 2,
++ SCHEDULE_SQ = 1 << 6,
++ SCHEDULE_IQ = 1 << 7
++};
++
++struct ibscif_qp {
++ int magic; /* Must be first */
++# define QP_MAGIC 0x5b51505d /* "[QP]" */
++ struct kref ref;
++ struct completion done;
++ struct ib_qp ibqp;
++ struct ibscif_dev *dev;
++ enum ib_access_flags access;
++ enum ib_sig_type sq_policy;
++ enum ibscif_schedule schedule;
++ struct ibscif_wire wire;
++ int mtu;
++
++ int max_or;
++ atomic_t or_depth;
++ atomic_t or_posted;
++
++ struct semaphore modify_mutex;
++ spinlock_t lock;
++ enum ibscif_qp_state state;
++ u16 local_node_id;
++ u16 remote_node_id;
++ struct ibscif_conn *conn;
++ u32 remote_qpn;
++ int loopback;
++ struct ibscif_wq sq;
++ struct ibscif_wq rq;
++ struct ibscif_wq iq;
++ int in_scheduler;
++
++ struct ibscif_conn *ud_conn[IBSCIF_MAX_DEVICES];
++ struct ibscif_cm *cm_context;
++};
++
++#define is_sq(wq) (wq == &wq->qp->sq)
++#define is_rq(wq) (wq == &wq->qp->rq)
++#define is_iq(wq) (wq == &wq->qp->iq)
++
++/* Info about MR registered via SCIF API */
++struct ibscif_mreg_info {
++ struct list_head entry;
++ struct ibscif_conn *conn;
++ u64 offset;
++ u64 aligned_offset;
++ u32 aligned_length;
++};
++
++struct ibscif_mr {
++ int magic; /* Must be first */
++# define MR_MAGIC 0x5b4d525d /* "[MR]" */
++ struct list_head entry;
++ struct kref ref;
++ struct completion done;
++ struct ib_mr ibmr;
++ struct ib_umem *umem;
++ enum ib_access_flags access;
++ u64 addr;
++ u32 length;
++ int npages;
++ struct page **page;
++ scif_pinned_pages_t pinned_pages;
++ struct list_head mreg_list;
++};
++
++/* Canonical virtual addresses on x86_64 fall in the ranges 0x0000000000000000-0x00007fffffffffff
++ * and 0xffff800000000000-0xffffffffffffffff; the range 0x0000800000000000-0xffff7fffffffffff
++ * is unused. In other words, only 48 bits are significant and the highest 16 bits are just the
++ * sign extension. We can put the rkey into these 16 bits and use the result as the "offset" in
++ * SCIF's registered address space. By doing this, the SCIF_MAP_FIXED flag can be used so that
++ * the offset can be calculated directly from the rkey and virtual address without the "remote
++ * registration cache" mechanism.
++ *
++ * SCIF reserves the top 2 bits of the offset for internal use, leaving 14 bits for the rkey.
++ */
++#define IBSCIF_MR_MAX_KEY (0x3FFF)
++#define IBSCIF_MR_VADDR_MASK (0x0000FFFFFFFFFFFFUL)
++#define IBSCIF_MR_SIGN_MASK (0x0000800000000000UL)
++#define IBSCIF_MR_SIGN_EXT (0xFFFF000000000000UL)
++#define IBSCIF_MR_RKEY_MASK (0x3FFF000000000000UL)
++
++#define IBSCIF_MR_VADDR_TO_OFFSET(rkey, vaddr) ((((unsigned long)rkey) << 48) | \
++ (vaddr & IBSCIF_MR_VADDR_MASK))
++
++#define IBSCIF_MR_OFFSET_TO_VADDR(offset) ((offset & IBSCIF_MR_SIGN_MASK) ? \
++ (offset | IBSCIF_MR_SIGN_EXT) : \
++ (offset & IBSCIF_MR_VADDR_MASK))
++
++#define IBSCIF_MR_OFFSET_TO_RKEY(offset) ((offset & IBSCIF_MR_RKEY_MASK) >> 48)
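++/* Worked example (illustrative values): with rkey 0x1234 and vaddr 0x00007f80deadb000,
++ * IBSCIF_MR_VADDR_TO_OFFSET() yields 0x12347f80deadb000; IBSCIF_MR_OFFSET_TO_RKEY()
++ * recovers 0x1234, and IBSCIF_MR_OFFSET_TO_VADDR() recovers the original address because
++ * bit 47 is clear, so no sign extension is applied.
++ */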
++
++#define TO_OBJ(name, src, dst, field) \
++static inline struct dst *name(struct src *field) \
++{ \
++ return container_of(field, struct dst, field); \
++}
++TO_OBJ(to_dev, ib_device, ibscif_dev, ibdev)
++TO_OBJ(to_pd, ib_pd, ibscif_pd, ibpd)
++TO_OBJ(to_cq, ib_cq, ibscif_cq, ibcq)
++TO_OBJ(to_qp, ib_qp, ibscif_qp, ibqp)
++TO_OBJ(to_mr, ib_mr, ibscif_mr, ibmr)
++TO_OBJ(to_ah, ib_ah, ibscif_ah, ibah)
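++/* For reference, TO_OBJ(to_pd, ib_pd, ibscif_pd, ibpd) expands to:
++ *
++ *	static inline struct ibscif_pd *to_pd(struct ib_pd *ibpd)
++ *	{
++ *		return container_of(ibpd, struct ibscif_pd, ibpd);
++ *	}
++ *
++ * i.e. each helper recovers the driver object that embeds the ib_* core structure.
++ */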
++
++#define OBJ_GET(obj, type) \
++static inline struct ibscif_##obj *ibscif_get_##obj(int id) \
++{ \
++ struct ibscif_##obj *obj; \
++ read_lock_bh(&wiremap_lock); \
++ obj = idr_find(&wiremap, id); \
++ if (likely(obj)) { \
++ if (likely(obj->magic == type)) \
++ kref_get(&obj->ref); \
++ else \
++ obj = ERR_PTR(-ENXIO); \
++ } else \
++ obj = ERR_PTR(-ENOENT); \
++ read_unlock_bh(&wiremap_lock); \
++ return obj; \
++}
++OBJ_GET(mr, MR_MAGIC)
++OBJ_GET(qp, QP_MAGIC)
++
++void ibscif_complete_mr(struct kref *kref);
++void ibscif_complete_qp(struct kref *kref);
++
++#define OBJ_PUT(obj) \
++static inline void ibscif_put_##obj(struct ibscif_##obj *obj) \
++{ \
++ if (likely(obj)) \
++ kref_put(&obj->ref, ibscif_complete_##obj); \
++}
++OBJ_PUT(mr)
++OBJ_PUT(qp)
++
++#define RHEL61_AND_ABOVE 0
++#if defined(RHEL_MAJOR) && defined(RHEL_MINOR)
++#if (RHEL_MAJOR==6) && (RHEL_MINOR>0)
++#undef RHEL61_AND_ABOVE
++#define RHEL61_AND_ABOVE 1
++#endif
++#endif
++
++#if (LINUX_VERSION_CODE<KERNEL_VERSION(2,6,37)) && ! RHEL61_AND_ABOVE
++static inline void *vzalloc(unsigned long size)
++{
++ void *addr = vmalloc(size);
++ if (addr)
++ memset(addr, 0, size);
++ return addr;
++}
++#endif
++
++/* This function assumes the WQ is protected by a lock. */
++static inline struct ibscif_wr *ibscif_get_wr(struct ibscif_wq *wq, int index)
++{
++ /* Must calculate because WQ array elements are variable sized. */
++ return (struct ibscif_wr *)((void *)wq->wr + (wq->wr_size * index));
++}
++
++/* This function assumes the WQ is protected by a lock. */
++static inline void ibscif_append_wq(struct ibscif_wq *wq)
++{
++ wq->tail = (wq->tail + 1) % wq->size;
++ wq->depth++;
++ wq->next_msg_id++;
++}
++
++static inline void ibscif_clear_ds_ref(struct ibscif_ds *ds)
++{
++ if (ds->in_use) {
++ ds->in_use = 0;
++ ibscif_put_mr(ds->mr);
++ }
++}
++
++static inline void ibscif_clear_ds_refs(struct ibscif_ds *ds, int num_ds)
++{
++ while(num_ds--)
++ ibscif_clear_ds_ref(ds++);
++}
++
++static inline enum ib_wc_opcode to_ib_wc_opcode(enum ib_wr_opcode opcode)
++{
++ /* SQ only - RQ is either IB_WC_RECV or IB_WC_RECV_RDMA_WITH_IMM. */
++ switch (opcode) {
++ case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE;
++ case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE;
++ case IB_WR_SEND: return IB_WC_SEND;
++ case IB_WR_SEND_WITH_IMM: return IB_WC_SEND;
++ case IB_WR_RDMA_READ: return IB_WC_RDMA_READ;
++ case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP;
++ case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD;
++ default: return -1;
++ }
++}
++
++static inline void *ibscif_map_src(struct page *page)
++{
++ return KMAP_ATOMIC(page, KM_SOFTIRQ0);
++}
++
++static inline void *ibscif_map_dst(struct page *page)
++{
++ return KMAP_ATOMIC(page, KM_SOFTIRQ1);
++}
++
++static inline void ibscif_unmap_src(struct page *page, void *addr)
++{
++ if (likely(addr))
++ KUNMAP_ATOMIC(addr, KM_SOFTIRQ0);
++}
++
++static inline void ibscif_unmap_dst(struct page *page, void *addr)
++{
++ if (likely(addr))
++ KUNMAP_ATOMIC(addr, KM_SOFTIRQ1);
++ if (likely(page)) {
++ flush_dcache_page(page);
++ if (!PageReserved(page))
++ set_page_dirty(page);
++ }
++}
++
++#ifdef IBSCIF_PERF_TEST
++#define IBSCIF_PERF_SAMPLE(counter,next) ibscif_perf_sample(counter,next)
++#else
++#define IBSCIF_PERF_SAMPLE(counter,next)
++#endif
++
++int ibscif_atomic_copy(void *dst_addr, void *src_addr, u32 copy_len, int head_copied);
++
++int ibscif_wiremap_add(void *obj, int *id);
++void ibscif_wiremap_del(int id);
++
++int ibscif_dev_init(void);
++void ibscif_protocol_init_pre(void);
++void ibscif_protocol_init_post(void);
++
++void ibscif_dev_cleanup(void);
++void ibscif_protocol_cleanup(void);
++
++int ibscif_procfs_add_dev(struct ibscif_dev *dev);
++void ibscif_procfs_remove_dev(struct ibscif_dev *dev);
++
++int ibscif_reserve_quota(int *npages);
++void ibscif_release_quota(int npages);
++
++void ibscif_scheduler_add_qp(struct ibscif_qp *qp);
++void ibscif_scheduler_remove_qp(struct ibscif_qp *qp);
++void ibscif_schedule(struct ibscif_wq *wq);
++
++struct ib_ah *ibscif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr);
++int ibscif_destroy_ah(struct ib_ah *ibah);
++
++struct ib_pd *ibscif_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata);
++int ibscif_dealloc_pd(struct ib_pd *ibpd);
++
++struct ib_qp *ibscif_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata);
++int ibscif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr);
++int ibscif_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata);
++int ibscif_destroy_qp(struct ib_qp *ibqp);
++void ibscif_qp_internal_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason);
++void ibscif_qp_remote_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason);
++void ibscif_qp_add_ud_conn(struct ibscif_qp *qp, struct ibscif_conn *conn);
++
++#ifdef MOFED
++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr,
++ struct ib_ucontext *context, struct ib_udata *udata);
++#else
++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
++ struct ib_ucontext *context, struct ib_udata *udata);
++#endif
++int ibscif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
++int ibscif_destroy_cq(struct ib_cq *ibcq);
++int ibscif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
++int ibscif_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify);
++void ibscif_notify_cq(struct ibscif_cq *cq);
++void ibscif_clear_cqes(struct ibscif_cq *cq, struct ibscif_wq *wq);
++int ibscif_reserve_cqe(struct ibscif_cq *cq, struct ibscif_wc **wc);
++void ibscif_append_cqe(struct ibscif_cq *cq, struct ibscif_wc *wc, int solicited);
++
++struct ib_mr *ibscif_get_dma_mr(struct ib_pd *ibpd, int access);
++struct ib_mr *ibscif_reg_phys_mr(struct ib_pd *ibpd, struct ib_phys_buf *phys_buf_array,
++ int num_phys_buf, int access, u64 *iova_start);
++#ifdef MOFED
++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
++ u64 virt_addr, int access, struct ib_udata *udata, int mr_id);
++#else
++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
++ u64 virt_addr, int access, struct ib_udata *udata);
++#endif
++int ibscif_dereg_mr(struct ib_mr *ibmr);
++struct ibscif_mr *ibscif_validate_mr(u32 key, u64 addr, int length,
++ struct ib_pd *ibpd, enum ib_access_flags access);
++struct ibscif_mreg_info *ibscif_mr_get_mreg(struct ibscif_mr *mr, struct ibscif_conn *conn);
++void ibscif_refresh_mreg( struct ibscif_conn *conn );
++
++int ibscif_post_send(struct ib_qp *ibqp, struct ib_send_wr *ibwr, struct ib_send_wr **bad_wr);
++int ibscif_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ibwr, struct ib_recv_wr **bad_wr);
++
++void ibscif_send_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason);
++void ibscif_send_close(struct ibscif_conn *conn);
++void ibscif_send_reopen(struct ibscif_conn *conn);
++
++void ibscif_loopback_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason);
++void ibscif_loopback(struct ibscif_wq *sq);
++
++int ibscif_xmit_wr(struct ibscif_wq *wq, struct ibscif_wr *wr, int tx_limit, int retransmit,
++ u32 from_seq, u32 *posted);
++int ibscif_process_sq_completions(struct ibscif_qp *qp);
++
++struct ibscif_conn *ibscif_get_conn( int node_id, int remote_node_id, int find_local_peer );
++void ibscif_put_conn( struct ibscif_conn *conn );
++void ibscif_do_accept(struct ibscif_dev *dev);
++void ibscif_get_pollep_list(struct scif_pollepd *polleps, struct ibscif_dev **devs,
++ int *types, struct ibscif_conn **conns, int *count);
++void ibscif_refresh_pollep_list(void);
++void ibscif_get_ep_list(scif_epd_t *eps, int *count);
++void ibscif_remove_ep(struct ibscif_dev *dev, scif_epd_t ep);
++void ibscif_free_conn(struct ibscif_conn *conn);
++int ibscif_cleanup_idle_conn( void );
++void ibscif_perf_sample(int counter, int next);
++
++int ibscif_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
++int ibscif_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
++int ibscif_cm_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
++int ibscif_cm_create_listen(struct iw_cm_id *cm_id, int backlog);
++int ibscif_cm_destroy_listen(struct iw_cm_id *cm_id);
++struct ib_qp *ibscif_cm_get_qp(struct ib_device *ibdev, int qpn);
++void ibscif_cm_add_ref(struct ib_qp *ibqp);
++void ibscif_cm_rem_ref(struct ib_qp *ibqp);
++void ibscif_cm_async_callback(void *cm_context);
++int ibscif_process_cm_skb(struct sk_buff *skb, struct ibscif_conn *conn);
++int ibscif_send_cm_req(struct ibscif_cm *cm_ctx);
++int ibscif_send_cm_rep(struct ibscif_cm *cm_ctx);
++int ibscif_send_cm_rej(struct ibscif_cm *cm_ctx, const void *pdata, u8 plen);
++int ibscif_send_cm_rtu(struct ibscif_cm *cm_ctx);
++
++#endif /* IBSCIF_DRIVER_H */
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_loopback.c a8/drivers/infiniband/hw/scif/ibscif_loopback.c
+--- a7/drivers/infiniband/hw/scif/ibscif_loopback.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_loopback.c 2015-02-23 10:14:37.484809663 -0800
+@@ -0,0 +1,582 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++struct ibscif_seg {
++ enum ib_access_flags access;
++ struct ibscif_ds *ds;
++ struct ibscif_mr *mr;
++ struct page **page;
++ void *addr;
++ u32 offset;
++ u32 ds_len;
++ u32 pg_len;
++ void *(*map)(struct page *page);
++ void (*unmap)(struct page *page, void *addr);
++};
++
++static void ibscif_seg_init(struct ibscif_seg *seg, struct ibscif_ds *ds,
++ void *(*map)(struct page *page), void (*unmap)(struct page *page, void *addr),
++ enum ib_access_flags access)
++{
++ memset(seg, 0, sizeof *seg);
++ seg->ds = ds;
++ seg->map = map;
++ seg->unmap = unmap;
++ seg->access = access;
++}
++
++static void ibscif_seg_fini(struct ibscif_seg *seg)
++{
++ seg->unmap(*seg->page, seg->addr);
++ if (likely(seg->mr))
++ ibscif_put_mr(seg->mr);
++}
++
++static int ibscif_seg_set(struct ibscif_seg *seg, u32 length, u32 copy_len)
++{
++ struct page **prev_page;
++
++ if (!seg->ds_len) {
++
++ if (seg->mr)
++ ibscif_put_mr(seg->mr);
++
++ seg->mr = ibscif_get_mr(seg->ds->lkey);
++ if (unlikely(IS_ERR(seg->mr)))
++ return PTR_ERR(seg->mr);
++
++ if (unlikely(seg->access && !(seg->mr->access & seg->access)))
++ return -EACCES;
++
++ prev_page = seg->page;
++ seg->offset = seg->ds->offset + (seg->mr->addr & ~PAGE_MASK);
++ seg->page = &seg->mr->page[seg->offset >> PAGE_SHIFT];
++ seg->offset &= ~PAGE_MASK;
++ seg->ds_len = seg->ds->length;
++ seg->pg_len = min(seg->ds_len, (u32)PAGE_SIZE - seg->offset);
++ seg->pg_len = min(seg->pg_len, length);
++
++ if (seg->page != prev_page)
++ seg->addr = seg->map(*seg->page) + seg->offset;
++
++ seg->ds++;
++
++ } else if (!seg->pg_len) {
++
++ seg->unmap(*seg->page, seg->addr);
++
++ seg->page++;
++ seg->addr = seg->map(*seg->page);
++ seg->pg_len = min(seg->ds_len, (u32)PAGE_SIZE);
++ seg->pg_len = min(seg->pg_len, length);
++ } else
++ seg->addr += copy_len;
++
++ return 0;
++}
++
++static inline int ibscif_seg_copy(struct ibscif_seg *dst, struct ibscif_seg *src, u32 length, int head_copied)
++{
++ src->ds_len -= length;
++ src->pg_len -= length;
++
++ dst->ds_len -= length;
++ dst->pg_len -= length;
++
++ return ibscif_atomic_copy(dst->addr, src->addr, length, head_copied);
++}
++
++/*
++ * Copy data from the source to the destination data segment list.
++ * This is a bit complicated since we must map and copy each page
++ * individually and because each data segment can be split across
++ * multiple pages within the memory region as illustrated below:
++ *
++ * +---page---+ +---page---+ +---page---+
++ * | .~~mr~~~|~~~|~~~~~~~~~~|~~~|~~~~~~. |
++ * | | | | [==ds===|===|====] | |
++ * | '~~~~~~~|~~~|~~~~~~~~~~|~~~|~~~~~~' |
++ * +----------+ +----------+ +----------+
++ *
++ * For example, due to different buffer page offsets, copying data
++ * between the following buffers will result in five separate copy
++ * operations as shown by the numeric labels below:
++ *
++ * +----------+ +----------+
++ * | | | |
++ * |1111111111| | |
++ * |2222222222| |1111111111|
++ * +----------+ +----------+
++ *
++ * +----------+ +----------+
++ * |3333333333| |2222222222|
++ * |3333333333| |3333333333|
++ * |4444444444| |3333333333|
++ * +----------+ +----------+
++ *
++ * +----------+ +----------+
++ * |5555555555| |4444444444|
++ * | | |5555555555|
++ * | | | |
++ * +----------+ +----------+
++ *
++ * The source and destination data segment list lengths are
++ * assumed to have been validated outside of this function.
++ */
++static int ibscif_dscopy(struct ibscif_ds *dst_ds, struct ibscif_ds *src_ds, u32 length)
++{
++ struct ibscif_seg src, dst;
++ int head_copied;
++ u32 copy_len;
++ int err = 0;
++
++ ibscif_seg_init(&src, src_ds, ibscif_map_src, ibscif_unmap_src, 0);
++ ibscif_seg_init(&dst, dst_ds, ibscif_map_dst, ibscif_unmap_dst, IB_ACCESS_LOCAL_WRITE);
++
++ head_copied = 0;
++ for (copy_len = 0; length; length -= copy_len) {
++
++ err = ibscif_seg_set(&src, length, copy_len);
++ if (unlikely(err))
++ break;
++ err = ibscif_seg_set(&dst, length, copy_len);
++ if (unlikely(err))
++ break;
++
++ copy_len = min(src.pg_len, dst.pg_len);
++ head_copied = ibscif_seg_copy(&dst, &src, copy_len, head_copied);
++ }
++
++ ibscif_seg_fini(&src);
++ ibscif_seg_fini(&dst);
++
++ return err;
++}
++
++/* Hold sq->lock during this call for synchronization. */
++static int ibscif_complete_sq_wr(struct ibscif_wq *sq, struct ibscif_wr *send_wr, enum ib_wc_status status)
++{
++ struct ibscif_qp *qp = sq->qp;
++ struct ibscif_wc *wc;
++ int err;
++
++ ibscif_clear_ds_refs(send_wr->ds_list, send_wr->num_ds);
++ sq->completions++;
++ sq->reap++;
++
++ if (send_wr->flags & IB_SEND_SIGNALED) {
++ struct ibscif_cq *cq = to_cq(qp->ibqp.send_cq);
++
++ err = ibscif_reserve_cqe(cq, &wc);
++ if (unlikely(err))
++ return err;
++
++ wc->ibwc.qp = &qp->ibqp;
++ wc->ibwc.src_qp = qp->remote_qpn;
++ wc->ibwc.wr_id = send_wr->id;
++ wc->ibwc.opcode = to_ib_wc_opcode(send_wr->opcode);
++ wc->ibwc.status = status;
++ wc->ibwc.ex.imm_data = 0;
++ wc->ibwc.port_num = 1;
++
++ if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_RDMA_READ)
++ wc->ibwc.byte_len = send_wr->read.remote_length;
++ else if (((enum ib_wr_opcode)send_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) ||
++ ((enum ib_wr_opcode)send_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD))
++ wc->ibwc.byte_len = sizeof send_wr->atomic_rsp.orig_data;
++ else
++ wc->ibwc.byte_len = send_wr->length;
++
++ wc->wq = sq;
++ wc->reap = sq->reap;
++ sq->reap = 0;
++
++ ibscif_append_cqe(cq, wc, 0);
++ }
++
++ return 0;
++}
++
++/* Hold rq->lock during this call for synchronization. */
++static int ibscif_complete_rq_wr(struct ibscif_wq *rq, struct ibscif_wr *recv_wr,
++ struct ibscif_wr *send_wr, enum ib_wc_status status)
++{
++ struct ibscif_qp *qp = rq->qp;
++ struct ibscif_cq *cq = to_cq(qp->ibqp.recv_cq);
++ struct ibscif_wc *wc;
++ int err;
++
++ ibscif_clear_ds_refs(recv_wr->ds_list, recv_wr->num_ds);
++
++ err = ibscif_reserve_cqe(cq, &wc);
++ if (unlikely(err))
++ return err;
++
++ wc->ibwc.qp = &qp->ibqp;
++ wc->ibwc.src_qp = qp->remote_qpn;
++ wc->ibwc.wr_id = recv_wr->id;
++ wc->ibwc.status = status;
++ wc->ibwc.byte_len = send_wr->length;
++ wc->ibwc.port_num = 1;
++
++ if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_SEND_WITH_IMM) {
++ DEV_STAT(qp->dev, recv_imm++);
++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
++ wc->ibwc.ex.imm_data = cpu_to_be32(send_wr->send.immediate_data);
++ } else if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
++ DEV_STAT(qp->dev, recv_imm++);
++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
++ wc->ibwc.ex.imm_data = cpu_to_be32(send_wr->write.immediate_data);
++ } else {
++ DEV_STAT(qp->dev, recv++);
++ wc->ibwc.opcode = IB_WC_RECV;
++ wc->ibwc.ex.imm_data = 0;
++ }
++
++ wc->wq = rq;
++ wc->reap = 1;
++ rq->completions++;
++
++ ibscif_append_cqe(cq, wc, !!(send_wr->flags & IB_SEND_SOLICITED));
++
++ return 0;
++}
++
++/* Hold wq lock during this call for synchronization. */
++static int ibscif_validate_wq(struct ibscif_wq *wq, struct ibscif_wr **wr, enum ib_access_flags access)
++{
++ if (unlikely(wq->qp->state != QP_CONNECTED))
++ return -ENOTCONN;
++
++ if (unlikely(access && !(wq->qp->access & access)))
++ return -EACCES;
++
++ if (wr) {
++ int next;
++
++ if (unlikely(!wq->size))
++ return -ENOSPC;
++
++ next = (wq->head + wq->completions) % wq->size;
++
++ if (unlikely(next == wq->tail))
++ return -ENOBUFS;
++
++ *wr = ibscif_get_wr(wq, next);
++ }
++
++ return 0;
++}
++
++static int ibscif_loopback_send(struct ibscif_wq *sq, struct ibscif_wq *rq, struct ibscif_wr *send_wr)
++{
++ struct ibscif_wr *recv_wr;
++ int err;
++
++ spin_lock_bh(&rq->lock);
++
++ err = ibscif_validate_wq(rq, &recv_wr, 0);
++ if (unlikely(err))
++ goto out;
++
++ if (likely(send_wr->length)) {
++ if (unlikely(send_wr->length > recv_wr->length)) {
++ err = -EMSGSIZE;
++ goto out;
++ }
++
++ err = ibscif_dscopy(recv_wr->ds_list, send_wr->ds_list, send_wr->length);
++ if (unlikely(err))
++ goto out;
++ }
++
++ err = ibscif_complete_rq_wr(rq, recv_wr, send_wr, IB_WC_SUCCESS);
++out:
++ spin_unlock_bh(&rq->lock);
++
++ return err;
++}
++
++static int ibscif_loopback_write(struct ibscif_wq *sq, struct ibscif_wq *rq, struct ibscif_wr *write_wr)
++{
++ struct ibscif_wr *recv_wr = NULL;
++ struct ibscif_mr *dst_mr = ERR_PTR(-ENOENT);
++ int err;
++
++ spin_lock_bh(&rq->lock);
++
++ err = ibscif_validate_wq(rq, ((enum ib_wr_opcode)write_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ?
++ &recv_wr : NULL, IB_ACCESS_REMOTE_WRITE);
++ if (unlikely(err))
++ goto out;
++
++ if (likely(write_wr->length)) {
++ struct ibscif_ds dst_ds;
++
++ dst_mr = ibscif_validate_mr(write_wr->write.rkey, write_wr->write.remote_address,
++ write_wr->length, rq->qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE);
++ if (unlikely(IS_ERR(dst_mr))) {
++ err = PTR_ERR(dst_mr);
++ goto out;
++ }
++
++ dst_ds.mr = dst_mr;
++ dst_ds.offset = write_wr->write.remote_address - dst_mr->addr;
++ dst_ds.length = write_wr->length;
++ dst_ds.lkey = dst_mr->ibmr.lkey;
++
++ err = ibscif_dscopy(&dst_ds, write_wr->ds_list, dst_ds.length);
++ if (unlikely(err))
++ goto out;
++ } else
++ err = 0;
++
++ if (recv_wr)
++ err = ibscif_complete_rq_wr(rq, recv_wr, write_wr, IB_WC_SUCCESS);
++out:
++ if (likely(!IS_ERR(dst_mr)))
++ ibscif_put_mr(dst_mr);
++
++ spin_unlock_bh(&rq->lock);
++
++ return err;
++}
++
++static int ibscif_loopback_read(struct ibscif_wq *sq, struct ibscif_wq *iq, struct ibscif_wr *read_wr)
++{
++ struct ibscif_mr *src_mr = ERR_PTR(-ENOENT);
++ int err;
++
++ spin_lock_bh(&iq->lock);
++
++ err = ibscif_validate_wq(iq, NULL, IB_ACCESS_REMOTE_READ);
++ if (unlikely(err))
++ goto out;
++
++ if (!iq->size) {
++ err = -ENOBUFS;
++ goto out;
++ }
++
++ if (likely(read_wr->read.remote_length)) {
++ struct ibscif_ds src_ds;
++
++ src_mr = ibscif_validate_mr(read_wr->read.rkey, read_wr->read.remote_address,
++ read_wr->read.remote_length, iq->qp->ibqp.pd,
++ IB_ACCESS_REMOTE_READ);
++ if (unlikely(IS_ERR(src_mr))) {
++ err = PTR_ERR(src_mr);
++ goto out;
++ }
++
++ src_ds.mr = src_mr;
++ src_ds.offset = read_wr->read.remote_address - src_mr->addr;
++ src_ds.length = read_wr->read.remote_length;
++ src_ds.lkey = src_mr->ibmr.lkey;
++
++ err = ibscif_dscopy(read_wr->ds_list, &src_ds, src_ds.length);
++ } else
++ err = 0;
++out:
++ if (likely(!IS_ERR(src_mr)))
++ ibscif_put_mr(src_mr);
++
++ spin_unlock_bh(&iq->lock);
++
++ atomic_dec(&sq->qp->or_posted);
++
++ return err;
++}
++
++static int ibscif_loopback_atomic(struct ibscif_wq *sq, struct ibscif_wq *iq, struct ibscif_wr *atomic_wr)
++{
++ struct ibscif_mr *src_mr = ERR_PTR(-ENOENT);
++ struct ibscif_ds src_ds;
++ struct page *src_page;
++ u64 *src_addr, addr;
++ u32 src_offset, rkey;
++ int err;
++
++ if ((enum ib_wr_opcode)atomic_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
++ addr = atomic_wr->cmp_swp.remote_address;
++ rkey = atomic_wr->cmp_swp.rkey;
++ } else {
++ addr = atomic_wr->fetch_add.remote_address;
++ rkey = atomic_wr->fetch_add.rkey;
++ }
++
++ spin_lock_bh(&iq->lock);
++
++ err = ibscif_validate_wq(iq, NULL, IB_ACCESS_REMOTE_ATOMIC);
++ if (unlikely(err))
++ goto out;
++
++ if (!iq->size) {
++ err = -ENOBUFS;
++ goto out;
++ }
++
++ src_mr = ibscif_validate_mr(rkey, addr, sizeof atomic_wr->atomic_rsp.orig_data,
++ iq->qp->ibqp.pd, IB_ACCESS_REMOTE_ATOMIC);
++ if (unlikely(IS_ERR(src_mr))) {
++ err = PTR_ERR(src_mr);
++ goto out;
++ }
++
++ /* Build a source data segment to copy the original data. */
++ src_ds.mr = src_mr;
++ src_ds.offset = addr - src_mr->addr;
++ src_ds.length = sizeof atomic_wr->atomic_rsp.orig_data;
++ src_ds.lkey = src_mr->ibmr.lkey;
++
++ /* Determine which page to map. */
++ src_offset = src_ds.offset + (src_mr->addr & ~PAGE_MASK);
++ src_page = src_mr->page[src_offset >> PAGE_SHIFT];
++ src_offset &= ~PAGE_MASK;
++
++ /* Lock to perform the atomic operation atomically. */
++ spin_lock_bh(&iq->qp->dev->atomic_op);
++
++ /* Copy the original data; this handles any ds_list crossing. */
++ err = ibscif_dscopy(atomic_wr->ds_list, &src_ds, sizeof atomic_wr->atomic_rsp.orig_data);
++ if (likely(!err)) {
++ src_addr = ibscif_map_src(src_page) + src_offset;
++ if ((enum ib_wr_opcode)atomic_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
++ *src_addr += atomic_wr->fetch_add.add_operand;
++ else if (*src_addr == atomic_wr->cmp_swp.cmp_operand)
++ *src_addr = atomic_wr->cmp_swp.swp_operand;
++ ibscif_unmap_src(src_page, src_addr);
++ }
++
++ /* Atomic operation is complete. */
++ spin_unlock_bh(&iq->qp->dev->atomic_op);
++out:
++ if (likely(!IS_ERR(src_mr)))
++ ibscif_put_mr(src_mr);
++
++ spin_unlock_bh(&iq->lock);
++
++ atomic_dec(&sq->qp->or_posted);
++
++ return err;
++}
++
++void ibscif_loopback_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ struct ibscif_qp *remote_qp;
++
++ remote_qp = ibscif_get_qp(qp->remote_qpn);
++ if (unlikely(IS_ERR(remote_qp)))
++ return;
++
++ /* Don't bother if the SQ is connected to the RQ on the same QP. */
++ if (remote_qp != qp)
++ ibscif_qp_remote_disconnect(remote_qp, reason);
++
++ ibscif_put_qp(remote_qp);
++}
++
++/*
++ * Loopback QPs connected through the same MAC address.
++ * This includes an SQ connected to the RQ on the same QP.
++ */
++void ibscif_loopback(struct ibscif_wq *sq)
++{
++ struct ibscif_wq *rq, *iq;
++ struct ibscif_qp *remote_qp;
++ struct ibscif_wr *wr;
++ int status = 0, err = 0;
++
++ BUG_ON(!is_sq(sq));
++
++again:
++ remote_qp = ibscif_get_qp(sq->qp->remote_qpn);
++ if (unlikely(IS_ERR(remote_qp))) {
++ ibscif_qp_remote_disconnect(sq->qp, IBSCIF_REASON_INVALID_QP);
++ return;
++ }
++ rq = &remote_qp->rq;
++ iq = &remote_qp->iq;
++
++ DEV_STAT(sq->qp->dev, loopback++);
++
++ spin_lock_bh(&sq->lock);
++ for (wr = ibscif_get_wr(sq, sq->next_wr);
++ (sq->next_wr != sq->tail) && !err;
++ sq->next_wr = (sq->next_wr + 1) % sq->size) {
++
++ switch (wr->opcode) {
++
++ case WR_SEND:
++ case WR_SEND_WITH_IMM:
++ status = ibscif_loopback_send(sq, rq, wr);
++ break;
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ status = ibscif_loopback_write(sq, rq, wr);
++ break;
++ case WR_RDMA_READ:
++ status = ibscif_loopback_read(sq, iq, wr);
++ break;
++ case WR_ATOMIC_CMP_AND_SWP:
++ case WR_ATOMIC_FETCH_AND_ADD:
++ status = ibscif_loopback_atomic(sq, iq, wr);
++ break;
++ default:
++ status = -ENOSYS;
++ break;
++ }
++
++ if (likely(!status)) {
++ err = ibscif_complete_sq_wr(sq, wr, IB_WC_SUCCESS);
++
++ spin_unlock_bh(&sq->lock);
++ ibscif_notify_cq(to_cq(sq->qp->ibqp.send_cq));
++ ibscif_notify_cq(to_cq(remote_qp->ibqp.recv_cq));
++ spin_lock_bh(&sq->lock);
++ } else
++ break;
++ }
++ spin_unlock_bh(&sq->lock);
++
++ if (unlikely(status) && status != -ENOBUFS)
++ ibscif_qp_remote_disconnect(sq->qp, IBSCIF_REASON_QP_FATAL);
++ else if (unlikely(err))
++ ibscif_qp_internal_disconnect(sq->qp, IBSCIF_REASON_QP_FATAL);
++
++ ibscif_put_qp(remote_qp);
++
++ if (status == -ENOBUFS) {
++ schedule();
++ goto again;
++ }
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_main.c a8/drivers/infiniband/hw/scif/ibscif_main.c
+--- a7/drivers/infiniband/hw/scif/ibscif_main.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_main.c 2015-02-23 10:14:37.484809663 -0800
+@@ -0,0 +1,357 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static const char ibscif_signon[] = DRV_SIGNON DRV_BUILD;
++
++MODULE_AUTHOR("Intel Corporation");
++MODULE_LICENSE("Dual BSD/GPL");
++MODULE_DESCRIPTION(DRV_DESC);
++MODULE_VERSION(DRV_VERSION);
++
++#define MODULE_PARAM(type, name, value, desc) \
++ type name = value; \
++ module_param(name, type, 0664); \
++ MODULE_PARM_DESC(name, desc)
++
++#define MODULE_ARRAY(name, size, value, desc) \
++ unsigned int name##_argc; \
++ char *name[size] = { [0 ... size-1] = value }; \
++ module_param_array(name, charp, &name##_argc, 0644); \
++ MODULE_PARM_DESC(name, desc)
++
++#define DEFAULT_MAX_PINNED 50
++MODULE_PARAM(int, max_pinned, DEFAULT_MAX_PINNED,
++ "Maximum percent of physical memory that may be pinned");
++
++#define DEFAULT_WINDOW_SIZE 40
++MODULE_PARAM(int, window_size, DEFAULT_WINDOW_SIZE,
++ "Maximum number of outstanding unacknowledged packets");
++
++#define DEFAULT_RMA_THRESHOLD 1024
++MODULE_PARAM(int, rma_threshold, DEFAULT_RMA_THRESHOLD,
++ "Maximum message size sent through scif_send()");
++
++MODULE_PARAM(int, fast_rdma, 1,
++ "Use scif_writeto()/scif_readfrom() directly for RDMA write/read");
++
++MODULE_PARAM(int, blocking_send, 0,
++ "Use blocking version of scif_send()");
++
++MODULE_PARAM(int, blocking_recv, 1,
++ "Use blocking version of scif_recv()");
++
++MODULE_PARAM(int, scif_loopback, 1,
++ "Use SCIF loopback instead of kernel-copy-based loopback");
++
++MODULE_PARAM(int, host_proxy, 0,
++ "Proxy card-side RDMA operations to the host");
++
++#if ((LINUX_VERSION_CODE>=KERNEL_VERSION(3,5,0)) || CONFIG_MK1OM || CONFIG_ML1OM)
++#define USE_NEW_IB_TYPE 1
++#else
++#define USE_NEW_IB_TYPE 0
++#endif
++MODULE_PARAM(int, new_ib_type, USE_NEW_IB_TYPE,
++ "Use new transport type dedicated to IBSCIF");
++
++MODULE_PARAM(int, verbose, 0,
++ "Produce more log info for debugging purposes");
++
++MODULE_PARAM(int, check_grh, 1,
++ "Detect connections from outside the box by checking the global routing header");
++
++static atomic_t avail_pages; /* Calculated from max_pinned and totalram_pages */
++
++LIST_HEAD(devlist);
++DECLARE_MUTEX(devlist_mutex);
++
++DEFINE_IDR(wiremap);
++DEFINE_RWLOCK(wiremap_lock);
++static u32 reserved_0 = 0;
++
++void ibscif_dump(char *str, unsigned char* buf, int len)
++{
++ unsigned char *p, tmp[(16*3)+1];
++ int i;
++ return; /* hex dump output is currently disabled by this early return */
++ len = len > 64 ? 64 : len;
++ while (len) {
++ p = tmp;
++ for (i = len > 16 ? 16 : len; i; i--, len--)
++ p += sprintf(p, "%2x ", *buf++);
++ printk("(%d)%s: %s\n", smp_processor_id(), str, tmp);
++ }
++}
++
++int ibscif_reserve_quota(int *npages)
++{
++ int c, old, err;
++
++ if (!*npages)
++ return 0;
++
++ err = 0;
++ c = atomic_read(&avail_pages);
++ for (;;) {
++ if (unlikely(c < *npages))
++ break;
++ old = atomic_cmpxchg(&avail_pages, c, c - *npages);
++ if (likely(old == c))
++ break;
++ c = old;
++ }
++
++ if (c < *npages) {
++ *npages = 0;
++ err = -EDQUOT;
++ }
++
++ return err;
++}
++
++void ibscif_release_quota(int npages)
++{
++ if (npages)
++ atomic_add(npages, &avail_pages);
++}
++
++/*
++ * To work around MPI's assumption that data is written atomically into its
++ * header structures, write the first 16 bytes (four integers) of a transfer
++ * atomically.
++ *
++ * Update: the assumption of MPI's ofa module is different in that the last
++ * four bytes need to be written last and atomically. The buffers used in
++ * this case are always aligned.
++ */
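++/* For example, an aligned 24-byte transfer is handled as follows: the 16-byte header is
++ * written as four integer stores, bytes 16-19 are copied with memcpy(), smp_wmb() is issued,
++ * and the final integer (bytes 20-23) is stored last so the receiver observes the tail word
++ * after the rest of the payload.
++ */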
++int ibscif_atomic_copy(void *dst_addr, void *src_addr, u32 copy_len, int head_copied)
++{
++ volatile int *src_x = (int *)src_addr;
++ volatile int *dst_x = (int *)dst_addr;
++ volatile u8 *src_c, *dst_c;
++ int head_aligned, tail_aligned;
++
++ if (unlikely(!copy_len))
++ return head_copied;
++
++ head_aligned = !((unsigned long)src_addr & (sizeof(int)-1)) &&
++ !((unsigned long)dst_addr & (sizeof(int)-1));
++
++
++ tail_aligned = !((unsigned long)(src_addr+copy_len) & (sizeof(int)-1)) &&
++ !((unsigned long)(dst_addr+copy_len) & (sizeof(int)-1));
++
++ if (!head_copied && head_aligned) {
++
++ switch (copy_len) {
++ case sizeof(int):
++ *dst_x = *src_x;
++ goto done;
++ case sizeof(int)*2:
++ *dst_x++ = *src_x++;
++ *dst_x = *src_x;
++ goto done;
++ case sizeof(int)*3:
++ *dst_x++ = *src_x++;
++ *dst_x++ = *src_x++;
++ *dst_x = *src_x;
++ goto done;
++ default:
++ if (copy_len >= (sizeof(int)*4)) {
++ /* We have at least a whole header to copy. */
++ head_copied = 1;
++ copy_len -= sizeof(int)*4;
++
++ *dst_x++ = *src_x++;
++ *dst_x++ = *src_x++;
++ *dst_x++ = *src_x++;
++
++ if (copy_len == 0) {
++ *dst_x = *src_x;
++ goto done;
++ }
++ *dst_x++ = *src_x++;
++ }
++ break;
++ }
++ }
++
++ /* The last integer is aligned. Copy all but the last int, then the last int */
++ if (tail_aligned && copy_len >= sizeof(int)) {
++ copy_len -= sizeof(int);
++ if (copy_len)
++ memcpy((void *)dst_x, (void *)src_x, copy_len);
++ smp_wmb();
++ src_x = (volatile int *)((char *)src_x + copy_len);
++ dst_x = (volatile int *)((char *)dst_x + copy_len);
++ *dst_x = *src_x;
++ goto done;
++ }
++
++ /* Bad alignment. Copy all but the last byte, then the last byte */
++ if (--copy_len)
++ memcpy((void *)dst_x, (void *)src_x, copy_len);
++
++ src_c = ((volatile u8 *)src_x) + copy_len;
++ dst_c = ((volatile u8 *)dst_x) + copy_len;
++ smp_wmb();
++ *dst_c = *src_c;
++done:
++ return head_copied;
++}
++
++/*
++ * Because idr_pre_get acquires the same internal spinlock used by the idr_get_new/idr_remove
++ * calls made under write_lock_bh, we need to call idr_pre_get with bottom halves disabled.
++ * We cannot simply take write_lock_bh(&wiremap_lock) because idr_pre_get may make a
++ * blocking memory allocation call. Since bh is disabled, the mask must be GFP_ATOMIC.
++ */
++static inline int ibscif_wiremap_pre_get(void)
++{
++ int ret;
++
++ local_bh_disable();
++ ret = idr_pre_get(&wiremap, GFP_ATOMIC);
++ local_bh_enable();
++
++ return ret;
++}
++
++int ibscif_wiremap_add(void *obj, int *id)
++{
++ int ret;
++
++ do {
++ if (!ibscif_wiremap_pre_get())
++ return -ENOMEM;
++
++ write_lock_bh(&wiremap_lock);
++ ret = idr_get_new(&wiremap, obj, id);
++ write_unlock_bh(&wiremap_lock);
++ } while (ret == -EAGAIN);
++
++ return ret;
++}
++
++void ibscif_wiremap_del(int id)
++{
++ write_lock_bh(&wiremap_lock);
++ idr_remove(&wiremap, id);
++ write_unlock_bh(&wiremap_lock);
++}
++
++static int ibscif_init_wiremap(void)
++{
++ /*
++ * Instead of treating them as opaque, some applications assert that returned key
++ * values are non-zero. As a work-around, reserve the first key from the wiremap.
++ */
++ int ret = ibscif_wiremap_add(&reserved_0, &reserved_0);
++ BUG_ON(reserved_0 != 0);
++ return ret;
++}
++
++static void ibscif_free_wiremap(void)
++{
++ write_lock_bh(&wiremap_lock);
++ idr_remove_all(&wiremap);
++ idr_destroy(&wiremap);
++ write_unlock_bh(&wiremap_lock);
++}
++
++static void ibscif_init_params(void)
++{
++ if ((max_pinned <= 0) || (max_pinned > 100)) {
++ max_pinned = DEFAULT_MAX_PINNED;
++ printk(KERN_WARNING PFX "Corrected max_pinned module parameter to %d.\n",
++ max_pinned);
++ }
++ if (window_size < MIN_WINDOW_SIZE) {
++ window_size = MIN_WINDOW_SIZE;
++ printk(KERN_WARNING PFX "Corrected window_size module parameter to %d.\n",
++ window_size);
++ }
++ if (rma_threshold < 0) {
++ rma_threshold = 0x7FFFFFFF;
++ printk(KERN_WARNING PFX "Corrected rma_threshold module parameter to %d.\n",
++ rma_threshold);
++ }
++
++ /*
++ * Hardware RDMA devices have built-in limits on the number of registered pages.
++ * The avail_pages variable provides a limit for this software device.
++ */
++ atomic_set(&avail_pages, max_pinned * (totalram_pages / 100));
++}
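++/* Worked example (illustrative numbers): with the default max_pinned of 50 on a system with
++ * 4,194,304 total RAM pages (16 GiB with 4 KiB pages), ibscif_init_params() sets avail_pages
++ * to 50 * (4194304 / 100) = 2,097,150 pages, i.e. roughly 8 GiB of pinnable memory.
++ */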
++
++static int __init ibscif_init(void)
++{
++ int err;
++
++ printk(KERN_INFO PFX "%s\n", ibscif_signon);
++ printk(KERN_INFO PFX "max_pinned=%d, window_size=%d, "
++ "blocking_send=%d, blocking_recv=%d, "
++ "fast_rdma=%d, "
++ "host_proxy=%d, "
++ "rma_threshold=%d, scif_loopback=%d, "
++ "new_ib_type=%d, verbose=%d, "
++ "check_grh=%d\n",
++ max_pinned, window_size,
++ blocking_send, blocking_recv,
++ fast_rdma,
++ host_proxy,
++ rma_threshold, scif_loopback,
++ new_ib_type, verbose,
++ check_grh);
++
++ ibscif_init_params();
++
++ err = ibscif_init_wiremap();
++ if (err)
++ return err;
++
++ err = ibscif_dev_init();
++ if (!err)
++ return 0;
++
++ ibscif_free_wiremap();
++ return err;
++}
++
++static void __exit ibscif_exit(void)
++{
++ ibscif_dev_cleanup();
++ ibscif_free_wiremap();
++ printk(KERN_INFO PFX "unloaded\n");
++}
++
++module_init(ibscif_init);
++module_exit(ibscif_exit);
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_mr.c a8/drivers/infiniband/hw/scif/ibscif_mr.c
+--- a7/drivers/infiniband/hw/scif/ibscif_mr.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_mr.c 2015-02-23 10:14:37.484809663 -0800
+@@ -0,0 +1,559 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static int ibscif_mr_init_mreg(struct ibscif_mr *mr);
++
++struct ib_mr *ibscif_get_dma_mr(struct ib_pd *ibpd, int access)
++{
++ struct ibscif_dev *dev = to_dev(ibpd->device);
++ struct ibscif_mr *mr;
++ int err;
++
++ if (!atomic_add_unless(&dev->mr_cnt, 1, MAX_MRS))
++ return ERR_PTR(-EAGAIN);
++
++ mr = kzalloc(sizeof *mr, GFP_KERNEL);
++ if (!mr) {
++ err = -ENOMEM;
++ printk(KERN_ALERT PFX "%s: unable to allocate mr.\n", __func__);
++ goto out1;
++ }
++
++ kref_init(&mr->ref);
++ init_completion(&mr->done);
++
++ err = ibscif_wiremap_add(mr, &mr->ibmr.lkey);
++ if (err) {
++ printk(KERN_ALERT PFX "%s: unable to allocate lkey.\n", __func__);
++ goto out2;
++ }
++
++ if (mr->ibmr.lkey > IBSCIF_MR_MAX_KEY) {
++ err = -ENOSPC;
++ printk(KERN_ALERT PFX "%s: lkey (%x) out of range.\n", __func__, mr->ibmr.lkey);
++ goto out3;
++ }
++
++ mr->ibmr.device = ibpd->device; /* For ibscif_dereg_mr() calls below. */
++ mr->ibmr.rkey = mr->ibmr.lkey;
++ mr->access = access;
++ mr->magic = MR_MAGIC;
++ INIT_LIST_HEAD(&mr->mreg_list);
++
++ return &mr->ibmr;
++
++out3:
++ ibscif_wiremap_del(mr->ibmr.lkey);
++out2:
++ kfree(mr);
++out1:
++ atomic_dec(&dev->mr_cnt);
++ return ERR_PTR(err);
++}
++
++struct ib_mr *ibscif_reg_phys_mr(struct ib_pd *ibpd, struct ib_phys_buf *phys_buf_array,
++ int num_phys_buf, int access, u64 *iova_start)
++{
++ struct ibscif_mr *mr;
++ struct ib_mr *ibmr;
++ int i, j, k, err;
++ u64 mask;
++
++ ibmr = ibscif_get_dma_mr(ibpd, access);
++ if (IS_ERR(ibmr))
++ return ibmr;
++
++ mr = to_mr(ibmr);
++ mr->addr = *iova_start;
++
++ mask = 0;
++ for (i = 0; i < num_phys_buf; i++) {
++ if (i != 0)
++ mask |= phys_buf_array[i].addr; /* All but 1st are aligned */
++ if (i != num_phys_buf - 1)
++ mask |= phys_buf_array[i].addr + phys_buf_array[i].size; /* Middle bufs are full pages */
++
++ mr->length += phys_buf_array[i].size;
++ }
++ if ((mask & ~PAGE_MASK) || (mr->length > MAX_MR_SIZE)) {
++ err = -EINVAL;
++ goto out;
++ }
++ if (mr->length && ((mr->addr + mr->length - 1) < mr->addr)) {
++ err = -EOVERFLOW;
++ goto out;
++ }
++
++ phys_buf_array[0].size += phys_buf_array[0].addr & ~PAGE_MASK; /* Adjust 1st buf size by page offset */
++ phys_buf_array[0].addr &= PAGE_MASK; /* Truncate 1st buf to start of page */
++
++ for (i = 0; i < num_phys_buf; i++)
++ mr->npages += PAGE_ALIGN(phys_buf_array[i].size) >> PAGE_SHIFT;
++
++ if (!mr->npages)
++ return &mr->ibmr;
++
++ err = ibscif_reserve_quota(&mr->npages);
++ if (err)
++ goto out;
++
++ mr->page = vzalloc(mr->npages * sizeof *mr->page);
++ if (!mr->page) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ k = 0;
++ for (i = 0; i < num_phys_buf; i++)
++ for (j = 0; j < PAGE_ALIGN(phys_buf_array[i].size) >> PAGE_SHIFT; j++)
++ mr->page[k++] = pfn_to_page((phys_buf_array[i].addr >> PAGE_SHIFT) + j);
++
++ return &mr->ibmr;
++out:
++ ibscif_dereg_mr(ibmr);
++ return ERR_PTR(err);
++}
++
++#ifdef MOFED
++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
++ u64 virt_addr, int access, struct ib_udata *udata, int mr_id)
++#else
++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length,
++ u64 virt_addr, int access, struct ib_udata *udata)
++#endif
++{
++ struct ib_mr *ibmr;
++ struct ibscif_mr *mr;
++ struct scatterlist *sg;
++ struct ibscif_dev *dev;
++ int i, k, err;
++
++ if (length && ((start + length - 1) < start))
++ return ERR_PTR(-EOVERFLOW);
++
++ ibmr = ibscif_get_dma_mr(ibpd, access);
++ if (IS_ERR(ibmr))
++ return ibmr;
++
++ mr = to_mr(ibmr);
++ mr->addr = start;
++
++ mr->umem = ib_umem_get(ibpd->uobject->context, start, length, access, 0/*dma_sync*/);
++ if (IS_ERR(mr->umem)) {
++ err = PTR_ERR(mr->umem);
++ printk(KERN_ALERT PFX "%s: ib_umem_get returns %d.\n", __func__, err);
++ goto out;
++ }
++
++ mr->npages = ib_umem_page_count(mr->umem);
++ if (!mr->npages)
++ return &mr->ibmr;
++
++ mr->length = mr->umem->length;
++
++ err = ibscif_reserve_quota(&mr->npages);
++ if (err)
++ goto out;
++
++ mr->page = vzalloc(mr->npages * sizeof *mr->page);
++ if (!mr->page) {
++ err = -ENOMEM;
++ printk(KERN_ALERT PFX "%s: unable to allocate mr->page.\n", __func__);
++ goto out;
++ }
++
++ k = 0;
++ for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, i)
++ mr->page[k++] = sg_page(sg);
++
++ err = ibscif_mr_init_mreg(mr);
++ if (err)
++ goto out;
++
++ dev = to_dev(mr->ibmr.device);
++ down(&dev->mr_list_mutex);
++ list_add_tail(&mr->entry, &dev->mr_list);
++ up(&dev->mr_list_mutex);
++
++ return &mr->ibmr;
++out:
++ ibscif_dereg_mr(ibmr);
++ return ERR_PTR(err);
++}
++
++void ibscif_complete_mr(struct kref *ref)
++{
++ struct ibscif_mr *mr = container_of(ref, struct ibscif_mr, ref);
++ complete(&mr->done);
++}
++
++int ibscif_dereg_mr(struct ib_mr *ibmr)
++{
++ struct ibscif_dev *dev = to_dev(ibmr->device);
++ struct ibscif_mr *mr = to_mr(ibmr);
++ struct ibscif_mreg_info *mreg, *next;
++ struct ibscif_mr *mr0, *next0;
++ int ret;
++
++ ibscif_put_mr(mr);
++ wait_for_completion(&mr->done);
++
++ list_for_each_entry_safe(mreg, next, &mr->mreg_list, entry) {
++ do {
++ ret = scif_unregister(mreg->conn->ep, mreg->aligned_offset, mreg->aligned_length);
++ }
++ while (ret == -ERESTARTSYS);
++
++ if (ret && ret != -ENOTCONN)
++ printk(KERN_ALERT PFX "%s: scif_unregister returns %d. ep=%p, offset=%llx, length=%x\n",
++ __func__, ret, mreg->conn->ep, mreg->aligned_offset, mreg->aligned_length);
++
++ ibscif_put_conn(mreg->conn);
++ list_del(&mreg->entry);
++ kfree(mreg);
++ }
++
++ down(&dev->mr_list_mutex);
++ list_for_each_entry_safe(mr0, next0, &dev->mr_list, entry) {
++ if (mr0 == mr) {
++ list_del(&mr0->entry);
++ break;
++ }
++ }
++ up(&dev->mr_list_mutex);
++
++ if (mr->pinned_pages)
++ scif_unpin_pages(mr->pinned_pages);
++
++ if (mr->umem && !IS_ERR(mr->umem))
++ ib_umem_release(mr->umem);
++ if (mr->page)
++ vfree(mr->page);
++
++ ibscif_release_quota(mr->npages);
++ atomic_dec(&dev->mr_cnt);
++
++ ibscif_wiremap_del(mr->ibmr.lkey);
++
++ kfree(mr);
++ return 0;
++}
++
++/*
++ * Look up the given memory region and validate the requested access. A reference is held on success.
++ */
++struct ibscif_mr *ibscif_validate_mr(u32 key, u64 addr, int length,
++ struct ib_pd *ibpd, enum ib_access_flags access)
++{
++ struct ibscif_mr *mr;
++ int err;
++
++ mr = ibscif_get_mr(key);
++ if (unlikely(IS_ERR(mr)))
++ return mr;
++
++ if (unlikely(mr->ibmr.pd != ibpd)) {
++ err = -EPERM;
++ goto out;
++ }
++ if (unlikely(access && !(mr->access & access))) {
++ err = -EACCES;
++ goto out;
++ }
++ if (unlikely((addr < mr->addr) || ((addr + length) > (mr->addr + mr->length)))) {
++ err = -ERANGE;
++ goto out;
++ }
++
++ return mr;
++out:
++ ibscif_put_mr(mr);
++ return ERR_PTR(err);
++}
++
++static void ibscif_dma_nop(struct ib_device *ibdev, u64 addr, size_t size, enum dma_data_direction direction)
++{
++}
++
++static int ibscif_mapping_error(struct ib_device *ibdev, u64 dma_addr)
++{
++ return !dma_addr;
++}
++
++static u64 ibscif_dma_map_single(struct ib_device *ibdev, void *cpu_addr, size_t size,
++ enum dma_data_direction direction)
++{
++ return (u64)cpu_addr;
++}
++
++static u64 ibscif_dma_map_page(struct ib_device *ibdev, struct page *page, unsigned long offset, size_t size,
++ enum dma_data_direction direction)
++{
++ u64 addr;
++
++ if (offset + size > PAGE_SIZE)
++ return 0;
++
++ addr = (u64)page_address(page);
++ if (addr)
++ addr += offset;
++
++ return addr;
++}
++
++static int ibscif_map_sg(struct ib_device *ibdev, struct scatterlist *sg, int nents,
++ enum dma_data_direction direction)
++{
++ u64 addr;
++ int i, ret = nents;
++
++ for (i = 0; i < nents; i++, sg++) {
++ addr = (u64)page_address(sg_page(sg));
++ if (!addr) {
++ ret = 0;
++ break;
++ }
++
++ sg->dma_address = sg->offset + addr;
++ sg->dma_length = sg->length;
++ }
++ return ret;
++}
++
++static void ibscif_unmap_sg(struct ib_device *ibdev, struct scatterlist *sg, int nents,
++ enum dma_data_direction direction)
++{
++}
++
++static u64 ibscif_sg_dma_address(struct ib_device *ibdev, struct scatterlist *sg)
++{
++ return (u64)sg->dma_address;
++}
++
++static unsigned int ibscif_sg_dma_len(struct ib_device *ibdev, struct scatterlist *sg)
++{
++ return sg->dma_length;
++}
++
++static void *ibscif_dma_alloc_coherent(struct ib_device *ibdev, size_t size, u64 *dma_handle, gfp_t flag)
++{
++ struct page *p = alloc_pages(flag, get_order(size));
++ void *addr = p ? page_address(p) : NULL;
++
++ if (dma_handle)
++ *dma_handle = (u64)addr;
++
++ return addr;
++}
++
++static void ibscif_dma_free_coherent(struct ib_device *ibdev, size_t size, void *cpu_addr, u64 dma_handle)
++{
++ free_pages((unsigned long)cpu_addr, get_order(size));
++}
++
++struct ib_dma_mapping_ops ibscif_dma_mapping_ops = {
++ ibscif_mapping_error,
++ ibscif_dma_map_single,
++ ibscif_dma_nop,
++ ibscif_dma_map_page,
++ ibscif_dma_nop,
++ ibscif_map_sg,
++ ibscif_unmap_sg,
++ ibscif_sg_dma_address,
++ ibscif_sg_dma_len,
++ ibscif_dma_nop,
++ ibscif_dma_nop,
++ ibscif_dma_alloc_coherent,
++ ibscif_dma_free_coherent
++};
++
++static void ibscif_dump_mr_list( struct ibscif_dev *dev )
++{
++ struct ibscif_mr *mr;
++
++ list_for_each_entry(mr, &dev->mr_list, entry){
++ printk(KERN_ALERT PFX "%s: mr=%p [%llx, %x, %x]\n", __func__, mr, mr->addr, mr->length, mr->ibmr.rkey);
++ }
++}
++
++static int ibscif_mr_reg_with_conn(struct ibscif_mr *mr, struct ibscif_conn *conn, struct ibscif_mreg_info **new_mreg)
++{
++ struct ibscif_mreg_info *mreg;
++ off_t offset, aligned_offset;
++ u64 aligned_addr;
++ int aligned_length;
++ int offset_in_page;
++ int err;
++
++ aligned_addr = mr->addr & PAGE_MASK;
++ offset_in_page = (int)(mr->addr & ~PAGE_MASK);
++ aligned_length = (mr->length + offset_in_page + PAGE_SIZE - 1) & PAGE_MASK;
++ aligned_offset = IBSCIF_MR_VADDR_TO_OFFSET(mr->ibmr.rkey, aligned_addr);
++
++ offset = scif_register_pinned_pages(conn->ep, mr->pinned_pages, aligned_offset, SCIF_MAP_FIXED);
++
++ if (IS_ERR_VALUE(offset)) {
++ printk(KERN_ALERT PFX "%s: scif_register_pinned_pages returns %d\n", __func__, (int)offset);
++ printk(KERN_ALERT PFX "%s: conn=%p, ep=%p, mr=%p, addr=%llx, length=%x, rkey=%x, "
++ "aligned_addr=%llx, aligned_length=%x, aligned_offset=%llx\n",
++ __func__, conn, conn->ep, mr, mr->addr, mr->length, mr->ibmr.rkey,
++ aligned_addr, aligned_length, (uint64_t)aligned_offset);
++ ibscif_dump_mr_list(conn->dev);
++ return (int)offset;
++ }
++
++ BUG_ON(offset != aligned_offset);
++
++ offset += offset_in_page;
++
++ mreg = kzalloc(sizeof(struct ibscif_mreg_info), GFP_KERNEL);
++ if (!mreg) {
++ do {
++ err = scif_unregister(conn->ep, aligned_offset, aligned_length);
++ }
++ while (err == -ERESTARTSYS);
++
++ if (err && err != -ENOTCONN)
++ printk(KERN_ALERT PFX "%s: scif_unregister returns %d. ep=%p, offset=%llx, length=%x\n",
++ __func__, err, conn->ep, (uint64_t)aligned_offset, aligned_length);
++
++ return -ENOMEM;
++ }
++ mreg->conn = conn;
++ mreg->offset = (u64)offset;
++ mreg->aligned_offset = aligned_offset;
++ mreg->aligned_length = aligned_length;
++ list_add_tail(&mreg->entry, &mr->mreg_list);
++
++ atomic_inc(&conn->refcnt);
++ if (conn->local_close) {
++ conn->local_close = 0;
++ ibscif_send_reopen(conn);
++ }
++
++ if (new_mreg)
++ *new_mreg = mreg;
++
++ return 0;
++}
++
++struct ibscif_mreg_info *ibscif_mr_get_mreg(struct ibscif_mr *mr, struct ibscif_conn *conn)
++{
++ struct ibscif_mreg_info *mreg;
++ int err;
++ int i;
++
++ if (unlikely(!conn)) {
++ printk(KERN_ALERT PFX "%s: conn==NULL\n", __func__);
++ return NULL;
++ }
++
++ list_for_each_entry(mreg, &mr->mreg_list, entry){
++ if (mreg->conn == conn)
++ return mreg;
++ }
++
++ mreg = NULL;
++ err = ibscif_mr_reg_with_conn(mr, conn, &mreg);
++ if (err != -EADDRINUSE)
++ return mreg;
++
++ /* another thread is performing the registration */
++ if (verbose)
++ printk(KERN_INFO PFX "%s: mr is being registered by another thread. mr=%p, conn=%p.\n", __func__, mr, conn);
++ for (i=0; i<10000; i++) {
++ list_for_each_entry(mreg, &mr->mreg_list, entry){
++ if (mreg->conn == conn) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: got mreg after %d retries.\n", __func__, i+1);
++ return mreg;
++ }
++ }
++ schedule();
++ }
++ if (verbose)
++ printk(KERN_INFO PFX "%s: failed to get mreg after %d retries.\n", __func__, i);
++ return NULL;
++}
++
++static int ibscif_mr_init_mreg(struct ibscif_mr *mr)
++{
++ struct ibscif_dev *dev = to_dev(mr->ibmr.device);
++ struct ibscif_conn *conn;
++ int prot;
++ u64 aligned_addr;
++ int aligned_length;
++ int offset_in_page;
++ int err;
++
++ aligned_addr = mr->addr & PAGE_MASK;
++ offset_in_page = (int)(mr->addr & ~PAGE_MASK);
++ aligned_length = (mr->length + offset_in_page + PAGE_SIZE - 1) & PAGE_MASK;
++
++#if 0
++ prot = ((mr->access & IB_ACCESS_REMOTE_READ)?SCIF_PROT_READ:0) |
++ ((mr->access & IB_ACCESS_REMOTE_WRITE)?SCIF_PROT_WRITE:0);
++#else
++ // In IB, the same buffer can be registered multiple times with different access rights.
++ // SCIF doesn't have a mechanism to support that, so we just turn on all access rights.
++ // Otherwise we may end up with a protection error.
++ prot = SCIF_PROT_READ | SCIF_PROT_WRITE;
++#endif
++
++ err = scif_pin_pages((void *)aligned_addr, aligned_length, prot, 0/*user addr*/, &mr->pinned_pages);
++ if (err) {
++ printk(KERN_ALERT PFX "%s: scif_pin_pages returns %d\n", __func__, err);
++ return err;
++ }
++
++ down(&dev->mutex);
++ list_for_each_entry(conn, &dev->conn_list, entry) {
++ err = ibscif_mr_reg_with_conn(mr, conn, NULL);
++ if (err)
++ break;
++ }
++ up(&dev->mutex);
++
++ return err;
++}
++
++void ibscif_refresh_mreg( struct ibscif_conn *conn )
++{
++ struct ibscif_mr *mr;
++
++ down(&conn->dev->mr_list_mutex);
++ list_for_each_entry(mr, &conn->dev->mr_list, entry){
++ ibscif_mr_get_mreg(mr, conn);
++ }
++ up(&conn->dev->mr_list_mutex);
++}
++
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_pd.c a8/drivers/infiniband/hw/scif/ibscif_pd.c
+--- a7/drivers/infiniband/hw/scif/ibscif_pd.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_pd.c 2015-02-23 10:14:37.484809663 -0800
+@@ -0,0 +1,56 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++struct ib_pd *ibscif_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata)
++{
++ struct ibscif_dev *dev = to_dev(ibdev);
++ struct ibscif_pd *pd;
++
++ if (!atomic_add_unless(&dev->pd_cnt, 1, MAX_PDS))
++ return ERR_PTR(-EAGAIN);
++
++ pd = kzalloc(sizeof *pd, GFP_KERNEL);
++ if (!pd) {
++ atomic_dec(&dev->pd_cnt);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ return &pd->ibpd;
++}
++
++int ibscif_dealloc_pd(struct ib_pd *ibpd)
++{
++ struct ibscif_dev *dev = to_dev(ibpd->device);
++ atomic_dec(&dev->pd_cnt);
++ kfree(to_pd(ibpd));
++ return 0;
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_post.c a8/drivers/infiniband/hw/scif/ibscif_post.c
+--- a7/drivers/infiniband/hw/scif/ibscif_post.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_post.c 2015-02-23 10:14:37.485809663 -0800
+@@ -0,0 +1,306 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++void ibscif_dump_sg(char *str, struct ib_sge *sge, int num)
++{
++ extern void ibscif_dump(char*, void*, int);
++ if (!sge)
++ return;
++ while (num--) {
++ ibscif_dump(str, (void*)sge->addr, sge->length);
++ sge++;
++ }
++}
++
++/*
++ * Build and validate the wr->ds_list from the given sg_list.
++ * If successful, a reference is held on each mr in the wr->ds_list.
++ */
++static int ibscif_wr_ds(struct ib_pd *ibpd, struct ib_sge *sg_list, int num_sge,
++ struct ibscif_wr *wr, int *total_length, enum ib_access_flags access)
++{
++ struct ibscif_ds *ds_list = wr->ds_list;
++ int err;
++
++ *total_length = 0;
++ for (wr->num_ds = 0; wr->num_ds < num_sge; sg_list++, ds_list++) {
++
++ ds_list->mr = ibscif_validate_mr(sg_list->lkey, sg_list->addr, sg_list->length, ibpd, access);
++ if (unlikely(IS_ERR(ds_list->mr))) {
++ err = PTR_ERR(ds_list->mr);
++ goto out;
++ }
++
++ ds_list->in_use = 1;
++ wr->num_ds++;
++
++ if (unlikely((*total_length + sg_list->length) < *total_length)) {
++ err = -EOVERFLOW;
++ goto out;
++ }
++
++ ds_list->offset = sg_list->addr - ds_list->mr->addr;
++ ds_list->length = sg_list->length;
++ ds_list->lkey = sg_list->lkey;
++ ds_list->current_mreg = NULL;
++
++ *total_length += ds_list->length;
++ }
++
++ return 0;
++out:
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ return err;
++}
++
++int ibscif_post_send(struct ib_qp *ibqp, struct ib_send_wr *ibwr, struct ib_send_wr **bad_wr)
++{
++ struct ibscif_qp *qp = to_qp(ibqp);
++ struct ibscif_wq *sq = &qp->sq;
++ struct ibscif_wr *wr;
++ int nreq = 0, err;
++
++ IBSCIF_PERF_SAMPLE(0, 0);
++
++ spin_lock_bh(&sq->lock);
++
++ if (unlikely(ibqp->qp_type != IB_QPT_UD && qp->state != QP_CONNECTED)) {
++ err = -ENOTCONN;
++ goto out;
++ }
++ if (unlikely(!sq->size)) {
++ err = -ENOSPC;
++ goto out;
++ }
++
++ for (err = 0; ibwr; ibwr = ibwr->next, nreq++) {
++
++ if (unlikely(sq->depth == sq->size)) {
++ err = -ENOBUFS;
++ goto out;
++ }
++ if (unlikely(ibwr->num_sge > sq->max_sge)) {
++ err = -E2BIG;
++ goto out;
++ }
++
++ wr = ibscif_get_wr(sq, sq->tail);
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->id = ibwr->wr_id;
++ wr->opcode = ibwr->opcode;
++ wr->flags = ibwr->send_flags | ((qp->sq_policy == IB_SIGNAL_ALL_WR) ? IB_SEND_SIGNALED : 0);
++ wr->state = WR_WAITING;
++ wr->use_rma = 0;
++ wr->rma_id = 0;
++
++ if (ibqp->qp_type == IB_QPT_UD) {
++ wr->opcode = WR_UD;
++ wr->ud.remote_node_id = IBSCIF_LID_TO_NODE_ID(be16_to_cpu(to_ah(ibwr->wr.ud.ah)->dlid));
++ wr->ud.remote_qpn = ibwr->wr.ud.remote_qpn;
++
++ /* the remainder is handled the same as IB_WR_SEND */
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0);
++ if (unlikely(err))
++ goto out;
++ wr->msg_id = sq->wirestate->tx.next_msg_id++;
++ }
++
++ else switch (ibwr->opcode) {
++
++ case IB_WR_SEND_WITH_IMM:
++ wr->send.immediate_data = ibwr->ex.imm_data;
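++ /* fall through */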
++ case IB_WR_SEND:
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0);
++ if (unlikely(err))
++ goto out;
++ wr->msg_id = sq->wirestate->tx.next_msg_id++;
++ if (wr->length > rma_threshold) {
++ wr->use_rma = 1;
++ wr->rma_id = sq->next_msg_id;
++ }
++ break;
++
++ case IB_WR_RDMA_WRITE_WITH_IMM:
++ wr->msg_id = sq->wirestate->tx.next_msg_id++;
++ wr->write.immediate_data = ibwr->ex.imm_data;
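++ /* fall through */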
++ case IB_WR_RDMA_WRITE:
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0);
++ if (unlikely(err))
++ goto out;
++ if (wr->length &&
++ ((ibwr->wr.rdma.remote_addr + wr->length - 1) < ibwr->wr.rdma.remote_addr)) {
++ err = -EOVERFLOW;
++ goto out;
++ }
++ wr->write.remote_address = ibwr->wr.rdma.remote_addr;
++ wr->write.rkey = ibwr->wr.rdma.rkey;
++ if (ibwr->opcode == IB_WR_RDMA_WRITE)
++ wr->msg_id = 0;
++ if (wr->length > rma_threshold) {
++ wr->use_rma = 1;
++ wr->rma_id = sq->next_msg_id;
++ }
++ break;
++
++ case IB_WR_RDMA_READ:
++ if (unlikely(!qp->max_or)) {
++ err = -ENOBUFS;
++ goto out;
++ }
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE);
++ if (unlikely(err))
++ goto out;
++ if (wr->length &&
++ ((ibwr->wr.rdma.remote_addr + wr->length - 1) < ibwr->wr.rdma.remote_addr)) {
++ err = -EOVERFLOW;
++ goto out;
++ }
++ wr->read.remote_address = ibwr->wr.rdma.remote_addr;
++ wr->read.remote_length = wr->length;
++ wr->read.rkey = ibwr->wr.rdma.rkey;
++ wr->length = 0; /* No tx data with this opcode */
++ wr->msg_id = sq->next_msg_id;
++ atomic_inc(&qp->or_posted);
++ if (wr->read.remote_length > rma_threshold) {
++ wr->use_rma = 1;
++ wr->rma_id = wr->msg_id;
++ }
++ break;
++
++ case IB_WR_ATOMIC_CMP_AND_SWP:
++ case IB_WR_ATOMIC_FETCH_AND_ADD:
++ if (unlikely(!qp->max_or)) {
++ err = -ENOBUFS;
++ goto out;
++ }
++ if (unlikely(ibwr->wr.atomic.remote_addr & (sizeof wr->atomic_rsp.orig_data - 1))) {
++ err = -EADDRNOTAVAIL;
++ goto out;
++ }
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE);
++ if (unlikely(err))
++ goto out;
++ if (unlikely(wr->length < sizeof wr->atomic_rsp.orig_data)) {
++ err = -EINVAL;
++ goto out;
++ }
++ if (ibwr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
++ wr->cmp_swp.cmp_operand = ibwr->wr.atomic.compare_add;
++ wr->cmp_swp.swp_operand = ibwr->wr.atomic.swap;
++ wr->cmp_swp.remote_address = ibwr->wr.atomic.remote_addr;
++ wr->cmp_swp.rkey = ibwr->wr.atomic.rkey;
++ } else {
++ wr->fetch_add.add_operand = ibwr->wr.atomic.compare_add;
++ wr->fetch_add.remote_address = ibwr->wr.atomic.remote_addr;
++ wr->fetch_add.rkey = ibwr->wr.atomic.rkey;
++ }
++ wr->length = 0; /* No tx data with these opcodes */
++ wr->msg_id = sq->next_msg_id;
++ atomic_inc(&qp->or_posted);
++ break;
++
++ default:
++ err = -ENOMSG;
++ goto out;
++ }
++
++ DEV_STAT(qp->dev, wr_opcode[wr->opcode]++);
++ ibscif_append_wq(sq);
++ }
++out:
++ spin_unlock_bh(&sq->lock);
++
++ IBSCIF_PERF_SAMPLE(1, 0);
++
++ if (err)
++ *bad_wr = ibwr;
++ if (nreq)
++ ibscif_schedule(sq);
++
++ IBSCIF_PERF_SAMPLE(9, 1);
++
++ return err;
++}
++
++int ibscif_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ibwr, struct ib_recv_wr **bad_wr)
++{
++ struct ibscif_qp *qp = to_qp(ibqp);
++ struct ibscif_wq *rq = &qp->rq;
++ struct ibscif_wr *wr;
++ int err;
++
++ spin_lock_bh(&rq->lock);
++
++ if ((qp->state != QP_IDLE) && (qp->state != QP_CONNECTED)) {
++ err = -ENOTCONN;
++ goto out;
++ }
++ if (unlikely(!rq->size)) {
++ err = -ENOSPC;
++ goto out;
++ }
++
++ for (err = 0; ibwr; ibwr = ibwr->next) {
++
++ if (unlikely(rq->depth == rq->size)) {
++ err = -ENOBUFS;
++ goto out;
++ }
++ if (unlikely(ibwr->num_sge > rq->max_sge)) {
++ err = -E2BIG;
++ goto out;
++ }
++
++ wr = ibscif_get_wr(rq, rq->tail);
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->id = ibwr->wr_id;
++ wr->msg_id = rq->next_msg_id;
++ wr->state = WR_WAITING;
++
++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE);
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ if (unlikely(err))
++ goto out;
++
++ ibscif_append_wq(rq);
++ }
++out:
++ spin_unlock_bh(&rq->lock);
++ if (err)
++ *bad_wr = ibwr;
++
++ return err;
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_procfs.c a8/drivers/infiniband/hw/scif/ibscif_procfs.c
+--- a7/drivers/infiniband/hw/scif/ibscif_procfs.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_procfs.c 2015-02-23 10:14:37.485809663 -0800
+@@ -0,0 +1,180 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
++static int ibscif_stats_show(struct seq_file *m, void *v)
++#else
++static int ibscif_stats_read(char *page, char **start, off_t offset,
++ int count, int *eof, void *data)
++#endif
++{
++ int l = 0;
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
++ struct ibscif_dev *dev = m->private;
++#else
++ struct ibscif_dev *dev = data;
++ char *m = page;
++
++ if (offset)
++ return 0;
++
++ *eof = 1;
++#endif
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
++ seq_printf
++#else
++ l += sprintf
++#endif
++ (m,
++ "%s statistics:\n"
++ " tx_bytes %lu rx_bytes %lu\n"
++ " tx_pkts %lu rx_pkts %lu loopback_pkts %lu\n"
++ " sched_exhaust %lu unavailable %lu\n"
++ " tx_errors %lu duplicates %lu\n"
++ " total wr %lu :\n"
++ " send %lu send_imm %lu write %lu write_imm %lu\n"
++ " recv %lu recv_imm %lu read %lu comp %lu fetch %lu\n"
++ " read_rsp %lu atomic_rsp %lu ud %lu\n"
++ " fast_rdma :\n"
++ " write %lu read %lu unavailable %lu fallback %lu force_ack %lu tail_write %lu\n",
++ dev->ibdev.name,
++ DEV_STAT(dev, bytes_sent),
++ DEV_STAT(dev, bytes_rcvd),
++ DEV_STAT(dev, packets_sent),
++ DEV_STAT(dev, packets_rcvd),
++ DEV_STAT(dev, loopback),
++ DEV_STAT(dev, sched_exhaust),
++ DEV_STAT(dev, unavailable),
++ DEV_STAT(dev, tx_errors),
++ DEV_STAT(dev, duplicates),
++ DEV_STAT(dev, wr_opcode[WR_SEND]) +
++ DEV_STAT(dev, wr_opcode[WR_SEND_WITH_IMM]) +
++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE]) +
++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE_WITH_IMM]) +
++ DEV_STAT(dev, recv) +
++ DEV_STAT(dev, recv_imm) +
++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ]) +
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_CMP_AND_SWP]) +
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_FETCH_AND_ADD]) +
++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ_RSP]) +
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_RSP]),
++ DEV_STAT(dev, wr_opcode[WR_SEND]),
++ DEV_STAT(dev, wr_opcode[WR_SEND_WITH_IMM]),
++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE]),
++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE_WITH_IMM]),
++ DEV_STAT(dev, recv),
++ DEV_STAT(dev, recv_imm),
++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ]),
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_CMP_AND_SWP]),
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_FETCH_AND_ADD]),
++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ_RSP]),
++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_RSP]),
++ DEV_STAT(dev, wr_opcode[WR_UD]),
++ DEV_STAT(dev, fast_rdma_write),
++ DEV_STAT(dev, fast_rdma_read),
++ DEV_STAT(dev, fast_rdma_unavailable),
++ DEV_STAT(dev, fast_rdma_fallback),
++ DEV_STAT(dev, fast_rdma_force_ack),
++ DEV_STAT(dev, fast_rdma_tail_write)
++ );
++
++ return l;
++}
++
++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0))
++static ssize_t ibscif_stats_write(struct file *file, const char __user *buffer,
++ size_t count, loff_t *ppos)
++{
++ struct ibscif_dev *dev = PDE_DATA(file_inode(file));
++ memset(&dev->stats, 0, sizeof dev->stats);
++ return count;
++}
++
++static int ibscif_stats_open(struct inode *inode, struct file *file)
++{
++ return single_open(file, ibscif_stats_show, PDE_DATA(inode));
++}
++
++struct file_operations ibscif_fops = {
++ .owner = THIS_MODULE,
++ .open = ibscif_stats_open,
++ .read = seq_read,
++ .write = ibscif_stats_write,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++int ibscif_procfs_add_dev(struct ibscif_dev *dev)
++{
++ dev->procfs = proc_mkdir(dev->ibdev.name, init_net.proc_net);
++ if (!dev->procfs)
++ return -ENOENT;
++
++ if (proc_create_data("stats", S_IRUGO | S_IWUGO, dev->procfs,
++ &ibscif_fops ,dev))
++ return -ENOENT;
++
++ return 0;
++}
++#else /* (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)) */
++static int ibscif_stats_write(struct file *file, const char __user *buffer, unsigned long count, void *data)
++{
++ struct ibscif_dev *dev = data;
++ memset(&dev->stats, 0, sizeof dev->stats);
++ return count;
++}
++
++int ibscif_procfs_add_dev(struct ibscif_dev *dev)
++{
++ struct proc_dir_entry *entry;
++
++ dev->procfs = proc_mkdir(dev->ibdev.name, init_net.proc_net);
++ if (!dev->procfs)
++ return -ENOENT;
++
++ entry = create_proc_read_entry("stats", S_IRUGO | S_IWUGO, dev->procfs, ibscif_stats_read, dev);
++ if (!entry)
++ return -ENOENT;
++ entry->write_proc = ibscif_stats_write;
++
++ return 0;
++}
++#endif
++
++void ibscif_procfs_remove_dev(struct ibscif_dev *dev)
++{
++ if (dev->procfs)
++ remove_proc_entry("stats", dev->procfs);
++ remove_proc_entry(dev->ibdev.name, init_net.proc_net);
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_protocol.c a8/drivers/infiniband/hw/scif/ibscif_protocol.c
+--- a7/drivers/infiniband/hw/scif/ibscif_protocol.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_protocol.c 2015-02-23 10:14:37.487809663 -0800
+@@ -0,0 +1,2816 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++#include <linux/sched.h>
++/* dev/wr/ep/qp backpointers overlaid in skb cb[] */
++struct ibscif_skb_cb {
++ struct ibscif_dev *dev;
++ struct ibscif_wr *wr;
++ scif_epd_t scif_ep;
++ struct ibscif_qp *qp; /* for UD only */
++};
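++/*
++ * Note: this struct is overlaid on sk_buff.cb[] (48 bytes in mainline
++ * kernels), so it must remain small enough to fit; four pointer-sized
++ * fields do on both 32-bit and 64-bit builds.
++ */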
++
++#define SET_SKB_DEV(skb,dev0) ((struct ibscif_skb_cb *)&skb->cb)->dev = dev0
++#define SET_SKB_WR(skb,wr0) ((struct ibscif_skb_cb *)&skb->cb)->wr = wr0
++#define SET_SKB_EP(skb,ep0) ((struct ibscif_skb_cb *)&skb->cb)->scif_ep = ep0
++#define SET_SKB_QP(skb,qp0) ((struct ibscif_skb_cb *)&skb->cb)->qp = qp0
++
++#define GET_SKB_DEV(skb) ((struct ibscif_skb_cb *)&skb->cb)->dev
++#define GET_SKB_WR(skb) ((struct ibscif_skb_cb *)&skb->cb)->wr
++#define GET_SKB_EP(skb) ((struct ibscif_skb_cb *)&skb->cb)->scif_ep
++#define GET_SKB_QP(skb) ((struct ibscif_skb_cb *)&skb->cb)->qp
++
++#define hw_addr_equal(h1, h2) (!memcmp(h1, h2, ETH_ALEN))
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
++ #define KMAP(x) kmap(x->page)
++ #define KUNMAP(x) kunmap(x->page)
++ #define SET_PAGE(x,y) x->page = y
++ #define GET_PAGE(x) get_page(x->page)
++#else
++ #define KMAP(x) kmap(skb_frag_page(x))
++ #define KUNMAP(x) kunmap(skb_frag_page(x))
++ #define SET_PAGE(x,y) __skb_frag_set_page(x, y)
++ #define GET_PAGE(x) __skb_frag_ref(x)
++#endif
++
++void ibscif_skb_destructor(struct sk_buff *skb)
++{
++ struct ibscif_dev *dev = GET_SKB_DEV(skb);
++
++ /* A sk_buff is now available. */
++ if (atomic_inc_return(&dev->available) == 1)
++ ; /* Could invoke the scheduler here. */
++
++ /* Release the module reference held for this sk_buff. */
++ module_put(THIS_MODULE);
++}
++
++static struct sk_buff *ibscif_alloc_tx_skb(struct ibscif_dev *dev, int hdr_size, int payload_size)
++{
++ struct sk_buff *skb;
++
++ skb = dev_alloc_skb(hdr_size);
++ if (unlikely(!skb))
++ return NULL;
++
++ skb_reset_mac_header(skb);
++ skb_reset_network_header(skb);
++
++ skb->protocol = IBSCIF_PACKET_TYPE;
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ skb->priority = TC_PRIO_CONTROL; /* highest defined priority */
++ skb->dev = (void *) dev;
++ skb->len = hdr_size + payload_size;
++ skb->data_len = payload_size;
++ skb->tail += hdr_size;
++
++ return skb;
++}
++
++static struct sk_buff_head xmit_queue;
++static void ibscif_xmit_work_handler( struct work_struct *context );
++static DECLARE_WORK(ibscif_xmit_work, ibscif_xmit_work_handler);
++static atomic_t xmit_busy = ATOMIC_INIT(0);
++
++static void ibscif_xmit_work_handler( struct work_struct *context )
++{
++ struct sk_buff *skb;
++ scif_epd_t scif_ep;
++ int num_frags;
++ skb_frag_t *frag;
++ void *vaddr;
++ int ret;
++ int hdr_size;
++ int i;
++ struct ibscif_qp *qp;
++
++again:
++ while ((skb = skb_dequeue(&xmit_queue))) {
++ scif_ep = GET_SKB_EP(skb);
++ if (!scif_ep) {
++ printk(KERN_ALERT PFX "%s: NULL scif_ep, skb=%p\n", __func__, skb);
++ goto next;
++ }
++
++ hdr_size = skb->len - skb->data_len;
++ for (i=0; i<hdr_size; ) {
++ ret = scif_send(scif_ep, skb->data+i, hdr_size-i,
++ blocking_send ? SCIF_SEND_BLOCK : 0);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: fail to send header, hdr_size=%d, ret=%d\n", __func__, hdr_size, ret);
++ goto next;
++ }
++ i += ret;
++ }
++
++ num_frags = skb_shinfo(skb)->nr_frags;
++ frag = skb_shinfo(skb)->frags;
++ while (num_frags--) {
++ vaddr = KMAP(frag); /* because scif_send() may cause scheduling */
++ for (i=0; i<frag->size; ) {
++ ret = scif_send(scif_ep, vaddr + frag->page_offset + i,
++ frag->size - i,
++ blocking_send ? SCIF_SEND_BLOCK : 0);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_send returns %d, frag_size=%d\n", __func__, ret, frag->size);
++ break;
++ }
++ i += ret;
++ }
++ KUNMAP(frag);
++ frag++;
++ }
++next:
++ qp = GET_SKB_QP(skb);
++ if (qp && qp->ibqp.qp_type == IB_QPT_UD) {
++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data;
++ u16 opcode = __be16_to_cpu(pdu->ibscif.hdr.opcode);
++ if (ibscif_pdu_is_last(opcode)) {
++ struct ibscif_wr *wr = GET_SKB_WR(skb);
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ wr->state = WR_COMPLETED;
++ ibscif_process_sq_completions(GET_SKB_QP(skb));
++ }
++ /* Release the reference held on UD QPs */
++ ibscif_put_qp(qp);
++ }
++ kfree_skb(skb);
++ }
++
++ if (!skb_queue_empty(&xmit_queue))
++ goto again;
++
++ atomic_set(&xmit_busy, 0);
++}
++
++static void ibscif_dev_queue_xmit(struct sk_buff *skb)
++{
++ struct ibscif_dev *dev=NULL;
++ int len = 0;
++
++ if (skb) {
++ dev = GET_SKB_DEV(skb);
++ len = skb->len;
++ skb_queue_tail(&xmit_queue, skb);
++ }
++
++ /* Only one work instance may be queued at a time; otherwise there is a race condition between scif_send() calls. */
++ /* Note that the currently running worker may miss a newly added item; it will be picked up by the poll thread. */
++ if (!atomic_xchg(&xmit_busy, 1))
++ schedule_work(&ibscif_xmit_work);
++
++ if (likely(dev)) {
++ DEV_STAT(dev, packets_sent++);
++ DEV_STAT(dev, bytes_sent += len);
++ }
++}
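++/*
++ * Design note: the atomic_xchg() above ensures at most one worker drains
++ * xmit_queue at a time, which serializes the scif_send() calls made by
++ * ibscif_xmit_work_handler(). If a newly queued skb races with a worker
++ * that has just seen an empty queue, the entry is picked up on the next
++ * enqueue or by the polling thread, as noted in the comments above.
++ */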
++
++static int ibscif_create_hdr(struct ibscif_qp *qp, struct ibscif_wr *wr, struct sk_buff *skb,
++ u32 seq_num, u32 wr_len_remaining, int force)
++{
++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data;
++ u32 sq_seq, iq_seq;
++ u16 opcode;
++ int i;
++
++ sq_seq = qp->wire.sq.rx.last_in_seq;
++ iq_seq = qp->wire.iq.rx.last_in_seq;
++ qp->wire.sq.rx.last_seq_acked = sq_seq;
++ qp->wire.iq.rx.last_seq_acked = iq_seq;
++
++ pdu->ibscif.hdr.length = __cpu_to_be16(skb->data_len);
++ if (qp->ibqp.qp_type == IB_QPT_UD) {
++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(wr->ud.remote_qpn);
++ }
++ else {
++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(qp->remote_qpn);
++ }
++ pdu->ibscif.hdr.src_qp = __cpu_to_be32(qp->ibqp.qp_num);
++ pdu->ibscif.hdr.seq_num = __cpu_to_be32(seq_num);
++ pdu->ibscif.hdr.sq_ack_num = __cpu_to_be32(sq_seq);
++ pdu->ibscif.hdr.iq_ack_num = __cpu_to_be32(iq_seq);
++
++ switch (wr->opcode) {
++ case WR_UD:
++ opcode = ibscif_op_ud;
++ if (skb->data_len == wr_len_remaining) {
++ opcode = ibscif_pdu_set_last(opcode);
++ if (wr->flags & IB_SEND_SIGNALED)
++ force = 1;
++ if (wr->flags & IB_SEND_SOLICITED)
++ opcode = ibscif_pdu_set_se(opcode);
++ }
++ pdu->ibscif.ud.msg_length = __cpu_to_be32(wr->length);
++ pdu->ibscif.ud.msg_offset = __cpu_to_be32(wr->length - wr_len_remaining);
++ memset(&pdu->ibscif.ud.grh, 0, 40);
++ break;
++
++ case WR_SEND:
++ case WR_SEND_WITH_IMM:
++ opcode = ibscif_op_send;
++ if (skb->data_len == wr_len_remaining || opcode == ibscif_op_send_rma) {
++ opcode = ibscif_pdu_set_last(opcode);
++ if (wr->flags & IB_SEND_SIGNALED)
++ force = 1;
++ if (wr->opcode == WR_SEND_WITH_IMM) {
++ opcode = ibscif_pdu_set_immed(opcode);
++ pdu->ibscif.send.immed_data = __cpu_to_be32(wr->send.immediate_data);
++ } else pdu->ibscif.send.immed_data = 0;
++ if (wr->flags & IB_SEND_SOLICITED)
++ opcode = ibscif_pdu_set_se(opcode);
++ }
++ pdu->ibscif.send.msg_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.send.msg_length = __cpu_to_be32(wr->length);
++ pdu->ibscif.send.msg_offset = __cpu_to_be32(wr->length - wr_len_remaining);
++ if (wr->use_rma) {
++ opcode = ibscif_op_send_rma;
++ pdu->ibscif.send.rma_id = __cpu_to_be32(wr->rma_id);
++ pdu->ibscif.send.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.send.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.send.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_RDMA_READ:
++ opcode = ibscif_op_read;
++ pdu->ibscif.read_req.rdma_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.read_req.rdma_key = __cpu_to_be32(wr->read.rkey);
++ pdu->ibscif.read_req.rdma_length= __cpu_to_be32(wr->read.remote_length);
++ pdu->ibscif.read_req.rdma_address = __cpu_to_be64(wr->read.remote_address);
++ if (wr->use_rma) {
++ opcode = ibscif_op_read_rma;
++ pdu->ibscif.read_req.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.read_req.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.read_req.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ opcode = ibscif_op_write;
++ if ((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
++ opcode = ibscif_pdu_set_immed(opcode);
++ pdu->ibscif.write.immed_data = __cpu_to_be32(wr->write.immediate_data);
++ if (wr->flags & IB_SEND_SOLICITED)
++ opcode = ibscif_pdu_set_se(opcode);
++ } else pdu->ibscif.write.immed_data = 0;
++ if (skb->data_len == wr_len_remaining || opcode == ibscif_op_write_rma) {
++ opcode = ibscif_pdu_set_last(opcode);
++ if (wr->flags & IB_SEND_SIGNALED)
++ force = 1;
++ }
++ pdu->ibscif.write.msg_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.write.rdma_key = __cpu_to_be32(wr->write.rkey);
++ pdu->ibscif.write.rdma_address = __cpu_to_be64(wr->write.remote_address +
++ (wr->length - wr_len_remaining));
++ if (wr->use_rma) {
++ opcode = ibscif_op_write_rma;
++ if (wr->opcode == WR_RDMA_WRITE_WITH_IMM)
++ opcode = ibscif_pdu_set_immed(opcode);
++ pdu->ibscif.write.rma_id = __cpu_to_be32(wr->rma_id);
++ pdu->ibscif.write.rma_length = __cpu_to_be32(wr->length);
++ pdu->ibscif.write.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.write.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.write.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_ATOMIC_CMP_AND_SWP:
++ opcode = ibscif_pdu_set_last(ibscif_op_comp_swap);
++ pdu->ibscif.comp_swap.atomic_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.comp_swap.atomic_key = __cpu_to_be32(wr->cmp_swp.rkey);
++ pdu->ibscif.comp_swap.comp_data = __cpu_to_be64(wr->cmp_swp.cmp_operand);
++ pdu->ibscif.comp_swap.swap_data = __cpu_to_be64(wr->cmp_swp.swp_operand);
++ pdu->ibscif.comp_swap.atomic_address = __cpu_to_be64(wr->cmp_swp.remote_address);
++ break;
++
++ case WR_ATOMIC_FETCH_AND_ADD:
++ opcode = ibscif_pdu_set_last(ibscif_op_fetch_add);
++ pdu->ibscif.fetch_add.atomic_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.fetch_add.atomic_key = __cpu_to_be32(wr->fetch_add.rkey);
++ pdu->ibscif.fetch_add.add_data = __cpu_to_be64(wr->fetch_add.add_operand);
++ pdu->ibscif.fetch_add.atomic_address = __cpu_to_be64(wr->fetch_add.remote_address);
++ break;
++
++ case WR_RDMA_READ_RSP:
++ opcode = ibscif_op_read_rsp;
++ if (skb->data_len == wr_len_remaining)
++ opcode = ibscif_pdu_set_last(opcode);
++ pdu->ibscif.read_rsp.rdma_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.read_rsp.rdma_offset = __cpu_to_be32(wr->length - wr_len_remaining);
++ break;
++
++ case WR_ATOMIC_RSP:
++ opcode = ibscif_pdu_set_last(wr->atomic_rsp.opcode);
++ pdu->ibscif.atomic_rsp.atomic_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.atomic_rsp.orig_data = __cpu_to_be64(wr->atomic_rsp.orig_data);
++ break;
++
++ case WR_RMA_RSP:
++ opcode = ibscif_op_rma_rsp;
++ pdu->ibscif.rma_rsp.rma_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.rma_rsp.xfer_length = __cpu_to_be32(wr->rma_rsp.xfer_length);
++ pdu->ibscif.rma_rsp.error = __cpu_to_be32(wr->rma_rsp.error);
++ break;
++ default:
++ printk(KERN_ERR PFX "%s() invalid opcode %d\n", __func__, wr->opcode);
++ return 1;
++ }
++
++ if (force)
++ opcode = ibscif_pdu_set_force_ack(opcode);
++
++ pdu->ibscif.hdr.opcode = __cpu_to_be16(opcode);
++
++ return 0;
++}
++
++static struct sk_buff* ibscif_alloc_pdu(struct ibscif_dev *dev, struct ibscif_qp *qp, struct ibscif_wr *wr,
++ int hdr_size, u32 seq_num, u32 payload_size, u32 len_remaining, int force)
++{
++ struct sk_buff *skb;
++ struct ibscif_full_frame *pdu;
++
++ if (unlikely(!qp->conn && qp->ibqp.qp_type != IB_QPT_UD)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return NULL;
++ }
++
++ if (!atomic_add_unless(&dev->available, -1, 0)) {
++ printk(KERN_NOTICE PFX "%s throttled by available tx buffer limit\n", dev->ibdev.name);
++ DEV_STAT(dev, unavailable++);
++ return NULL;
++ }
++
++ /* Get an skb for this protocol packet. */
++ skb = ibscif_alloc_tx_skb(dev, hdr_size, payload_size);
++ if (unlikely(!skb))
++ goto bail;
++
++ /* Hold a reference on the module until skb->destructor is called. */
++ __module_get(THIS_MODULE);
++ skb->destructor = ibscif_skb_destructor;
++
++ SET_SKB_DEV(skb, dev);
++ SET_SKB_WR(skb, wr);
++
++ if (qp->ibqp.qp_type == IB_QPT_UD) {
++ struct ibscif_conn *conn;
++ int flag = qp->ibqp.qp_num > wr->ud.remote_qpn;
++ conn = ibscif_get_conn(qp->local_node_id, wr->ud.remote_node_id, flag);
++ if (unlikely(!conn)) {
++ kfree_skb(skb);
++ goto bail;
++ }
++
++ ibscif_qp_add_ud_conn(qp, conn);
++ ibscif_put_conn(conn);
++ SET_SKB_EP(skb, conn->ep);
++ SET_SKB_QP(skb, qp);
++
++ /* Reference UD QPs until the wr is transmitted by ibscif_xmit_work_handler */
++ kref_get(&qp->ref);
++ }
++ else {
++ SET_SKB_EP(skb, qp->conn->ep);
++ }
++
++ /* Construct the header and copy it to the skb. */
++ if (unlikely(ibscif_create_hdr(qp, wr, skb, seq_num, len_remaining, force))) {
++ kfree_skb(skb);
++ goto bail;
++ }
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(hdr_size);
++
++ return skb;
++bail:
++ atomic_inc(&dev->available);
++ return NULL;
++}
++
++static int ibscif_send_null_pdu(struct ibscif_dev *dev, struct ibscif_qp *qp, struct ibscif_wr *wr, u32 hdr_size)
++{
++ struct sk_buff *skb;
++
++ /* Allocate an initialized skb with a PDU header. */
++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, wr->sar.seg.starting_seq, 0, 0, 0);
++ if (unlikely(!skb))
++ return 0;
++
++ ibscif_dev_queue_xmit(skb);
++ return 1;
++}
++
++static int get_hdr_size_from_wr(struct ibscif_wr *wr)
++{
++ switch (wr->opcode) {
++ case WR_UD: return sizeof(struct ud_hdr);
++ case WR_SEND:
++ case WR_SEND_WITH_IMM: return sizeof(struct send_hdr);
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM: return sizeof(struct write_hdr);
++ case WR_RDMA_READ: return sizeof(struct read_req_hdr);
++ case WR_ATOMIC_CMP_AND_SWP: return sizeof(struct comp_swap_hdr);
++ case WR_ATOMIC_FETCH_AND_ADD: return sizeof(struct fetch_add_hdr);
++ case WR_RDMA_READ_RSP: return sizeof(struct read_rsp_hdr);
++ case WR_ATOMIC_RSP: return sizeof(struct atomic_rsp_hdr);
++ case WR_RMA_RSP: return sizeof(struct rma_rsp_hdr);
++ default: return 0;
++ }
++}
++
++static int get_rma_addr_size_from_wr(struct ibscif_wr *wr)
++{
++ switch (wr->opcode) {
++ case WR_UD: return 0;
++ case WR_SEND:
++ case WR_SEND_WITH_IMM:
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ case WR_RDMA_READ: return wr->num_ds * sizeof(struct rma_addr);
++ case WR_ATOMIC_CMP_AND_SWP: return 0;
++ case WR_ATOMIC_FETCH_AND_ADD: return 0;
++ case WR_RDMA_READ_RSP: return 0;
++ case WR_ATOMIC_RSP: return 0;
++ case WR_RMA_RSP: return 0;
++ default: return 0;
++ }
++}
++
++static int setup_rma_addrs(struct ibscif_wq *wq, struct ibscif_wr *wr)
++{
++ struct ibscif_ds *ds;
++ int i;
++
++ if (!wr->num_ds)
++ return 1;
++
++ for (i=0; i<wr->num_ds; i++) {
++ ds = &wr->ds_list[i];
++ if (!ds->current_mreg)
++ ds->current_mreg = ibscif_mr_get_mreg(ds->mr, wq->qp->conn);
++
++ if (!ds->current_mreg)
++ return 0;
++ }
++
++ return 1;
++}
++
++/* When necessary, SCIF will allocate a temp buffer to align up the cache line
++ * offset, so we only need to use roffset to calculate the DMA size.
++ */
++static inline int ibscif_dma_size(u32 len, u64 roffset)
++{
++ u32 head, tail;
++
++ tail = (roffset + len) % 64;
++ head = (64 - roffset % 64) % 64;
++ if (len >= head + tail)
++ return (len - head - tail);
++ else
++ return 0;
++}
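++/*
++ * Worked example (illustrative values): with roffset = 100 and len = 1000,
++ * head = (64 - 100 % 64) % 64 = 28 and tail = (100 + 1000) % 64 = 12,
++ * so the cache-line-aligned middle region eligible for DMA is
++ * 1000 - 28 - 12 = 960 bytes, running from offset 128 to 1087.
++ */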
++
++static void ibscif_send_ack(struct ibscif_qp *qp); /* defined later in this file */
++
++static int ibscif_try_fast_rdma(struct ibscif_wq *wq, struct ibscif_wr *wr)
++{
++ struct ibscif_qp *qp;
++ int i, err;
++ u64 loffset, roffset;
++ u32 total_length, rdma_length, xfer_len;
++ u64 raddress;
++ u32 rkey;
++ enum ib_access_flags access;
++ u32 dma_size = 0;
++ int rma_flag = 0;
++
++ IBSCIF_PERF_SAMPLE(2, 0);
++
++ switch (wr->opcode) {
++ case WR_RDMA_WRITE:
++ raddress = wr->write.remote_address;
++ rkey = wr->write.rkey;
++ total_length = rdma_length = wr->length;
++ access = IB_ACCESS_REMOTE_WRITE;
++ break;
++
++ case WR_RDMA_READ:
++ raddress = wr->read.remote_address;
++ rkey = wr->read.rkey;
++ total_length = rdma_length = wr->read.remote_length; /* wr->length is 0 */
++ access = IB_ACCESS_REMOTE_READ;
++ break;
++
++ default:
++ return 0;
++ }
++
++ qp = wq->qp;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return 0;
++ }
++
++ if (!setup_rma_addrs(wq, wr)) {
++ DEV_STAT(qp->dev, fast_rdma_fallback++);
++ return 0;
++ }
++
++ roffset = IBSCIF_MR_VADDR_TO_OFFSET( rkey, raddress );
++
++ for (i=0; i<wr->num_ds; i++) {
++ if (rdma_length == 0)
++ break;
++
++ loffset = wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset;
++ xfer_len = min(wr->ds_list[i].length, rdma_length);
++ if (xfer_len == 0)
++ continue;
++
++ IBSCIF_PERF_SAMPLE(3, 0);
++
++ dma_size = ibscif_dma_size(xfer_len, roffset);
++
++ if (i==wr->num_ds-1)
++ rma_flag = dma_size ? SCIF_RMA_SYNC : 0;
++
++ if (wr->opcode == WR_RDMA_WRITE) {
++ err = scif_writeto(wq->qp->conn->ep, loffset, xfer_len, roffset, rma_flag|SCIF_RMA_ORDERED);
++ if (err)
++ printk(KERN_INFO PFX "%s(): error writing ordered message, size=%d, err=%d.\n", __func__, xfer_len, err);
++ }
++ else {
++ err = scif_readfrom(wq->qp->conn->ep, loffset, xfer_len, roffset, rma_flag);
++ if (err)
++ printk(KERN_INFO PFX "%s(): error reading the message, size=%d, err=%d.\n", __func__, xfer_len, err);
++ }
++
++ IBSCIF_PERF_SAMPLE(4, 0);
++
++ if (err){
++ DEV_STAT(qp->dev, fast_rdma_fallback++);
++ return 0;
++ }
++
++ roffset += xfer_len;
++ rdma_length -= xfer_len;
++ }
++
++ if (rdma_length)
++ printk(KERN_INFO PFX "%s(): remaining rdma_length=%d.\n", __func__, rdma_length);
++
++ IBSCIF_PERF_SAMPLE(5, 0);
++
++ /* complete the wr */
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ wr->state = WR_COMPLETED;
++ wr->sar.rea.final_length = total_length - rdma_length;
++
++ /* we can't call ibscif_process_sq_completions here because we are holding the sq lock.
++ * set the flag and let the upper level make the call */
++ wq->fast_rdma_completions = 1;
++
++ if (wr->opcode == WR_RDMA_WRITE)
++ DEV_STAT(qp->dev, fast_rdma_write++);
++ else
++ DEV_STAT(qp->dev, fast_rdma_read++);
++
++ /* The fast rdma protocol doesn't send any packet, and thus cannot piggyback an ack
++ * for the peer. Send a separate ack packet when necessary. */
++ if (qp->wire.sq.rx.last_seq_acked < qp->wire.sq.rx.last_in_seq ||
++ qp->wire.iq.rx.last_seq_acked < qp->wire.iq.rx.last_in_seq) {
++ ibscif_send_ack(qp);
++ DEV_STAT(qp->dev, fast_rdma_force_ack++);
++ }
++
++ IBSCIF_PERF_SAMPLE(8, 0);
++
++ return 1;
++}
++
++/*
++ * Setup for a fresh data descriptor.
++ */
++#define DS_SETUP(ds, mr, page_offset, page_index, ds_len_left) \
++do { \
++ mr = ds->mr; \
++ ds_len_left = ds->length; \
++ page_offset = ds->offset + (mr->addr & ~PAGE_MASK); \
++ page_index = page_offset >> PAGE_SHIFT; \
++ page_offset &= ~PAGE_MASK; \
++} while(0)
++
++/*
++ * Setup for page crossing within a data descriptor.
++ */
++#define NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left) \
++do { \
++ if (!ds_len_left) { \
++ ds++; \
++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left); \
++ } else { \
++ page_index++; \
++ BUG_ON(!(mr->npages > page_index)); \
++ page_offset = 0; \
++ } \
++} while(0)
++
++/*
++ * Setup the data descriptor, page, and offset for specified sequence number
++ */
++#define SETUP_BY_SEQ(wr, ds, mr, from_seq, wr_length, page_offset, page_index, \
++ ds_len_left, max_payload) \
++do { \
++ u32 i, frag_len_max; \
++ \
++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left); \
++ for (i = wr->sar.seg.starting_seq; seq_before(i, from_seq); i++) { \
++ num_frags = 0; \
++ payload_left = max_payload; \
++ while (payload_left && (num_frags < MAX_SKB_FRAGS)) { \
++ frag_len_max = min(ds_len_left, (u32)(PAGE_SIZE - page_offset));\
++ if (wr_length > payload_left) { \
++ if (payload_left > frag_len_max) { \
++ ds_len_left -= frag_len_max; \
++ NEXT_PAGE(ds, mr, page_offset, \
++ page_index, ds_len_left); \
++ } else { \
++ frag_len_max = payload_left; /* frag->size */ \
++ ds_len_left -= payload_left; \
++ page_offset += payload_left; \
++ } \
++ } else { \
++ if (wr_length > frag_len_max) { \
++ ds_len_left -= frag_len_max; \
++ NEXT_PAGE(ds, mr, page_offset, \
++ page_index, ds_len_left); \
++ } else { \
++ printk(KERN_ERR PFX \
++ "from_seq (%d) botch wr %p opcode %d length %d\n", \
++ from_seq, wr, wr->opcode, wr_length); \
++ return 0; \
++ } \
++ } \
++ wr_length -= frag_len_max; \
++ payload_left -= frag_len_max; \
++ num_frags++; \
++ } \
++ } \
++} while(0)
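++/*
++ * SETUP_BY_SEQ replays the fragmentation loop above (without building skbs)
++ * from wr->sar.seg.starting_seq up to, but not including, from_seq, leaving
++ * ds, mr, page_index and page_offset positioned at the first byte that
++ * belongs to from_seq; it is intended for cases where transmission must
++ * resume from an arbitrary sequence number, e.g. on a retransmit.
++ */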
++
++int ibscif_xmit_wr(struct ibscif_wq *wq, struct ibscif_wr *wr, int tx_limit, int retransmit, u32 from_seq, u32 *posted)
++{
++ struct ibscif_dev *dev;
++ struct ibscif_qp *qp;
++ struct ibscif_ds *ds;
++ struct ibscif_mr *mr;
++ int hdr_size, page_index, num_frags, num_xmited;
++ u32 max_payload, wr_length, page_offset, ds_len_left, payload_left;
++
++ /* Try to process RDMA read/write directly with SCIF functions.
++ * The usual reason for failure is that the remote memory has not yet been
++ * registered with SCIF. The normal packet based path should handle that.
++ */
++ if (host_proxy && wq->qp->local_node_id>0 && wq->qp->remote_node_id==0) {
++ /* don't try fast rdma because we want to let the host do the data transfer */
++ }
++ else if (fast_rdma) {
++ num_xmited = 0;
++ if (ibscif_try_fast_rdma(wq, wr))
++ goto finish2;
++ }
++
++ if (!tx_limit) {
++ printk(KERN_INFO PFX "%s() called with tx_limit of zero\n", __func__);
++ return 0;
++ }
++
++ qp = wq->qp;
++ dev = qp->dev;
++ hdr_size = get_hdr_size_from_wr(wr);
++ max_payload = qp->mtu - hdr_size;
++
++ if (wr->use_rma) {
++ struct sk_buff *skb;
++
++ wr_length = wr->length;
++ wr->sar.seg.starting_seq = from_seq;
++ wr->sar.seg.ending_seq = from_seq;
++ wr->state = WR_STARTED;
++
++ num_xmited = 0;
++ if (setup_rma_addrs(wq, wr)) {
++ /* Make room in the header for RMA addresses */
++ hdr_size += get_rma_addr_size_from_wr(wr);
++
++ /* Allocate an initialized skb with PDU header. */
++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, from_seq, 0, wr_length, 0);
++ if (likely(skb)) {
++ ibscif_dev_queue_xmit(skb);
++ num_xmited++;
++ from_seq++;
++ }
++ }
++ else
++ printk(KERN_ALERT PFX "%s: fail to set up RMA addresses for the work request.\n", __func__);
++
++ goto finish;
++ }
++
++ if (!wr->sar.seg.current_ds) {
++ /*
++ * This is a fresh send, so initialize the wr by setting the static
++ * parts of the header and sequence number range for this wr.
++ */
++ wr_length = wr->length;
++ wr->sar.seg.starting_seq = from_seq;
++ wr->sar.seg.ending_seq = from_seq;
++ if (wr_length > max_payload) {
++ wr->sar.seg.ending_seq += (wr_length / max_payload);
++ if (!(wr_length % max_payload))
++ wr->sar.seg.ending_seq--;
++ }
++
++ wr->state = WR_STARTED;
++
++ /*
++ * If this request has a payload, setup for fragmentation.
++ * Otherwise, send it on its way.
++ */
++ if (wr_length) {
++ ds = wr->ds_list;
++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left);
++ } else {
++ num_xmited = ibscif_send_null_pdu(dev, qp, wr, hdr_size);
++ /* from_seq must always be advanced, even in null PDU cases. */
++ from_seq++;
++ goto finish;
++ }
++ } else {
++ /* We're picking up from a partially sent request. */
++ ds = wr->sar.seg.current_ds;
++ mr = ds->mr;
++ wr_length = wr->sar.seg.wr_length_remaining;
++ ds_len_left = wr->sar.seg.ds_length_remaining;
++ page_index = wr->sar.seg.current_page_index;
++ page_offset = wr->sar.seg.current_page_offset;
++ from_seq = wr->sar.seg.next_seq;
++ }
++
++ /* Ok, let's break this bad-boy up. */
++ num_xmited = 0;
++ while (wr_length && (num_xmited < tx_limit) && (qp->state == QP_CONNECTED)) {
++ struct sk_buff *skb;
++ skb_frag_t *frag;
++
++ /* Allocate an initialized skb with PDU header. */
++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, from_seq, min(wr_length, max_payload),
++ wr_length, retransmit && (num_xmited == (tx_limit - 1)));
++ if (unlikely(!skb))
++ break;
++
++ /* Update sequence number for next pass. */
++ from_seq++;
++
++ /* Fill the skb fragment list. */
++ frag = skb_shinfo(skb)->frags;
++ num_frags = 0;
++ payload_left = max_payload;
++
++ while (payload_left && (num_frags < MAX_SKB_FRAGS)) {
++ u32 frag_len_max;
++
++ SET_PAGE(frag, mr->page[page_index]);
++ frag->page_offset = page_offset;
++
++ /* Take a reference on the page - kfree_skb will release. */
++ GET_PAGE(frag);
++
++ frag_len_max = min(ds_len_left, (u32)(PAGE_SIZE - page_offset));
++ if (wr_length > payload_left) {
++ if (payload_left > frag_len_max) {
++ /* Deal with page boundary crossing. */
++ frag->size = frag_len_max;
++ ds_len_left -= frag_len_max;
++ NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left);
++ } else {
++ frag->size = payload_left;
++ ds_len_left -= payload_left;
++ page_offset += payload_left;
++ }
++ } else {
++ if (wr_length > frag_len_max) {
++ /* Deal with page boundary crossing. */
++ frag->size = frag_len_max;
++ ds_len_left -= frag_len_max;
++ NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left);
++ } else {
++ frag->size = wr_length;
++ payload_left -= wr_length;
++ wr_length = 0;
++ num_frags++; /* Change from index to number. */
++ break;
++ }
++ }
++
++ wr_length -= frag->size;
++ payload_left -= frag->size;
++ num_frags++;
++ frag++;
++ }
++ skb_shinfo(skb)->nr_frags = num_frags;
++
++ /* Check if we need to do a fixup because we ran out of frags. */
++ if ((num_frags == MAX_SKB_FRAGS) && wr_length) {
++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data;
++ skb->len = hdr_size + (max_payload - payload_left);
++ skb->data_len = (max_payload - payload_left);
++ pdu->ibscif.hdr.length = __cpu_to_be16(skb->data_len);
++ pdu->ibscif.hdr.opcode = __cpu_to_be16(__be16_to_cpu(pdu->ibscif.hdr.opcode) & ~ibscif_last_flag);
++ }
++
++ /*
++ * Send it.
++ */
++ ibscif_dev_queue_xmit(skb);
++ num_xmited++;
++ }
++
++ /*
++ * Update state. If this is a retransmit, don't update anything. If not and
++ * there's more to do on the wr, save state. Otherwise, setup for next wr.
++ */
++ if (wr_length && !wr->use_rma) {
++ wr->sar.seg.current_ds = ds;
++ wr->sar.seg.wr_length_remaining = wr_length;
++ wr->sar.seg.ds_length_remaining = ds_len_left;
++ wr->sar.seg.current_page_index = page_index;
++ wr->sar.seg.current_page_offset = page_offset;
++ } else {
++finish:
++ if (wr->opcode != WR_UD)
++ wr->state = WR_WAITING_FOR_ACK;
++finish2:
++ wq->next_wr = (wq->next_wr + 1) % wq->size;
++ }
++ wr->sar.seg.next_seq = from_seq;
++ if (posted)
++ *posted = from_seq;
++
++ return num_xmited;
++}
++
++static struct sk_buff *ibscif_create_disconnect_hdr(struct ibscif_dev *dev, u32 src_qpn,
++ u32 dst_qpn, enum ibscif_reason reason)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_alloc_tx_skb(dev, sizeof pdu->ibscif.disconnect, 0);
++ if (unlikely(!skb)) {
++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__);
++ return NULL;
++ }
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ /* The eth_hdr and ack fields are set by the caller. */
++ pdu->ibscif.disconnect.hdr.opcode = __cpu_to_be16(ibscif_op_disconnect);
++ pdu->ibscif.disconnect.hdr.length = 0; /* Length has no meaning. */
++ pdu->ibscif.disconnect.hdr.dst_qp = __cpu_to_be32(dst_qpn);
++ pdu->ibscif.disconnect.hdr.src_qp = __cpu_to_be32(src_qpn);
++ pdu->ibscif.disconnect.hdr.seq_num = 0; /* seq_num has no meaning. */
++ pdu->ibscif.disconnect.hdr.hdr_size = __cpu_to_be16(sizeof(pdu->ibscif.disconnect));
++ pdu->ibscif.disconnect.reason = __cpu_to_be32(reason);
++
++ SET_SKB_DEV(skb, dev);
++ SET_SKB_WR(skb, NULL);
++
++ return skb;
++}
++
++void ibscif_send_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ struct ibscif_dev *dev = qp->dev;
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ if (qp->ibqp.qp_type == IB_QPT_UD)
++ return;
++
++ if (qp->loopback) {
++ ibscif_loopback_disconnect(qp, reason);
++ return;
++ }
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return;
++ }
++
++ skb = ibscif_create_disconnect_hdr(dev, qp->ibqp.qp_num, qp->remote_qpn, reason);
++ if (unlikely(!skb))
++ return;
++
++ SET_SKB_EP(skb, qp->conn->ep);
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ pdu->ibscif.disconnect.hdr.sq_ack_num = __cpu_to_be32(qp->wire.sq.rx.last_in_seq);
++ pdu->ibscif.disconnect.hdr.iq_ack_num = __cpu_to_be32(qp->wire.iq.rx.last_in_seq);
++
++ ibscif_dev_queue_xmit(skb);
++}
++
++void ibscif_reflect_disconnect(struct ibscif_qp *qp, struct base_hdr *hdr, struct sk_buff *in_skb, enum ibscif_reason reason)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ if (!qp || IS_ERR(qp)) {
++ if (qp != ERR_PTR(-ENOENT) && verbose)
++ printk(KERN_ALERT PFX "%s: qp=%p hdr=%p in_skb=%p reason=%d\n", __func__, qp, hdr, in_skb, reason);
++ return;
++ }
++
++ /* Don't send a disconnect for a disconnect. */
++ if (ibscif_pdu_base_type(hdr->opcode) == ibscif_op_disconnect)
++ return;
++
++ if (!qp->conn || !qp->conn->ep)
++ return;
++
++ skb = ibscif_create_disconnect_hdr((void *)in_skb->dev, hdr->dst_qp, hdr->src_qp, reason);
++ if (unlikely(!skb))
++ return;
++
++ SET_SKB_EP(skb, qp->conn->ep);
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ pdu->ibscif.disconnect.hdr.sq_ack_num = 0; /* sq_ack_num has no meaning. */
++ pdu->ibscif.disconnect.hdr.iq_ack_num = 0; /* iq_ack_num has no meaning. */
++
++ ibscif_dev_queue_xmit(skb);
++}
++
++static struct sk_buff *ibscif_create_ack_hdr(struct ibscif_qp *qp, int size)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++ u32 sq_seq, iq_seq;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return NULL;
++ }
++
++ skb = ibscif_alloc_tx_skb(qp->dev, size, 0);
++ if (unlikely(!skb)) {
++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__);
++ return NULL;
++ }
++
++ SET_SKB_DEV(skb, qp->dev);
++ SET_SKB_WR(skb, NULL);
++ SET_SKB_EP(skb, qp->conn->ep);
++
++ sq_seq = qp->wire.sq.rx.last_in_seq;
++ iq_seq = qp->wire.iq.rx.last_in_seq;
++ qp->wire.sq.rx.last_seq_acked = sq_seq;
++ qp->wire.iq.rx.last_seq_acked = iq_seq;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ /* The opcode field is set by the caller. */
++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */
++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(qp->remote_qpn);
++ pdu->ibscif.hdr.src_qp = __cpu_to_be32(qp->ibqp.qp_num);
++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. */
++ pdu->ibscif.hdr.sq_ack_num = __cpu_to_be32(sq_seq);
++ pdu->ibscif.hdr.iq_ack_num = __cpu_to_be32(iq_seq);
++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size);
++
++ return skb;
++}
++
++static void ibscif_send_ack(struct ibscif_qp *qp)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_ack_hdr(qp, sizeof pdu->ibscif.ack);
++ if (unlikely(!skb))
++ return;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.ack.hdr.opcode = __cpu_to_be16(ibscif_op_ack);
++
++ ibscif_dev_queue_xmit(skb);
++}
++
++static struct sk_buff *ibscif_create_close_hdr(struct ibscif_conn *conn, int size)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ if (unlikely(!conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: conn == NULL\n", __func__);
++ return NULL;
++ }
++
++ skb = ibscif_alloc_tx_skb(conn->dev, size, 0);
++ if (unlikely(!skb)) {
++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__);
++ return NULL;
++ }
++
++ SET_SKB_DEV(skb, conn->dev);
++ SET_SKB_WR(skb, NULL);
++ SET_SKB_EP(skb, conn->ep);
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ /* The opcode field is set by the caller. */
++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */
++ pdu->ibscif.hdr.dst_qp = 0; /* unused */
++ pdu->ibscif.hdr.src_qp = 0; /* unused */
++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. */
++ pdu->ibscif.hdr.sq_ack_num = 0; /* unused */
++ pdu->ibscif.hdr.iq_ack_num = 0; /* unused */
++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size);
++
++ return skb;
++}
++
++void ibscif_send_close(struct ibscif_conn *conn)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_close_hdr(conn, sizeof pdu->ibscif.close);
++ if (unlikely(!skb))
++ return;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.close.hdr.opcode = __cpu_to_be16(ibscif_op_close);
++
++ ibscif_dev_queue_xmit(skb);
++}
++
++void ibscif_send_reopen(struct ibscif_conn *conn)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_close_hdr(conn, sizeof pdu->ibscif.close);
++ if (unlikely(!skb))
++ return;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.close.hdr.opcode = __cpu_to_be16(ibscif_op_reopen);
++
++ ibscif_dev_queue_xmit(skb);
++}
++
++static struct sk_buff *ibscif_create_cm_hdr(struct ibscif_conn *conn, int size)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ if (unlikely(!conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: conn == NULL\n", __func__);
++ return NULL;
++ }
++
++ skb = ibscif_alloc_tx_skb(conn->dev, size, 0);
++ if (unlikely(!skb)) {
++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__);
++ return NULL;
++ }
++
++ SET_SKB_DEV(skb, conn->dev);
++ SET_SKB_WR(skb, NULL);
++ SET_SKB_EP(skb, conn->ep);
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++
++ pdu->ibscif.hdr.opcode = __cpu_to_be16(ibscif_op_cm);
++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */
++ pdu->ibscif.hdr.dst_qp = 0; /* unused */
++ pdu->ibscif.hdr.src_qp = 0; /* unused */
++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. */
++ pdu->ibscif.hdr.sq_ack_num = 0; /* unused */
++ pdu->ibscif.hdr.iq_ack_num = 0; /* unused */
++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size);
++
++ return skb;
++}
++
++int ibscif_send_cm_req(struct ibscif_cm *cm_ctx)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + cm_ctx->plen);
++ if (unlikely(!skb))
++ return -ENOMEM;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.cm.req_ctx = __cpu_to_be64((u64)(uintptr_t)cm_ctx);
++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REQ);
++ pdu->ibscif.cm.port = __cpu_to_be32((u32)cm_ctx->remote_addr.sin_port);
++ pdu->ibscif.cm.qpn = __cpu_to_be32(cm_ctx->qpn);
++ pdu->ibscif.cm.plen = __cpu_to_be32(cm_ctx->plen);
++ memcpy(pdu->ibscif.cm.pdata, cm_ctx->pdata, cm_ctx->plen);
++
++ ibscif_dev_queue_xmit(skb);
++
++ return 0;
++}
++
++int ibscif_send_cm_rep(struct ibscif_cm *cm_ctx)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + cm_ctx->plen);
++ if (unlikely(!skb))
++ return -ENOMEM;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.cm.req_ctx = __cpu_to_be64(cm_ctx->peer_context);
++ pdu->ibscif.cm.rep_ctx = __cpu_to_be64((u64)(uintptr_t)cm_ctx);
++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REP);
++ pdu->ibscif.cm.qpn = __cpu_to_be32(cm_ctx->qpn);
++ pdu->ibscif.cm.status = __cpu_to_be32(0);
++ pdu->ibscif.cm.plen = __cpu_to_be32(cm_ctx->plen);
++ memcpy(pdu->ibscif.cm.pdata, cm_ctx->pdata, cm_ctx->plen);
++
++ ibscif_dev_queue_xmit(skb);
++
++ return 0;
++}
++
++int ibscif_send_cm_rej(struct ibscif_cm *cm_ctx, const void *pdata, u8 plen)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + plen);
++ if (unlikely(!skb))
++ return -ENOMEM;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.cm.req_ctx = __cpu_to_be64(cm_ctx->peer_context);
++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REJ);
++ pdu->ibscif.cm.status = __cpu_to_be32(-ECONNREFUSED);
++ pdu->ibscif.cm.plen = __cpu_to_be32((u32)plen);
++ memcpy(pdu->ibscif.cm.pdata, pdata, plen);
++
++ ibscif_dev_queue_xmit(skb);
++
++ return 0;
++}
++
++int ibscif_send_cm_rtu(struct ibscif_cm *cm_ctx)
++{
++ struct ibscif_full_frame *pdu;
++ struct sk_buff *skb;
++
++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm);
++ if (unlikely(!skb))
++ return -ENOMEM;
++
++ pdu = (struct ibscif_full_frame *)skb->data;
++ pdu->ibscif.cm.rep_ctx = __cpu_to_be64(cm_ctx->peer_context);
++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_RTU);
++
++ ibscif_dev_queue_xmit(skb);
++
++ return 0;
++}
++
++/* ---------------------- tx routines above this line ---------------------- */
++/* ---------------------- rx routines below this line ---------------------- */
++
++static void ibscif_protocol_error(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ printk(KERN_NOTICE PFX "Disconnect due to protocol error %d\n", reason);
++ ibscif_qp_internal_disconnect(qp, reason);
++}
++
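++/*
++ * Walk the SQ from the last completion point, retiring WRs already
++ * marked WR_COMPLETED. A CQE is posted only for signaled WRs; the reap
++ * count batches any preceding unsignaled WRs into that CQE. IQ-type
++ * requests also release their outstanding-read throttling slots here.
++ */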
++int ibscif_process_sq_completions(struct ibscif_qp *qp)
++{
++ struct ibscif_cq *cq = to_cq(qp->ibqp.send_cq);
++ struct ibscif_wq *sq = &qp->sq;
++ struct ibscif_wr *wr;
++ struct ibscif_wc *wc;
++ int index, err = 0, i;
++
++ spin_lock_bh(&sq->lock);
++
++ /* Prevent divide by zero traps on wrap math. */
++ if (!sq->size)
++ goto out;
++
++ /* Iterate the send queue looking for deferred completions. */
++ for (i=sq->completions; i<sq->depth; i++) {
++ index = (sq->head + i) % sq->size;
++
++ wr = ibscif_get_wr(sq, index);
++ if (wr->state != WR_COMPLETED)
++ break;
++
++ sq->completions++;
++ sq->reap++;
++
++ /* An IQ request has been completed; update the throttling variables. */
++ if ((wr->opcode == WR_RDMA_READ) ||
++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) ||
++ (wr->opcode == WR_ATOMIC_FETCH_AND_ADD)) {
++ BUG_ON(!atomic_read(&qp->or_depth));
++ atomic_dec(&qp->or_depth);
++ atomic_dec(&qp->or_posted);
++ }
++
++ /* See if we need to generate a completion. */
++ if (!(wr->flags & IB_SEND_SIGNALED))
++ continue;
++
++ err = ibscif_reserve_cqe(cq, &wc);
++ if (unlikely(err))
++ break;
++
++ wc->ibwc.qp = &qp->ibqp;
++ wc->ibwc.src_qp = qp->remote_qpn;
++ wc->ibwc.wr_id = wr->id;
++ wc->ibwc.opcode = to_ib_wc_opcode(wr->opcode);
++ wc->ibwc.wc_flags = (((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ||
++ ((enum ib_wr_opcode)wr->opcode == IB_WR_SEND_WITH_IMM)) ?
++ IB_WC_WITH_IMM : 0;
++ wc->ibwc.status = IB_WC_SUCCESS;
++ wc->ibwc.ex.imm_data = 0;
++ wc->ibwc.port_num = 1;
++ wc->ibwc.byte_len = (((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_READ) ||
++ ((enum ib_wr_opcode)wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) ||
++ ((enum ib_wr_opcode)wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) ?
++ wr->sar.rea.final_length : 0;
++ wc->wq = sq;
++ wc->reap = sq->reap;
++ sq->reap = 0;
++
++ ibscif_append_cqe(cq, wc, 0);
++ }
++out:
++ spin_unlock_bh(&sq->lock);
++
++ ibscif_notify_cq(cq);
++ return err;
++}
++
++static int ibscif_schedule_rx_completions(struct ibscif_qp *qp, int iq_flag, struct ibscif_rx_state *rx)
++{
++ struct ibscif_cq *cq = to_cq(qp->ibqp.recv_cq);
++ struct ibscif_wq *wq;
++ struct ibscif_wr *wr;
++ struct ibscif_wc *wc;
++ u32 last_in_seq;
++ int index, err, i;
++
++ wq = iq_flag ? &qp->sq /* yep, the SQ */ : &qp->rq;
++ last_in_seq = rx->last_in_seq;
++
++ /* Prevent divide by zero traps on wrap math. */
++ if (!wq->size)
++ return 0;
++
++ spin_lock_bh(&wq->lock);
++ for (i=wq->completions; i<wq->depth; i++) {
++ index = (wq->head + i) % wq->size;
++
++ wr = ibscif_get_wr(wq, index);
++
++ /* Skip over non-IQ entries. */
++ if (iq_flag &&
++ ((wr->opcode == WR_UD) ||
++ (wr->opcode == WR_SEND) ||
++ (wr->opcode == WR_SEND_WITH_IMM) ||
++ (wr->opcode == WR_RDMA_WRITE) ||
++ (wr->opcode == WR_RDMA_WRITE_WITH_IMM)))
++ continue;
++
++ /*
++ * If this WR hasn't seen the final segment in sequence then
++ * there is nothing more to process in this queue. We use the
++ * last seen state as a qualifier because last_packet_seq will
++ * be uninitialized until the last packet is seen.
++ */
++ if ((wr->state != WR_LAST_SEEN) ||
++ seq_before(last_in_seq, wr->sar.rea.last_packet_seq))
++ break;
++
++ /* Clear references on memory regions. */
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++
++ if (iq_flag) {
++ /*
++ * Completed IQ replies are deferred until earlier
++ * non-IQ WRs have completed. This is determined
++ * with a second iteration of the WQ below.
++ */
++ wr->state = WR_COMPLETED;
++ continue; /* Look for more IQ completions. */
++ }
++
++ /* All receive queue completions are done here. */
++ err = ibscif_reserve_cqe(cq, &wc);
++ if (unlikely(err)) {
++ spin_unlock_bh(&wq->lock);
++ return err;
++ }
++
++ wc->ibwc.qp = &qp->ibqp;
++ wc->ibwc.src_qp = qp->remote_qpn;
++ wc->ibwc.wr_id = wr->id;
++ wc->ibwc.status = IB_WC_SUCCESS;
++ wc->ibwc.byte_len = wr->sar.rea.final_length;
++ wc->ibwc.port_num = 1;
++
++ if (ibscif_pdu_is_immed(wr->sar.rea.opcode)) {
++ DEV_STAT(qp->dev, recv_imm++);
++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
++ wc->ibwc.ex.imm_data = wr->sar.rea.immediate_data;
++ } else {
++ DEV_STAT(qp->dev, recv++);
++ wc->ibwc.opcode = IB_WC_RECV;
++ wc->ibwc.ex.imm_data = 0;
++ }
++
++ wc->wq = wq;
++ wc->reap = 1;
++ wq->completions++;
++
++ ibscif_append_cqe(cq, wc, !!ibscif_pdu_is_se(wr->sar.rea.opcode));
++ }
++ spin_unlock_bh(&wq->lock);
++
++ /* If this was the receive queue, there is no more processing to be done. */
++ if (!iq_flag) {
++ ibscif_notify_cq(cq);
++ return 0;
++ }
++
++ err = ibscif_process_sq_completions(qp);
++ if (unlikely(err))
++ return err;
++
++ /*
++ * If we just created room for a backlogged IQ stream request
++ * and there is a tx window, reschedule to get it sent.
++ */
++ if ((atomic_read(&qp->or_posted) > atomic_read(&qp->or_depth)) &&
++ (atomic_read(&qp->or_depth) < qp->max_or) &&
++ ibscif_tx_window(&qp->wire.sq.tx))
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_SQ;
++
++ return 0;
++}
++
++static enum ibscif_schedule ibscif_process_wq_ack(struct ibscif_wq *wq, u32 seq_num)
++{
++ struct ibscif_tx_state *tx = &wq->wirestate->tx;
++ enum ibscif_schedule status = 0;
++ int throttled, index, err = 0, i;
++
++ if (!wq->size || !wq->depth)
++ return 0;
++
++ /* If this is old news, get out. */
++ if (!seq_after(seq_num, tx->last_ack_seq_recvd))
++ return 0;
++
++ /* Capture if window was closed before updating. */
++ throttled = !ibscif_tx_window(tx);
++ tx->last_ack_seq_recvd = seq_num;
++
++ /*
++ * If we were throttled and now have an open window, or are
++ * simply up to date, resume streaming transfers. This
++ * can be overwritten with other schedule states below.
++ */
++ if (throttled && ibscif_tx_window(tx))
++ status = SCHEDULE_RESUME;
++
++ spin_lock_bh(&wq->lock);
++ for (i=wq->completions; i<wq->depth; i++) {
++ struct ibscif_wr *wr;
++
++ index = (wq->head + i) % wq->size;
++
++ wr = ibscif_get_wr(wq, index);
++
++ /* Get out if the WR hasn't been scheduled. */
++ if (wr->state == WR_WAITING)
++ break;
++
++ if (seq_after(wr->sar.seg.ending_seq, seq_num)) {
++
++ if ((wr->state == WR_STARTED) && !ibscif_tx_unacked_window(tx))
++ status = SCHEDULE_RESUME;
++
++ break;
++ }
++
++ /* We seem to have a completed WQ element. */
++
++ if (is_iq(wq)) {
++ /*
++ * We have a completed IQ reply.
++ * Clear references to the memory region.
++ */
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++
++ /*
++ * It's more efficient to retire an IQ wqe manually
++ * here instead of calling ibscif_retire_wqes().
++ */
++ wq->head = (wq->head + 1) % wq->size;
++ wq->depth -= 1;
++
++ } else if ((wr->opcode == WR_RDMA_READ) ||
++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) ||
++ (wr->opcode == WR_ATOMIC_FETCH_AND_ADD)||
++ (wr->opcode == WR_UD && wr->use_rma) ||
++ (wr->opcode == WR_SEND && wr->use_rma) ||
++ (wr->opcode == WR_SEND_WITH_IMM && wr->use_rma) ||
++ (wr->opcode == WR_RDMA_WRITE && wr->use_rma) ||
++ (wr->opcode == WR_RDMA_WRITE_WITH_IMM && wr->use_rma)) {
++ /*
++ * We have a request acknowledgment.
++ * Note the state change so it isn't retried.
++ *
++ * BTW, these request types are completed in the
++ * ibscif_schedule_rx_completions() routine when
++ * the data has arrived.
++ */
++ if (wr->state == WR_WAITING_FOR_ACK)
++ wr->state = WR_WAITING_FOR_RSP;
++
++ } else if (wr->state != WR_COMPLETED) {
++ /* Request is complete so no need to keep references. */
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ wr->state = WR_COMPLETED;
++ }
++ }
++ spin_unlock_bh(&wq->lock);
++
++ if (is_sq(wq)) {
++ err = ibscif_process_sq_completions(wq->qp);
++ if (unlikely(err)) {
++ printk(KERN_ALERT PFX "%s: sq completion error: err=%d \n", __func__, err);
++ ibscif_protocol_error(wq->qp, IBSCIF_REASON_QP_FATAL);
++ status = 0;
++ }
++ }
++
++ return status;
++}
++
++static void ibscif_process_ack(struct ibscif_qp *qp, struct base_hdr *hdr)
++{
++ qp->schedule |= ibscif_process_wq_ack(&qp->sq, hdr->sq_ack_num) | SCHEDULE_SQ;
++ qp->schedule |= ibscif_process_wq_ack(&qp->iq, hdr->iq_ack_num) | SCHEDULE_IQ;
++}
++
++/* Note that the WQ lock is held on success. */
++static struct ibscif_wr *ibscif_reserve_wqe(struct ibscif_wq *wq)
++{
++ int err;
++
++ spin_lock_bh(&wq->lock);
++
++ if (unlikely(wq->qp->state != QP_CONNECTED)) {
++ err = -ENOTCONN;
++ goto out;
++ }
++ if (unlikely(!wq->size)) {
++ err = -ENOSPC;
++ goto out;
++ }
++ if (unlikely(wq->depth == wq->size)) {
++ err = -ENOBUFS;
++ goto out;
++ }
++
++ return ibscif_get_wr(wq, wq->tail);
++out:
++ spin_unlock_bh(&wq->lock);
++ return ERR_PTR(err);
++}
++
++/* Note that this assumes the WQ lock is currently held. */
++static void ibscif_append_wqe(struct ibscif_wq *wq)
++{
++ DEV_STAT(wq->qp->dev, wr_opcode[ibscif_get_wr(wq, wq->tail)->opcode]++);
++ ibscif_append_wq(wq);
++ spin_unlock_bh(&wq->lock);
++}
++
++static struct ibscif_wr* ibscif_wr_by_msg_id(struct ibscif_wq *wq, u32 msg_id)
++{
++ struct ibscif_wr *wr;
++ int size = wq->size;
++
++ if (!size)
++ return NULL;
++
++ wr = ibscif_get_wr(wq, msg_id % size);
++ if (wr->use_rma)
++ return (wr->rma_id == msg_id) ? wr : NULL;
++ else
++ return (wr->msg_id == msg_id) ? wr : NULL;
++}
++
++static int ibscif_ds_dma(struct ibscif_qp *qp, struct page **page, u32 page_offset, struct sk_buff *skb, u32 dma_len, int head_copied)
++{
++ void *dst, *src = skb->data;
++ u32 copy_len;
++
++ while (dma_len) {
++ copy_len = min(dma_len, (u32)PAGE_SIZE - page_offset);
++
++ dst = ibscif_map_dst(*page) + page_offset;
++ head_copied = ibscif_atomic_copy(dst, src, copy_len, head_copied);
++ ibscif_unmap_dst(*page, dst);
++
++ src += copy_len;
++ dma_len -= copy_len;
++
++ page++;
++ page_offset = 0;
++ }
++
++ return head_copied;
++}
++
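++/*
++ * Copy an in-sequence payload from the skb into the WR's local data
++ * segments. When the previous PDU of the same message was the
++ * immediately preceding sequence number, placement resumes from the
++ * cached descriptor and offset; otherwise the segment list is walked
++ * from the supplied offset and each MR is revalidated on first use.
++ */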
++static int ibscif_place_data(struct ibscif_qp *qp, struct ibscif_wr *wr, struct sk_buff *skb,
++ u32 length, u32 offset, u32 seq_num)
++{
++ struct ibscif_ds *ds;
++ struct ibscif_mr *mr;
++ int seg_num, page_index;
++ u32 dma_len, ds_offset, page_offset;
++ int head_copied = 0;
++
++ if (!length) {
++ ds = NULL;
++ dma_len = 0;
++ ds_offset = 0;
++ goto no_data;
++ }
++
++ /* See if we can use our ds cache. */
++ if (likely((wr->sar.rea.current_ds) && (wr->sar.rea.last_seen_seq == seq_num - 1))) {
++ /* Take the cached entries. */
++ ds = wr->sar.rea.current_ds;
++ mr = ds->mr;
++ ds_offset = wr->sar.rea.current_ds_offset;
++ seg_num = ds - wr->ds_list; /* pointer difference is already an element count */
++ } else {
++ ds_offset = offset;
++ ds = wr->ds_list;
++ seg_num = 0;
++ while ((ds_offset >= ds->length) && (seg_num < wr->num_ds)) {
++ ds_offset -= ds->length;
++ ds++;
++ seg_num++;
++ }
++next_ds:
++ if (unlikely(seg_num >= wr->num_ds))
++ return -EMSGSIZE;
++ /*
++ * A memory region which may have posted receives against it can
++ * still be freed, therefore, we need to burn the cycles here to
++ * make sure it's still valid. We'll take a reference on it now
++ * that data is coming in.
++ */
++ if (!ds->in_use) {
++ mr = ibscif_get_mr(ds->lkey);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++ ds->in_use = 1;
++ if (unlikely(mr != ds->mr))
++ return -ENXIO;
++ if (unlikely(!(mr->access & IB_ACCESS_LOCAL_WRITE)))
++ return -EACCES;
++ } else
++ mr = ds->mr;
++ }
++
++ /* Place data for this descriptor. Routine will handle page boundary crossings. */
++ page_offset = ds->offset + ds_offset + (mr->addr & ~PAGE_MASK);
++ page_index = page_offset >> PAGE_SHIFT;
++ page_offset &= ~PAGE_MASK;
++
++ dma_len = min(ds->length - ds_offset, length);
++ head_copied = ibscif_ds_dma(qp, &mr->page[page_index], page_offset, skb, dma_len, head_copied);
++ length -= dma_len;
++ if (length) {
++ ds++;
++ seg_num++;
++ ds_offset = 0;
++ skb_pull(skb, dma_len);
++ goto next_ds;
++ }
++no_data:
++ wr->sar.rea.last_seen_seq = seq_num;
++
++ if (ds && ((ds_offset + dma_len) < ds->length)) {
++ wr->sar.rea.current_ds = ds;
++ wr->sar.rea.current_ds_offset = ds_offset + dma_len;
++ } else
++ wr->sar.rea.current_ds = NULL; /* Force a validation of the next ds. */
++
++ return 0;
++}
++
++static int ibscif_process_ud(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ int err;
++ int grh_size = 40;
++ int msg_id;
++
++ if (unlikely(qp->ibqp.qp_type != IB_QPT_UD)) {
++ printk(KERN_ALERT PFX "%s: UD packet received on non-UD QP\n", __func__);
++ return -EINVAL;
++ }
++
++ pdu->ud.msg_length = __be32_to_cpu(pdu->ud.msg_length);
++ pdu->ud.msg_offset = __be32_to_cpu(pdu->ud.msg_offset);
++
++ /* Only one pdu is allowed per UD packet; otherwise drop the pdu */
++ if (unlikely(pdu->ud.msg_length != pdu->hdr.length || pdu->ud.msg_offset)) {
++ printk(KERN_INFO PFX "%s: dropping fragmented UD packet. total_length=%d msg_length=%d msg_offset=%d\n",
++ __func__, pdu->hdr.length, pdu->ud.msg_length, pdu->ud.msg_offset);
++ return -EINVAL;
++ }
++
++ spin_lock_bh(&qp->rq.lock);
++ if (unlikely(qp->rq.ud_msg_id >= qp->rq.next_msg_id)) {
++ spin_unlock_bh(&qp->rq.lock);
++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n",
++ __func__, qp->rq.ud_msg_id, qp->rq.next_msg_id);
++ return -EBADRQC;
++ }
++ msg_id = qp->rq.ud_msg_id++;
++ spin_unlock_bh(&qp->rq.lock);
++
++ wr = ibscif_wr_by_msg_id(&qp->rq, msg_id);
++ if (unlikely(!wr))
++ return -EBADR;
++
++ if (unlikely((pdu->ud.msg_length + grh_size) > wr->length))
++ return -EMSGSIZE;
++
++ /* GRH is included as part of the received message */
++ skb_pull(skb, sizeof(pdu->ud)-grh_size);
++
++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length+grh_size, pdu->ud.msg_offset, pdu->hdr.seq_num);
++ if (unlikely(err))
++ return err;
++
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = 0;
++ wr->sar.rea.immediate_data = 0;
++ wr->sar.rea.final_length = pdu->ud.msg_length+grh_size;
++
++ return 0;
++}
++
++static int ibscif_process_send(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ int err;
++
++ pdu->send.msg_id = __be32_to_cpu(pdu->send.msg_id);
++ spin_lock_bh(&qp->rq.lock);
++ if (unlikely(pdu->send.msg_id >= qp->rq.next_msg_id)) {
++ spin_unlock_bh(&qp->rq.lock);
++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n",
++ __func__, pdu->send.msg_id, qp->rq.next_msg_id);
++ return -EBADRQC;
++ }
++ spin_unlock_bh(&qp->rq.lock);
++
++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->send.msg_id);
++ if (unlikely(!wr))
++ return -EBADR;
++
++ pdu->send.msg_length = __be32_to_cpu(pdu->send.msg_length);
++ if (unlikely(pdu->send.msg_length > wr->length))
++ return -EMSGSIZE;
++
++ pdu->send.msg_offset = __be32_to_cpu(pdu->send.msg_offset);
++ if (unlikely(pdu->send.msg_offset > pdu->send.msg_length))
++ return -EINVAL;
++
++ if (unlikely((pdu->hdr.length + pdu->send.msg_offset) > wr->length))
++ return -ESPIPE;
++
++ skb_pull(skb, sizeof(pdu->send));
++
++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length, pdu->send.msg_offset, pdu->hdr.seq_num);
++ if (unlikely(err))
++ return err;
++
++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) {
++ /*
++ * We've got the last of the message data.
++ * We always assume immediate data; if not needed, no harm, no foul.
++ */
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->send.immed_data);
++ wr->sar.rea.final_length = pdu->send.msg_length;
++ }
++
++ return 0;
++}
++
++static int ibscif_process_write(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ u64 rdma_addr;
++ u32 rdma_len, page_offset;
++ int page_index;
++
++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_WRITE)))
++ return -EACCES;
++
++ /* Writes with immediate data consume an rq wqe. */
++ if (ibscif_pdu_is_immed(pdu->hdr.opcode)) {
++ pdu->write.msg_id = __be32_to_cpu(pdu->write.msg_id);
++ spin_lock_bh(&qp->rq.lock);
++ if (unlikely(pdu->write.msg_id >= qp->rq.next_msg_id)) {
++ spin_unlock_bh(&qp->rq.lock);
++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n",
++ __func__, pdu->write.msg_id, qp->rq.next_msg_id);
++ return -EBADRQC;
++ }
++ spin_unlock_bh(&qp->rq.lock);
++
++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->write.msg_id);
++ if (unlikely(!wr))
++ return -EBADR;
++ } else
++ wr = NULL;
++
++ skb_pull(skb, sizeof(pdu->write));
++
++ rdma_addr = __be64_to_cpu(pdu->write.rdma_address);
++ rdma_len = pdu->hdr.length;
++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr))
++ return -EOVERFLOW;
++
++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->write.rdma_key), rdma_addr,
++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++
++ page_offset = rdma_addr & ~PAGE_MASK;
++ page_index = ((rdma_addr - mr->addr) + (mr->addr & ~PAGE_MASK)) >> PAGE_SHIFT;
++
++ ibscif_ds_dma(qp, &mr->page[page_index], page_offset, skb, rdma_len, 0);
++
++ ibscif_put_mr(mr);
++
++ if (wr) {
++ wr->sar.rea.final_length += rdma_len;
++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) {
++ /* We've got the last of the write data. */
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->write.immed_data);
++ }
++ }
++
++ return 0;
++}
++
++static int ibscif_process_read(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ u64 rdma_addr;
++ u32 rdma_len;
++
++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_READ)))
++ return -EACCES;
++
++ rdma_addr = __be64_to_cpu(pdu->read_req.rdma_address);
++ rdma_len = __be32_to_cpu(pdu->read_req.rdma_length);
++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr))
++ return -EOVERFLOW;
++
++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->read_req.rdma_key), rdma_addr,
++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_READ);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++
++ wr = ibscif_reserve_wqe(&qp->iq);
++ if (unlikely(IS_ERR(wr))) {
++ ibscif_put_mr(mr);
++ return PTR_ERR(wr);
++ }
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->opcode = WR_RDMA_READ_RSP;
++ wr->state = WR_WAITING;
++ wr->length = rdma_len;
++ wr->msg_id = __be32_to_cpu(pdu->read_req.rdma_id);
++ wr->num_ds = 1;
++ wr->ds_list[0].mr = mr;
++ wr->ds_list[0].offset = rdma_addr - mr->addr;
++ wr->ds_list[0].length = rdma_len;
++ wr->ds_list[0].in_use = 1;
++
++ ibscif_append_wqe(&qp->iq);
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ;
++
++ return 0;
++}
++
++static int ibscif_process_read_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ int err;
++
++ /* Find the requesting sq wr. */
++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->read_rsp.rdma_id));
++ if (unlikely(!wr))
++ return -EBADR;
++ if (unlikely(wr->opcode != WR_RDMA_READ))
++ return -ENOMSG;
++
++ skb_pull(skb, sizeof(pdu->read_rsp));
++
++ pdu->read_rsp.rdma_offset = __be32_to_cpu(pdu->read_rsp.rdma_offset);
++
++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length, pdu->read_rsp.rdma_offset, pdu->hdr.seq_num);
++ if (unlikely(err))
++ return err;
++
++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) {
++ /* We've got the last of the read data. */
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.final_length = pdu->read_rsp.rdma_offset + pdu->hdr.length;
++ }
++
++ return 0;
++}
++
++static int ibscif_process_atomic_req(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ struct page *page;
++ u64 *addr;
++ u32 offset, rkey, msg_id;
++ u16 opcode;
++
++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_ATOMIC)))
++ return -EACCES;
++
++ opcode = ibscif_pdu_base_type(pdu->hdr.opcode);
++ if (opcode == ibscif_op_comp_swap) {
++ addr = (u64 *)__be64_to_cpu(pdu->comp_swap.atomic_address);
++ rkey = __be32_to_cpu(pdu->comp_swap.atomic_key);
++ msg_id = __be32_to_cpu(pdu->comp_swap.atomic_id);
++ } else {
++ addr = (u64 *)__be64_to_cpu(pdu->fetch_add.atomic_address);
++ rkey = __be32_to_cpu(pdu->fetch_add.atomic_key);
++ msg_id = __be32_to_cpu(pdu->fetch_add.atomic_id);
++ }
++
++ if (unlikely((u64)addr & (sizeof *addr - 1)))
++ return -EADDRNOTAVAIL;
++ if (unlikely(((u64)addr + (sizeof *addr - 1)) < (u64)addr))
++ return -EOVERFLOW;
++
++ mr = ibscif_validate_mr(rkey, (u64)addr, sizeof *addr, qp->ibqp.pd, IB_ACCESS_REMOTE_ATOMIC);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++
++ wr = ibscif_reserve_wqe(&qp->iq);
++ if (unlikely(IS_ERR(wr))) {
++ ibscif_put_mr(mr);
++ return PTR_ERR(wr);
++ }
++
++ /* Determine which page to map. */
++ offset = ((u64)addr - mr->addr) + (mr->addr & ~PAGE_MASK);
++ page = mr->page[offset >> PAGE_SHIFT];
++ offset &= ~PAGE_MASK;
++
++ /* Lock to perform the atomic operation atomically. */
++ spin_lock_bh(&qp->dev->atomic_op);
++
++ addr = ibscif_map_src(page) + offset;
++ wr->atomic_rsp.orig_data = *addr;
++ if (opcode == ibscif_op_fetch_add)
++ *addr += __be64_to_cpu(pdu->fetch_add.add_data);
++ else if (wr->atomic_rsp.orig_data == __be64_to_cpu(pdu->comp_swap.comp_data))
++ *addr = __be64_to_cpu(pdu->comp_swap.swap_data);
++ ibscif_unmap_src(page, addr);
++
++ ibscif_put_mr(mr);
++
++ /* Atomic operation is complete. */
++ spin_unlock_bh(&qp->dev->atomic_op);
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->opcode = WR_ATOMIC_RSP;
++ wr->state = WR_WAITING;
++ wr->length = 0;
++ wr->msg_id = msg_id;
++ wr->num_ds = 0;
++ wr->atomic_rsp.opcode = (opcode==ibscif_op_comp_swap)? ibscif_op_comp_swap_rsp : ibscif_op_fetch_add_rsp;
++ /* The wr->atomic_rsp.orig_data field was set above. */
++
++ ibscif_append_wqe(&qp->iq);
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ;
++
++ return 0;
++}
++
++static int ibscif_process_atomic_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ u16 opcode;
++ int err;
++
++ if (unlikely(!ibscif_pdu_is_last(pdu->atomic_rsp.hdr.opcode)))
++ return -EINVAL;
++
++ /* Find the requesting sq wr. */
++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->atomic_rsp.atomic_id));
++ if (unlikely(!wr))
++ return -EBADR;
++
++ opcode = ibscif_pdu_base_type(pdu->hdr.opcode);
++ if (unlikely(wr->opcode != ((opcode == ibscif_op_comp_swap_rsp) ?
++ WR_ATOMIC_CMP_AND_SWP : WR_ATOMIC_FETCH_AND_ADD)))
++ return -ENOMSG;
++
++ skb_pull(skb, (unsigned long)&pdu->atomic_rsp.orig_data - (unsigned long)pdu);
++
++ pdu->atomic_rsp.orig_data = __be64_to_cpu(pdu->atomic_rsp.orig_data);
++ err = ibscif_place_data(qp, wr, skb, sizeof pdu->atomic_rsp.orig_data, 0, pdu->hdr.seq_num);
++ if (unlikely(err))
++ return err;
++
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.final_length = sizeof pdu->atomic_rsp.orig_data;
++
++ return 0;
++}
++
++static int ibscif_process_disconnect(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ ibscif_qp_remote_disconnect(qp, __be32_to_cpu(pdu->disconnect.reason));
++ return 0;
++}
++
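++/*
++ * RMA variants of send/write/read: instead of carrying the payload in
++ * the PDU, the initiator advertises its registered SCIF offsets in
++ * rma_addrs[] and the target moves the data directly with
++ * scif_readfrom()/scif_writeto(), then reports the transferred length
++ * and status back on the IQ as an ibscif_op_rma_rsp.
++ */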
++static int ibscif_process_send_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_ds *ds;
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ struct ibscif_mreg_info *mreg;
++ u32 num_rma_addrs;
++ u64 rma_offset;
++ u32 rma_length;
++ u32 total;
++ int seg_num;
++ int cur_rma_addr;
++ u32 xfer_len, ds_offset;
++ int err;
++ u64 loffset;
++ u32 dma_size = 0;
++ int rma_flag = 0;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return -EACCES;
++ }
++
++ pdu->send.msg_id = __be32_to_cpu(pdu->send.msg_id);
++ spin_lock_bh(&qp->rq.lock);
++ if (unlikely(pdu->send.msg_id >= qp->rq.next_msg_id)) {
++ spin_unlock_bh(&qp->rq.lock);
++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n",
++ __func__, pdu->send.msg_id, qp->rq.next_msg_id);
++ return -EBADRQC;
++ }
++ spin_unlock_bh(&qp->rq.lock);
++
++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->send.msg_id);
++ if (unlikely(!wr))
++ return -EBADR;
++
++ pdu->send.msg_length = __be32_to_cpu(pdu->send.msg_length);
++ if (unlikely(pdu->send.msg_length > wr->length))
++ return -EMSGSIZE;
++
++ pdu->send.msg_offset = __be32_to_cpu(pdu->send.msg_offset);
++ if (unlikely(pdu->send.msg_offset > pdu->send.msg_length))
++ return -EINVAL;
++
++ if (unlikely((pdu->hdr.length + pdu->send.msg_offset) > wr->length))
++ return -ESPIPE;
++
++ total = 0;
++
++ num_rma_addrs = __be32_to_cpu(pdu->send.num_rma_addrs);
++ cur_rma_addr = 0;
++ rma_offset = __be64_to_cpu(pdu->send.rma_addrs[cur_rma_addr].offset);
++ rma_length = __be32_to_cpu(pdu->send.rma_addrs[cur_rma_addr].length);
++
++ ds_offset = pdu->send.msg_offset;
++ ds = wr->ds_list;
++ seg_num = 0;
++ while ((ds_offset >= ds->length) && (seg_num < wr->num_ds)) {
++ ds_offset -= ds->length;
++ ds++;
++ seg_num++;
++ }
++
++ err = 0;
++ while (total < pdu->send.msg_length && !err) {
++ if (unlikely(seg_num >= wr->num_ds))
++ return -EMSGSIZE;
++
++ if (!ds->in_use) {
++ mr = ibscif_get_mr(ds->lkey);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++ ds->in_use = 1;
++ if (unlikely(mr != ds->mr))
++ return -ENXIO;
++ if (unlikely(!(mr->access & IB_ACCESS_LOCAL_WRITE)))
++ return -EACCES;
++ } else
++ mr = ds->mr;
++
++ mreg = ibscif_mr_get_mreg(mr, qp->conn);
++ if (!mreg)
++ return -EACCES;
++
++ while (ds->length > ds_offset) {
++ xfer_len = min( ds->length - ds_offset, rma_length );
++ if (xfer_len) {
++ loffset = mreg->offset + ds->offset + ds_offset;
++ dma_size += ibscif_dma_size(xfer_len, rma_offset);
++
++ if ((total + xfer_len >= pdu->send.msg_length) && dma_size)
++ rma_flag = SCIF_RMA_SYNC;
++
++ err = scif_readfrom(qp->conn->ep, loffset, xfer_len, rma_offset, rma_flag);
++ if (err) {
++ printk(KERN_ALERT PFX "%s: scif_readfrom (%d bytes) returns %d\n", __func__, xfer_len, err);
++ break;
++ }
++
++ ds_offset += xfer_len;
++ rma_offset += xfer_len;
++ rma_length -= xfer_len;
++ total += xfer_len;
++
++ if (total >= pdu->send.msg_length)
++ break;
++ }
++ if (rma_length == 0) {
++ cur_rma_addr++;
++ if (unlikely(cur_rma_addr >= num_rma_addrs))
++ return -EMSGSIZE;
++
++ rma_offset = __be64_to_cpu(pdu->send.rma_addrs[cur_rma_addr].offset);
++ rma_length = __be32_to_cpu(pdu->send.rma_addrs[cur_rma_addr].length);
++ }
++ }
++
++ seg_num++;
++ ds++;
++ }
++
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->send.immed_data);
++ wr->sar.rea.final_length = pdu->send.msg_length;
++
++ /* Respond to the initiator with the result */
++ wr = ibscif_reserve_wqe(&qp->iq);
++ if (unlikely(IS_ERR(wr))) {
++ return PTR_ERR(wr);
++ }
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->opcode = WR_RMA_RSP;
++ wr->state = WR_WAITING;
++ wr->length = 0;
++ wr->msg_id = __be32_to_cpu(pdu->send.rma_id);
++ wr->num_ds = 0;
++ wr->rma_rsp.xfer_length = total;
++ wr->rma_rsp.error = err;
++
++ ibscif_append_wqe(&qp->iq);
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ;
++
++ return 0;
++}
++
++static int ibscif_process_write_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ u64 rdma_addr;
++ u32 rdma_len;
++ struct ibscif_mreg_info *mreg;
++ u32 num_rma_addrs;
++ u64 rma_offset;
++ u32 rma_length;
++ u32 total;
++ int i;
++ int err;
++ u64 loffset;
++ u32 dma_size = 0;
++ int rma_flag = 0;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return -EACCES;
++ }
++
++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_WRITE)))
++ return -EACCES;
++
++ /* Writes with immediate data consume an rq wqe. */
++ if (ibscif_pdu_is_immed(pdu->hdr.opcode)) {
++ pdu->write.msg_id = __be32_to_cpu(pdu->write.msg_id);
++ spin_lock_bh(&qp->rq.lock);
++ if (unlikely(pdu->write.msg_id >= qp->rq.next_msg_id)) {
++ spin_unlock_bh(&qp->rq.lock);
++ return -EBADRQC;
++ }
++ spin_unlock_bh(&qp->rq.lock);
++
++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->write.msg_id);
++ if (unlikely(!wr))
++ return -EBADR;
++ }
++ else
++ wr = NULL;
++
++ rdma_addr = __be64_to_cpu(pdu->write.rdma_address);
++ rdma_len = __be32_to_cpu(pdu->write.rma_length);
++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr))
++ return -EOVERFLOW;
++
++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->write.rdma_key), rdma_addr,
++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++
++ mreg = ibscif_mr_get_mreg(mr, qp->conn);
++ if (!mreg)
++ return -EACCES;
++
++ total = 0;
++ err = 0;
++ num_rma_addrs = __be32_to_cpu(pdu->write.num_rma_addrs);
++ for (i=0; i<num_rma_addrs; i++) {
++ rma_offset = __be64_to_cpu(pdu->write.rma_addrs[i].offset);
++ rma_length = __be32_to_cpu(pdu->write.rma_addrs[i].length);
++
++ if (rdma_len < rma_length)
++ rma_length = rdma_len;
++
++ if (rma_length == 0)
++ continue;
++
++ loffset = mreg->offset + (rdma_addr - mr->addr) + total;
++ dma_size += ibscif_dma_size(rma_length, rma_offset);
++
++ if ((i==num_rma_addrs-1) && dma_size)
++ rma_flag = SCIF_RMA_SYNC;
++
++ err = scif_readfrom(qp->conn->ep, loffset, rma_length, rma_offset, rma_flag);
++ if (err) {
++ printk(KERN_ALERT PFX "%s: scif_readfrom (%d bytes) returns %d\n", __func__, rma_length, err);
++ break;
++ }
++
++ rdma_len -= rma_length;
++ total += rma_length;
++ }
++
++ ibscif_put_mr(mr);
++
++ if (wr) {
++ wr->sar.rea.final_length = total;
++ wr->state = WR_LAST_SEEN;
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->write.immed_data);
++ }
++
++ /* Respond to the initiator with the result */
++ wr = ibscif_reserve_wqe(&qp->iq);
++ if (unlikely(IS_ERR(wr))) {
++ return PTR_ERR(wr);
++ }
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->opcode = WR_RMA_RSP;
++ wr->state = WR_WAITING;
++ wr->length = 0;
++ wr->msg_id = __be32_to_cpu(pdu->write.rma_id);
++ wr->num_ds = 0;
++ wr->rma_rsp.xfer_length = total;
++ wr->rma_rsp.error = err;
++
++ ibscif_append_wqe(&qp->iq);
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ;
++
++ return 0;
++}
++
++static int ibscif_process_read_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_mr *mr;
++ u64 rdma_addr;
++ u32 rdma_len;
++ struct ibscif_mreg_info *mreg;
++ u32 num_rma_addrs;
++ u64 rma_offset;
++ u32 rma_length;
++ u32 total;
++ int i;
++ int err;
++ u64 loffset;
++ u32 dma_size = 0;
++ int rma_flag = 0;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return -EACCES;
++ }
++
++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_READ)))
++ return -EACCES;
++
++ rdma_addr = __be64_to_cpu(pdu->read_req.rdma_address);
++ rdma_len = __be32_to_cpu(pdu->read_req.rdma_length);
++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr))
++ return -EOVERFLOW;
++
++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->read_req.rdma_key), rdma_addr,
++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_READ);
++ if (unlikely(IS_ERR(mr)))
++ return PTR_ERR(mr);
++
++ mreg = ibscif_mr_get_mreg(mr, qp->conn);
++ if (!mreg)
++ return -EACCES;
++
++ total = 0;
++ err = 0;
++ num_rma_addrs = __be32_to_cpu(pdu->read_req.num_rma_addrs);
++ for (i=0; i<num_rma_addrs; i++) {
++ rma_offset = __be64_to_cpu(pdu->read_req.rma_addrs[i].offset);
++ rma_length = __be32_to_cpu(pdu->read_req.rma_addrs[i].length);
++
++ if (rdma_len < rma_length)
++ rma_length = rdma_len;
++
++ if (rma_length == 0)
++ continue;
++
++ loffset = mreg->offset + (rdma_addr - mr->addr) + total;
++ dma_size += ibscif_dma_size(rma_length, rma_offset);
++
++ if ((i==num_rma_addrs-1) && dma_size)
++ rma_flag = SCIF_RMA_SYNC;
++
++ err = scif_writeto(qp->conn->ep, loffset, rma_length, rma_offset, rma_flag);
++ if (err) {
++ printk(KERN_ALERT PFX "%s: scif_writeto (%d bytes) returns %d\n", __func__, rma_length, err);
++ break;
++ }
++
++ rdma_len -= rma_length;
++ total += rma_length;
++ }
++
++ ibscif_put_mr(mr);
++
++ /* Respond to the initiator with the result */
++ wr = ibscif_reserve_wqe(&qp->iq);
++ if (unlikely(IS_ERR(wr))) {
++ return PTR_ERR(wr);
++ }
++
++ memset(&wr->sar, 0, sizeof wr->sar);
++
++ wr->opcode = WR_RMA_RSP;
++ wr->state = WR_WAITING;
++ wr->length = 0;
++ wr->msg_id = __be32_to_cpu(pdu->read_req.rdma_id);
++ wr->num_ds = 0;
++ wr->rma_rsp.xfer_length = total;
++ wr->rma_rsp.error = err;
++
++ ibscif_append_wqe(&qp->iq);
++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ;
++
++ return 0;
++}
++
++static int ibscif_process_rma_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ struct ibscif_wr *wr;
++
++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->rma_rsp.rma_id));
++ if (unlikely(!wr))
++ return -EBADR;
++ if (unlikely(!wr->use_rma))
++ return -ENOMSG;
++
++ if (wr->opcode == WR_RDMA_READ) {
++ /* ibscif_clear_ds_refs() is called in ibscif_schedule_rx_completions() */
++ wr->state = WR_LAST_SEEN;
++ }
++ else {
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ wr->state = WR_COMPLETED;
++ }
++
++ wr->sar.rea.opcode = pdu->hdr.opcode;
++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num;
++ wr->sar.rea.final_length = pdu->rma_rsp.xfer_length;
++
++ return 0;
++}
++
++static int ibscif_process_pdu(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb)
++{
++ int err;
++
++ switch (ibscif_pdu_base_type(pdu->hdr.opcode)) {
++ case ibscif_op_ud:
++ err = ibscif_process_ud(qp, pdu, skb);
++ break;
++ case ibscif_op_send:
++ err = ibscif_process_send(qp, pdu, skb);
++ break;
++ case ibscif_op_write:
++ err = ibscif_process_write(qp, pdu, skb);
++ break;
++ case ibscif_op_read:
++ err = ibscif_process_read(qp, pdu, skb);
++ break;
++ case ibscif_op_read_rsp:
++ err = ibscif_process_read_rsp(qp, pdu, skb);
++ break;
++ case ibscif_op_comp_swap_rsp:
++ case ibscif_op_fetch_add_rsp:
++ err = ibscif_process_atomic_rsp(qp, pdu, skb);
++ break;
++ case ibscif_op_comp_swap:
++ case ibscif_op_fetch_add:
++ err = ibscif_process_atomic_req(qp, pdu, skb);
++ break;
++ case ibscif_op_ack:
++ /* Handled in piggyback ack processing. */
++ err = 0;
++ break;
++ case ibscif_op_disconnect:
++ /* Post send completions before the disconnect flushes the queues. */
++ ibscif_process_ack(qp, &pdu->hdr);
++ /* Now disconnect the QP. */
++ err = ibscif_process_disconnect(qp, pdu, skb);
++ break;
++ case ibscif_op_send_rma:
++ err = ibscif_process_send_rma(qp, pdu, skb);
++ break;
++ case ibscif_op_write_rma:
++ err = ibscif_process_write_rma(qp, pdu, skb);
++ break;
++ case ibscif_op_read_rma:
++ err = ibscif_process_read_rma(qp, pdu, skb);
++ break;
++ case ibscif_op_rma_rsp:
++ err = ibscif_process_rma_rsp(qp, pdu, skb);
++ break;
++ default:
++ printk(KERN_INFO PFX "Received invalid opcode (%x)\n",
++ ibscif_pdu_base_type(pdu->hdr.opcode));
++ err = IBSCIF_REASON_INVALID_OPCODE;
++ break;
++ }
++
++ if (unlikely(err)) {
++ printk(KERN_ALERT PFX "%s: ERROR: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode));
++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL);
++ }
++
++ return err;
++}
++
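++/*
++ * Only the next expected sequence number advances last_in_seq; acks and
++ * out-of-order PDUs leave it unchanged so rx completions are never
++ * scheduled past a gap in the stream.
++ */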
++static int update_rx_seq_numbers(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct ibscif_rx_state *rx)
++{
++ u32 seq_num = pdu->hdr.seq_num;
++
++ if (pdu->hdr.opcode == ibscif_op_ack)
++ return 0;
++
++ if (seq_num != rx->last_in_seq + 1)
++ return 0;
++
++ rx->last_in_seq = seq_num;
++
++ return 1;
++}
++
++static void ibscif_process_qp_skb(struct ibscif_qp *qp, struct sk_buff *skb)
++{
++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data;
++ struct ibscif_rx_state *rx;
++ int err = 0;
++
++ /* Start with no scheduling. */
++ qp->schedule = 0;
++
++ rx = ibscif_pdu_is_iq(pdu->hdr.opcode) ? &qp->wire.iq.rx : &qp->wire.sq.rx;
++
++ if (ibscif_process_pdu(qp, pdu, skb) == IBSCIF_REASON_INVALID_OPCODE)
++ return;
++
++ /* skip ack and seq_num for UD QP */
++ if (qp->ibqp.qp_type == IB_QPT_UD) {
++ err = ibscif_schedule_rx_completions(qp, 0, rx);
++ if (unlikely(err)) {
++ printk(KERN_ALERT PFX "%s: rx completion error: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode));
++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL);
++ }
++ goto done;
++ }
++
++ /* Process piggybacked acks. */
++ ibscif_process_ack(qp, &pdu->hdr);
++
++ if (update_rx_seq_numbers(qp, pdu, rx)) {
++ /* PDU is in sequence so schedule/remove completed work requests. */
++ err = ibscif_schedule_rx_completions(qp, ibscif_pdu_is_iq(pdu->hdr.opcode), rx);
++ if (unlikely(err)) {
++ printk(KERN_ALERT PFX "%s: rx completion error: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode));
++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL);
++ goto done;
++ }
++ }
++
++ /* Generate an ack if forced or if the current window dictates it. */
++ if (ibscif_pdu_is_force_ack(pdu->hdr.opcode)) {
++ ibscif_send_ack(qp);
++ } else if (pdu->hdr.opcode != ibscif_op_ack) {
++ u32 window = ibscif_rx_window(rx);
++ if (window && (window % (window_size / MIN_WINDOW_SIZE)) == 0)
++ ibscif_send_ack(qp);
++ }
++done:
++ /* Run the scheduler if it was requested. */
++ if (qp->schedule & SCHEDULE_RESUME) {
++ if (qp->schedule & SCHEDULE_SQ)
++ ibscif_schedule(&qp->sq);
++ if (qp->schedule & SCHEDULE_IQ)
++ ibscif_schedule(&qp->iq);
++ }
++
++ return;
++}
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,21)
++#define skb_mac_header(skb) (skb->mac.raw)
++#endif
++
++static int ibscif_recv_pkt(struct sk_buff *skb, struct ibscif_dev *dev, scif_epd_t ep, struct ibscif_conn *conn)
++{
++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data;
++ struct ibscif_qp *qp = ERR_PTR(-ENOENT);
++
++ /* Convert the base header. */
++ pdu->hdr.opcode = __be16_to_cpu(pdu->hdr.opcode);
++ pdu->hdr.length = __be16_to_cpu(pdu->hdr.length);
++ pdu->hdr.dst_qp = __be32_to_cpu(pdu->hdr.dst_qp);
++ pdu->hdr.src_qp = __be32_to_cpu(pdu->hdr.src_qp);
++ pdu->hdr.seq_num = __be32_to_cpu(pdu->hdr.seq_num);
++ pdu->hdr.sq_ack_num = __be32_to_cpu(pdu->hdr.sq_ack_num);
++ pdu->hdr.iq_ack_num = __be32_to_cpu(pdu->hdr.iq_ack_num);
++
++ if (pdu->hdr.opcode == ibscif_op_close) {
++ //printk(KERN_INFO PFX "%s: op_close, conn=%p, local_close=%d\n", __func__, conn, conn->local_close);
++ conn->remote_close = 1;
++ goto done_no_qp;
++ }
++ else if (pdu->hdr.opcode == ibscif_op_reopen) {
++ //printk(KERN_INFO PFX "%s: op_reopen, conn=%p, local_close=%d\n", __func__, conn, conn->local_close);
++ conn->remote_close = 0;
++ goto done_no_qp;
++ }
++ else if (pdu->hdr.opcode == ibscif_op_cm) {
++ ibscif_process_cm_skb(skb, conn);
++ goto done_no_qp;
++ }
++
++ qp = ibscif_get_qp(pdu->hdr.dst_qp);
++ if (unlikely(IS_ERR(qp) ||
++ (qp->state != QP_CONNECTED && qp->ibqp.qp_type != IB_QPT_UD) ||
++ (qp->ibqp.qp_num != pdu->hdr.dst_qp) ||
++ (qp->remote_qpn != pdu->hdr.src_qp && qp->ibqp.qp_type != IB_QPT_UD) ||
++ 0)) {
++ /* Disconnect the rogue. */
++ ibscif_reflect_disconnect(qp, &pdu->hdr, skb, IBSCIF_REASON_INVALID_QP);
++ goto done;
++ }
++
++ if (qp->ibqp.qp_type == IB_QPT_UD)
++ ibscif_qp_add_ud_conn(qp, conn);
++
++ DEV_STAT(qp->dev, packets_rcvd++);
++ DEV_STAT(qp->dev, bytes_rcvd += skb->len);
++
++ ibscif_process_qp_skb(qp, skb);
++done:
++ if (likely(!IS_ERR(qp)))
++ ibscif_put_qp(qp);
++
++done_no_qp:
++ kfree_skb(skb);
++ return 0;
++}
++
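++/*
++ * Pull one PDU off the SCIF endpoint in two reads: the fixed base
++ * header first, to learn hdr_size and the payload length, then the
++ * remainder of the PDU into the same skb before handing it to
++ * ibscif_recv_pkt().
++ */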
++void ibscif_do_recv( struct ibscif_dev *dev, scif_epd_t ep, struct ibscif_conn *conn )
++{
++ struct sk_buff *skb;
++ union ibscif_pdu *pdu;
++ int hdr_size, payload_size, recv_size, pdu_size;
++ char *recv_buffer;
++ int ret;
++
++ skb = dev_alloc_skb( IBSCIF_MTU );
++ if (unlikely(skb==NULL)) {
++ printk(KERN_ALERT PFX "%s(): failed to allocate skb, exiting\n", __func__);
++ return;
++ }
++
++ skb->protocol = IBSCIF_PACKET_TYPE;
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ skb->priority = TC_PRIO_CONTROL; /* highest defined priority */
++ skb->dev = (void *) dev;
++
++ pdu = (union ibscif_pdu *)skb->data;
++
++ /* get the base header first so the packet size can be determined */
++ recv_size = sizeof(pdu->hdr);
++ recv_buffer = (char *)&pdu->hdr;
++ while (recv_size) {
++ ret = scif_recv(ep, recv_buffer, recv_size, blocking_recv ? SCIF_RECV_BLOCK : 0);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s(): failed to receive hdr, ret=%d, expecting %d\n", __func__, ret, (int)recv_size);
++ if (ret == -ENOTCONN || ret == -ECONNRESET) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: ep disconnected by peer (%d). conn=%p, local_close=%d\n",
++ __func__, ret, conn, conn->local_close);
++ ibscif_remove_ep( dev, ep );
++ ibscif_refresh_pollep_list();
++ conn->remote_close = 1;
++ if (conn->local_close) {
++ ibscif_free_conn(conn);
++ }
++ }
++ goto errout;
++ }
++ recv_size -= ret;
++ recv_buffer += ret;
++ }
++
++ hdr_size = __be16_to_cpu(pdu->hdr.hdr_size);
++ payload_size = __be16_to_cpu(pdu->hdr.length);
++ pdu_size = hdr_size + payload_size;
++ if (unlikely(pdu_size > IBSCIF_MTU)) {
++ printk(KERN_ALERT PFX "%s(): packet size exceeds MTU, size=%d\n", __func__, pdu_size);
++ goto errout;
++ }
++
++ recv_size = pdu_size - sizeof(pdu->hdr);
++ recv_buffer = (char *)pdu + sizeof(pdu->hdr);
++
++ /* get the remainder of the packet */
++ //printk(KERN_INFO PFX "%s(): hdr_size=%d payload_size=%d pdu_size=%d recv_size=%d\n", __func__, hdr_size, payload_size, pdu_size, recv_size);
++ ret = 0;
++ while (recv_size) {
++ ret = scif_recv(ep, recv_buffer, recv_size, blocking_recv ? SCIF_RECV_BLOCK : 0);
++
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s(): failed to receive data, ret=%d, expecting %d\n", __func__, ret, recv_size);
++ break;
++ }
++
++ recv_size -= ret;
++ recv_buffer += ret;
++ }
++
++ if (ret < 0)
++ goto errout;
++
++ skb->len = pdu_size;
++ skb->data_len = payload_size;
++ skb->tail += pdu_size;
++
++ ibscif_recv_pkt(skb, dev, ep, conn);
++ return;
++
++errout:
++ kfree_skb(skb);
++}
++
++#define IBSCIF_MAX_POLL_COUNT (IBSCIF_MAX_DEVICES * 2)
++static struct scif_pollepd poll_eps[IBSCIF_MAX_POLL_COUNT];
++static struct ibscif_dev *poll_devs[IBSCIF_MAX_POLL_COUNT];
++static int poll_types[IBSCIF_MAX_POLL_COUNT];
++static struct ibscif_conn *poll_conns[IBSCIF_MAX_POLL_COUNT];
++static struct task_struct *poll_thread = NULL;
++static atomic_t poll_eps_changed = ATOMIC_INIT(0);
++static volatile int poll_thread_running = 0;
++
++void ibscif_refresh_pollep_list( void )
++{
++ atomic_set(&poll_eps_changed, 1);
++}
++
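++/*
++ * Single receive thread for the driver: scif_poll() watches every open
++ * endpoint, dispatching listening endpoints to ibscif_do_accept() and
++ * connected ones to ibscif_do_recv(). When idle it reaps unused
++ * connections after 60 seconds and picks up any unprocessed entries on
++ * the xmit queue.
++ */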
++int ibscif_poll_thread( void *unused )
++{
++ int poll_count = 0;
++ int ret;
++ int i;
++ int busy;
++ int idle_count = 0;
++
++ poll_thread_running = 1;
++ while (!kthread_should_stop()) {
++ if (atomic_xchg(&poll_eps_changed, 0)) {
++ poll_count = IBSCIF_MAX_POLL_COUNT;
++ ibscif_get_pollep_list( poll_eps, poll_devs, poll_types, poll_conns, &poll_count );
++ }
++
++ if (poll_count == 0) {
++ schedule();
++ continue;
++ }
++
++ ret = scif_poll(poll_eps, poll_count, 1000); /* 1s timeout */
++
++ busy = 0;
++ if (ret > 0) {
++ for (i=0; i<poll_count; i++) {
++ if (poll_eps[i].revents & POLLIN) {
++ if (poll_types[i] == IBSCIF_EP_TYPE_LISTEN) {
++ ibscif_do_accept( poll_devs[i] );
++ busy = 1;
++ }
++ else {
++ ibscif_do_recv( poll_devs[i], poll_eps[i].epd, poll_conns[i] );
++ busy = 1;
++ }
++ }
++ else if (poll_eps[i].revents & POLLERR) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: ep error, conn=%p.\n", __func__, poll_conns[i]);
++ ibscif_remove_ep( poll_devs[i], poll_eps[i].epd );
++ ibscif_refresh_pollep_list();
++ /* in most cases, the error is caused by the ep having already been closed */
++ busy = 1;
++ }
++ else if (poll_eps[i].revents & POLLHUP) {
++ struct ibscif_conn *conn = poll_conns[i];
++ if (verbose)
++ printk(KERN_INFO PFX "%s: ep disconnected by peer.\n", __func__);
++ ibscif_remove_ep( poll_devs[i], poll_eps[i].epd );
++ ibscif_refresh_pollep_list();
++ if (conn) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: conn=%p, local_close=%d.\n", __func__, conn, conn->local_close);
++ conn->remote_close = 1;
++ if (conn->local_close) {
++ ibscif_free_conn(conn);
++ }
++ }
++ busy = 1;
++ }
++ }
++ }
++
++ if (busy) {
++ idle_count = 0;
++ }
++ else {
++ idle_count++;
++ /* close unused endpoints after 60 seconds */
++ if (idle_count == 60) {
++ if (ibscif_cleanup_idle_conn())
++ ibscif_refresh_pollep_list();
++ idle_count = 0;
++ }
++ /* pick up the unprocessed items in the xmit queue */
++ if (!skb_queue_empty(&xmit_queue))
++ ibscif_dev_queue_xmit(NULL);
++ schedule();
++ }
++ }
++
++ poll_thread_running = 0;
++ return 0;
++}
++
++void ibscif_protocol_init_pre(void)
++{
++ skb_queue_head_init(&xmit_queue);
++}
++
++void ibscif_protocol_init_post(void)
++{
++ poll_thread = kthread_run( ibscif_poll_thread, NULL, "ibscif_polld" );
++}
++
++void ibscif_protocol_cleanup(void)
++{
++ kthread_stop( poll_thread );
++
++ while (poll_thread_running)
++ schedule();
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_protocol.h a8/drivers/infiniband/hw/scif/ibscif_protocol.h
+--- a7/drivers/infiniband/hw/scif/ibscif_protocol.h 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_protocol.h 2015-02-23 10:14:37.487809663 -0800
+@@ -0,0 +1,395 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef IBSCIF_PROTOCOL_H
++#define IBSCIF_PROTOCOL_H
++
++/*
++ * Protocol EtherType
++ */
++#define IBSCIF_PACKET_TYPE 0x8086
++
++/*
++ * Base protocol header version
++ */
++#define IBSCIF_PROTOCOL_VER_1 1
++#define IBSCIF_PROTOCOL_VER IBSCIF_PROTOCOL_VER_1
++
++/*
++ * Protocol opcode values - All other values are reserved.
++ */
++#define ibscif_last_flag 0x4000
++#define ibscif_immed_flag 0x2000
++#define ibscif_se_flag 0x1000
++#define ibscif_force_ack_flag 0x0800
++#define ibscif_iq_flag 0x0400
++
++#define ibscif_op_send 0
++#define ibscif_op_send_last (ibscif_op_send | ibscif_last_flag)
++#define ibscif_op_send_last_se (ibscif_op_send | ibscif_last_flag | ibscif_se_flag)
++#define ibscif_op_send_immed (ibscif_op_send | ibscif_immed_flag)
++#define ibscif_op_send_immed_se (ibscif_op_send | ibscif_immed_flag | ibscif_se_flag)
++
++#define ibscif_op_write 1
++#define ibscif_op_write_last (ibscif_op_write | ibscif_last_flag)
++#define ibscif_op_write_immed (ibscif_op_write | ibscif_immed_flag)
++#define ibscif_op_write_immed_se (ibscif_op_write | ibscif_immed_flag | ibscif_se_flag)
++
++#define ibscif_op_read 2
++#define ibscif_op_read_rsp (ibscif_op_read | ibscif_iq_flag)
++#define ibscif_op_read_rsp_last (ibscif_op_read_rsp | ibscif_last_flag)
++
++#define ibscif_op_comp_swap 3
++#define ibscif_op_comp_swap_rsp (ibscif_op_comp_swap | ibscif_iq_flag)
++
++#define ibscif_op_fetch_add 4
++#define ibscif_op_fetch_add_rsp (ibscif_op_fetch_add | ibscif_iq_flag)
++
++#define ibscif_op_ack 5
++#define ibscif_op_disconnect 6
++
++#define ibscif_op_send_rma 7
++#define ibscif_op_send_rma_se (ibscif_op_send_rma | ibscif_se_flag)
++#define ibscif_op_send_rma_immed (ibscif_op_send_rma | ibscif_immed_flag)
++#define ibscif_op_send_rma_immed_se (ibscif_op_send_rma | ibscif_immed_flag | ibscif_se_flag)
++
++#define ibscif_op_write_rma 8
++#define ibscif_op_write_rma_immed (ibscif_op_write_rma | ibscif_immed_flag)
++#define ibscif_op_write_rma_immed_se (ibscif_op_write_rma | ibscif_immed_flag | ibscif_se_flag)
++
++#define ibscif_op_read_rma 9
++#define ibscif_op_rma_rsp (10 | ibscif_iq_flag)
++
++#define ibscif_op_reg 11
++#define ibscif_op_dereg 12
++
++#define ibscif_op_close 13
++#define ibscif_op_reopen 14
++
++#define ibscif_op_ud 15
++#define ibscif_op_cm 16
++
++#define ibscif_pdu_is_last(op) (op & ibscif_last_flag)
++#define ibscif_pdu_is_immed(op) (op & ibscif_immed_flag)
++#define ibscif_pdu_is_se(op) (op & ibscif_se_flag)
++#define ibscif_pdu_is_force_ack(op) (op & ibscif_force_ack_flag)
++#define ibscif_pdu_is_iq(op) (op & ibscif_iq_flag)
++
++#define ibscif_pdu_set_last(op) (op | ibscif_last_flag)
++#define ibscif_pdu_set_immed(op) (op | ibscif_immed_flag)
++#define ibscif_pdu_set_se(op) (op | ibscif_se_flag)
++#define ibscif_pdu_set_force_ack(op) (op | ibscif_force_ack_flag)
++#define ibscif_pdu_set_iq(op) (op | ibscif_iq_flag)
++
++#define ibscif_pdu_base_type(op) \
++ (op & ~(ibscif_last_flag | \
++ ibscif_se_flag | \
++ ibscif_immed_flag | \
++ ibscif_force_ack_flag))
++
++/*
++ * Remote address descriptor for SCIF RMA operations
++ */
++struct rma_addr {
++ __be64 offset;
++ __be32 length;
++ __be32 reserved;
++} __attribute__ ((packed));
++
++/*
++ * Base header present in every packet
++ */
++struct base_hdr {
++ __be16 opcode;
++ __be16 length;
++ __be32 dst_qp;
++ __be32 src_qp;
++ __be32 seq_num;
++ __be32 sq_ack_num;
++ __be32 iq_ack_num;
++ __be16 hdr_size;
++ __be16 reserved[3];
++} __attribute__ ((packed));
++
++/*
++ * UD Header
++ */
++struct ud_hdr {
++ struct base_hdr hdr;
++ __be32 msg_id;
++ __be32 msg_length;
++ __be32 msg_offset;
++ u8 grh[40];
++} __attribute__ ((packed));
++
++/*
++ * Send Header
++ */
++struct send_hdr {
++ struct base_hdr hdr;
++ __be32 msg_id;
++ __be32 msg_length;
++ __be32 msg_offset;
++ __be32 immed_data;
++ __be32 rma_id; /* RMA */
++ __be32 num_rma_addrs; /* RMA */
++ struct rma_addr rma_addrs[0]; /* RMA */
++} __attribute__ ((packed));
++
++/*
++ * RDMA Write Header
++ */
++struct write_hdr {
++ struct base_hdr hdr;
++ __be64 rdma_address;
++ __be32 rdma_key;
++ __be32 immed_data;
++ __be32 msg_id;
++ __be32 rma_length; /* RMA */
++ __be32 rma_id; /* RMA */
++ __be32 num_rma_addrs; /* RMA */
++ struct rma_addr rma_addrs[0]; /* RMA */
++} __attribute__ ((packed));
++
++/*
++ * RDMA Read Request Header
++ */
++struct read_req_hdr {
++ struct base_hdr hdr;
++ __be64 rdma_address;
++ __be32 rdma_key;
++ __be32 rdma_length; /* shared with RMA */
++ __be32 rdma_id; /* shared with RMA */
++ __be32 num_rma_addrs; /* RMA */
++ struct rma_addr rma_addrs[0]; /* RMA */
++} __attribute__ ((packed));
++
++/*
++ * RDMA Read Response Header
++ */
++struct read_rsp_hdr {
++ struct base_hdr hdr;
++ __be32 rdma_offset;
++ __be32 rdma_id;
++} __attribute__ ((packed));
++
++
++/*
++ * Atomic Compare and Swap Header
++ */
++struct comp_swap_hdr {
++ struct base_hdr hdr;
++ __be64 atomic_address;
++ __be64 comp_data;
++ __be64 swap_data;
++ __be32 atomic_key;
++ __be32 atomic_id;
++ /* no pad needed */
++} __attribute__ ((packed));
++
++
++/*
++ * Atomic Fetch/Add Header
++ */
++struct fetch_add_hdr {
++ struct base_hdr hdr;
++ __be64 atomic_address;
++ __be64 add_data;
++ __be32 atomic_key;
++ __be32 atomic_id;
++ /* no pad needed */
++} __attribute__ ((packed));
++
++/*
++ * Atomic Response Header
++ */
++struct atomic_rsp_hdr {
++ struct base_hdr hdr;
++ __be64 orig_data;
++ __be32 atomic_id;
++} __attribute__ ((packed));
++
++/*
++ * ACK Header
++ */
++struct ack_hdr {
++ struct base_hdr hdr;
++} __attribute__ ((packed));
++
++/*
++ * Disconnect Header
++ */
++struct disconnect_hdr {
++ struct base_hdr hdr;
++ __be32 reason;
++} __attribute__ ((packed));
++
++/*
++ * RMA Response Header
++ */
++struct rma_rsp_hdr {
++ struct base_hdr hdr;
++ __be32 rma_id;
++ __be32 xfer_length;
++ __be32 error;
++} __attribute__ ((packed));
++
++/*
++ * MR Reg/Dereg Info Header
++ */
++struct reg_hdr {
++ struct base_hdr hdr;
++ __be64 scif_offset;
++ __be64 address;
++ __be32 length;
++ __be32 rkey;
++ __be32 access;
++} __attribute__ ((packed));
++
++/*
++ * SCIF endpoint close notification
++ */
++struct close_hdr {
++ struct base_hdr hdr;
++} __attribute__ ((packed));
++
++
++#define IBSCIF_CM_REQ 1
++#define IBSCIF_CM_REP 2
++#define IBSCIF_CM_REJ 3
++#define IBSCIF_CM_RTU 4
++
++/*
++ * RDMA CM Header
++ */
++
++struct cm_hdr {
++ struct base_hdr hdr;
++ __be64 req_ctx;
++ __be64 rep_ctx;
++ __be32 cmd;
++ __be32 port;
++ __be32 qpn;
++ __be32 status;
++ __be32 plen;
++ u8 pdata[0];
++} __attribute__ ((packed));
++
++enum ibscif_reason { /* Set each value to simplify manual lookup. */
++
++ /* Local Events */
++ IBSCIF_REASON_USER_GENERATED = 0,
++ IBSCIF_REASON_CQ_COMPLETION = 1,
++ IBSCIF_REASON_NIC_FATAL = 2,
++ IBSCIF_REASON_NIC_REMOVED = 3,
++
++ /* Disconnect Event */
++ IBSCIF_REASON_DISCONNECT = 4,
++
++ /* CQ Error */
++ IBSCIF_REASON_CQ_OVERRUN = 5,
++ IBSCIF_REASON_CQ_FATAL = 6,
++
++ /* QP Errors */
++ IBSCIF_REASON_QP_SQ_ERROR = 7,
++ IBSCIF_REASON_QP_RQ_ERROR = 8,
++ IBSCIF_REASON_QP_DESTROYED = 9,
++ IBSCIF_REASON_QP_ERROR = 10,
++ IBSCIF_REASON_QP_FATAL = 11,
++
++ /* Operation Errors */
++ IBSCIF_REASON_INVALID_OPCODE = 12,
++ IBSCIF_REASON_INVALID_LENGTH = 13,
++ IBSCIF_REASON_INVALID_QP = 14,
++ IBSCIF_REASON_INVALID_MSG_ID = 15,
++ IBSCIF_REASON_INVALID_LKEY = 16,
++ IBSCIF_REASON_INVALID_RDMA_RKEY = 17,
++ IBSCIF_REASON_INVALID_RDMA_ID = 18,
++ IBSCIF_REASON_INVALID_ATOMIC_RKEY = 19,
++ IBSCIF_REASON_INVALID_ATOMIC_ID = 20,
++ IBSCIF_REASON_MAX_IR_EXCEEDED = 21,
++ IBSCIF_REASON_ACK_TIMEOUT = 22,
++
++ /* Protection Errors */
++ IBSCIF_REASON_PROTECTION_VIOLATION = 23,
++ IBSCIF_REASON_BOUNDS_VIOLATION = 24,
++ IBSCIF_REASON_ACCESS_VIOLATION = 25,
++ IBSCIF_REASON_WRAP_ERROR = 26
++};
++
++union ibscif_pdu {
++ struct base_hdr hdr;
++ struct ud_hdr ud;
++ struct send_hdr send;
++ struct write_hdr write;
++ struct read_req_hdr read_req;
++ struct read_rsp_hdr read_rsp;
++ struct comp_swap_hdr comp_swap;
++ struct fetch_add_hdr fetch_add;
++ struct atomic_rsp_hdr atomic_rsp;
++ struct ack_hdr ack;
++ struct disconnect_hdr disconnect;
++ struct rma_rsp_hdr rma_rsp;
++ struct reg_hdr reg;
++ struct close_hdr close;
++ struct cm_hdr cm;
++};
++
++struct ibscif_full_frame {
++ union ibscif_pdu ibscif;
++};
++
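++/*
++ * Wrap-safe sequence comparisons: the signed 32-bit difference gives the
++ * correct ordering as long as the two values are within 2^31 of each
++ * other.
++ */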
++static inline int seq_before(u32 seq1, u32 seq2)
++{
++ return (s32)(seq1 - seq2) < 0;
++}
++
++static inline int seq_after(u32 seq1, u32 seq2)
++{
++ return (s32)(seq2 - seq1) < 0;
++}
++
++static inline int seq_between(u32 seq_target, u32 seq_low, u32 seq_high)
++{
++ return seq_high - seq_low >= seq_target - seq_low;
++}
++
++static inline u32 seq_window(u32 earlier, u32 later)
++{
++ return earlier > later ? ((u32)~0 - earlier) + later : later - earlier;
++}
++
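++/*
++ * tx_unacked_window: PDUs sent but not yet acknowledged by the peer.
++ * rx_window: PDUs received in sequence but not yet acknowledged locally.
++ * tx_window: remaining send credit out of the configured window_size.
++ */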
++#define ibscif_tx_unacked_window(tx) seq_window((tx)->last_ack_seq_recvd, (tx)->next_seq - 1)
++
++#define ibscif_rx_window(rx) seq_window((rx)->last_seq_acked, (rx)->last_in_seq)
++
++#define ibscif_tx_window(tx) ((u32)window_size - ibscif_tx_unacked_window(tx))
++
++#endif /* IBSCIF_PROTOCOL_H */
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_provider.c a8/drivers/infiniband/hw/scif/ibscif_provider.c
+--- a7/drivers/infiniband/hw/scif/ibscif_provider.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_provider.c 2015-02-23 10:14:37.488809663 -0800
+@@ -0,0 +1,406 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static int ibscif_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
++{
++ memset(attr, 0, sizeof *attr);
++
++ attr->vendor_id = VENDOR_ID;
++ attr->vendor_part_id = DEVICE_ID;
++ attr->hw_ver = HW_REV;
++ attr->fw_ver = FW_REV;
++ attr->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT;
++ attr->max_mr_size = MAX_MR_SIZE;
++ attr->page_size_cap = PAGE_SIZE;
++ attr->max_qp = MAX_QPS;
++ attr->max_qp_wr = MAX_QP_SIZE;
++ attr->max_sge = MAX_SGES;
++ attr->max_cq = MAX_CQS;
++ attr->max_cqe = MAX_CQ_SIZE;
++ attr->max_mr = MAX_MRS;
++ attr->max_pd = MAX_PDS;
++ attr->max_qp_rd_atom = MAX_IR>255 ? 255 : MAX_IR;
++ attr->max_qp_init_rd_atom = MAX_OR>255 ? 255 : MAX_OR;
++ attr->max_res_rd_atom = MAX_IR>255 ? 255 : MAX_IR;
++ attr->atomic_cap = IB_ATOMIC_HCA;
++ attr->sys_image_guid = ibdev->node_guid;
++
++ return 0;
++}
++
++static int ibscif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr)
++{
++ struct ibscif_dev *dev = to_dev(ibdev);
++
++ memset(attr, 0, sizeof *attr);
++
++ /* See IB Spec r1.2 Table 145 for physical port state values. */
++ attr->lid = IBSCIF_NODE_ID_TO_LID(dev->node_id);
++ attr->sm_lid = 1;
++ attr->gid_tbl_len = 1;
++ attr->pkey_tbl_len = 1;
++ attr->max_msg_sz = MAX_MR_SIZE;
++ attr->phys_state = 5; /* LinkUp */
++ attr->state = IB_PORT_ACTIVE;
++ attr->max_mtu = IB_MTU_4096;
++ attr->active_mtu = IB_MTU_4096;
++ attr->active_width = IB_WIDTH_4X;
++ attr->active_speed = 4;
++ attr->max_vl_num = 1;
++ attr->port_cap_flags = IB_PORT_SM_DISABLED;
++
++ return 0;
++}
++
++static int ibscif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
++{
++ *pkey = 0xffff; /* IB_DEFAULT_PKEY_FULL */
++ return 0;
++}
++
++static int ibscif_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *ibgid)
++{
++ struct ibscif_dev *dev = to_dev(ibdev);
++
++ memcpy(ibgid, &dev->gid, sizeof(*ibgid));
++ return 0;
++}
++
++static struct ib_ucontext *ibscif_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata)
++{
++ struct ib_ucontext *context = kzalloc(sizeof *context, GFP_KERNEL);
++ return (!context) ? ERR_PTR(-ENOMEM) : context;
++}
++
++static int ibscif_dealloc_ucontext(struct ib_ucontext *context)
++{
++ kfree(context);
++ return 0;
++}
++
++static void ibscif_generate_eui64(struct ibscif_dev *dev, u8 *eui64)
++{
++ memcpy(eui64, dev->netdev->dev_addr, 3);
++ eui64[3] = 0xFF;
++ eui64[4] = 0xFE;
++ memcpy(eui64+5, dev->netdev->dev_addr+3, 3);
++}
++
++static int ibscif_register_device(struct ibscif_dev *dev)
++{
++ strncpy(dev->ibdev.node_desc, DRV_SIGNON, sizeof dev->ibdev.node_desc);
++ ibscif_generate_eui64(dev, (u8 *)&dev->ibdev.node_guid);
++ dev->ibdev.owner = THIS_MODULE;
++ dev->ibdev.uverbs_abi_ver = UVERBS_ABI_VER;
++ dev->ibdev.uverbs_cmd_mask =
++ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
++ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
++ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
++ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
++ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
++ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
++ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
++ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
++ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
++ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
++ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
++ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
++ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
++ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
++ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
++ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
++ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
++ (1ull << IB_USER_VERBS_CMD_REG_MR) |
++ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
++ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
++ (1ull << IB_USER_VERBS_CMD_POST_RECV);
++#if defined(MOFED) && !defined(MOFED_2_1)
++ dev->ibdev.node_type = new_ib_type ? RDMA_EXP_NODE_MIC : RDMA_NODE_RNIC;
++#else
++ dev->ibdev.node_type = new_ib_type ? RDMA_NODE_MIC : RDMA_NODE_RNIC;
++#endif
++ dev->ibdev.phys_port_cnt = 1;
++
++ dev->ibdev.query_device = ibscif_query_device; // Mandatory
++ dev->ibdev.num_comp_vectors = 1; // Mandatory
++ dev->ibdev.query_port = ibscif_query_port; // Mandatory
++ dev->ibdev.query_pkey = ibscif_query_pkey; // Mandatory
++ dev->ibdev.query_gid = ibscif_query_gid; // Mandatory
++ dev->ibdev.alloc_ucontext = ibscif_alloc_ucontext; // Required
++ dev->ibdev.dealloc_ucontext = ibscif_dealloc_ucontext; // Required
++ dev->ibdev.alloc_pd = ibscif_alloc_pd; // Mandatory
++ dev->ibdev.dealloc_pd = ibscif_dealloc_pd; // Mandatory
++ dev->ibdev.create_ah = ibscif_create_ah; // Mandatory
++ dev->ibdev.destroy_ah = ibscif_destroy_ah; // Mandatory
++ dev->ibdev.create_qp = ibscif_create_qp; // Mandatory
++ dev->ibdev.query_qp = ibscif_query_qp; // Optional
++ dev->ibdev.modify_qp = ibscif_modify_qp; // Mandatory
++ dev->ibdev.destroy_qp = ibscif_destroy_qp; // Mandatory
++ dev->ibdev.create_cq = ibscif_create_cq; // Mandatory
++ dev->ibdev.resize_cq = ibscif_resize_cq; // Optional
++ dev->ibdev.destroy_cq = ibscif_destroy_cq; // Mandatory
++ dev->ibdev.poll_cq = ibscif_poll_cq; // Mandatory
++ dev->ibdev.req_notify_cq = ibscif_arm_cq; // Mandatory
++ dev->ibdev.get_dma_mr = ibscif_get_dma_mr; // Mandatory
++ dev->ibdev.reg_phys_mr = ibscif_reg_phys_mr; // Required
++ dev->ibdev.reg_user_mr = ibscif_reg_user_mr; // Required
++ dev->ibdev.dereg_mr = ibscif_dereg_mr; // Mandatory
++ dev->ibdev.post_send = ibscif_post_send; // Mandatory
++ dev->ibdev.post_recv = ibscif_post_receive; // Mandatory
++ dev->ibdev.dma_ops = &ibscif_dma_mapping_ops; // ??
++
++ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
++ if (!dev->ibdev.iwcm)
++ return -ENOMEM;
++
++ dev->ibdev.iwcm->connect = ibscif_cm_connect;
++ dev->ibdev.iwcm->accept = ibscif_cm_accept;
++ dev->ibdev.iwcm->reject = ibscif_cm_reject;
++ dev->ibdev.iwcm->create_listen = ibscif_cm_create_listen;
++ dev->ibdev.iwcm->destroy_listen = ibscif_cm_destroy_listen;
++ dev->ibdev.iwcm->add_ref = ibscif_cm_add_ref;
++ dev->ibdev.iwcm->rem_ref = ibscif_cm_rem_ref;
++ dev->ibdev.iwcm->get_qp = ibscif_cm_get_qp;
++
++ return ib_register_device(&dev->ibdev, NULL);
++}
++
++static void ibscif_dev_release(struct device *dev)
++{
++ kfree(dev);
++}
++
++/*
++ * Hold devlist_mutex during this call for synchronization as needed.
++ * Upon return, dev is invalid.
++ */
++static void ibscif_remove_dev(struct ibscif_dev *dev)
++{
++ struct ibscif_conn *conn, *next;
++
++ if (dev->ibdev.reg_state == IB_DEV_REGISTERED)
++ ib_unregister_device(&dev->ibdev);
++
++ WARN_ON(!list_empty(&dev->wq_list));
++
++ down(&devlist_mutex);
++ list_del(&dev->entry);
++ up(&devlist_mutex);
++
++ ibscif_refresh_pollep_list();
++
++ down(&dev->mutex);
++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry) {
++ scif_close(conn->ep);
++ list_del(&conn->entry);
++ kfree(conn);
++ }
++ up(&dev->mutex);
++
++ if (dev->listen_ep)
++ scif_close(dev->listen_ep);
++ ibscif_procfs_remove_dev(dev);
++
++ dev_put(dev->netdev);
++ device_unregister(dev->ibdev.dma_device);
++ ib_dealloc_device(&dev->ibdev);
++}
++
++static void ibscif_remove_one(struct net_device *netdev)
++{
++ struct ibscif_dev *dev, *next;
++
++ list_for_each_entry_safe(dev, next, &devlist, entry) {
++ if (netdev == dev->netdev) {
++ ibscif_remove_dev(dev);
++ break;
++ }
++ }
++}
++
++static int node_cnt;
++static uint16_t node_ids[IBSCIF_MAX_DEVICES];
++static uint16_t my_node_id;
++
++static void ibscif_add_one(struct net_device *netdev)
++{
++ static int dev_cnt;
++ static dma_addr_t dma_mask = -1;
++ struct ibscif_dev *dev;
++ int ret;
++
++ dev = (struct ibscif_dev *)ib_alloc_device(sizeof *dev);
++ if (!dev) {
++ printk(KERN_ALERT PFX "%s: fail to allocate ib_device\n", __func__);
++ return;
++ }
++
++ INIT_LIST_HEAD(&dev->conn_list);
++ INIT_LIST_HEAD(&dev->mr_list);
++ init_MUTEX(&dev->mr_list_mutex);
++ init_MUTEX(&dev->mutex);
++ spin_lock_init(&dev->atomic_op);
++ INIT_LIST_HEAD(&dev->wq_list);
++ atomic_set(&dev->available, 256); /* FIXME */
++
++ dev_hold(netdev);
++ dev->netdev = netdev;
++
++ /* use the MAC address of the netdev as the GID so that RDMA CM can
++ * find the ibdev from the IP address associated with the netdev.
++ */
++ memcpy(&dev->gid, dev->netdev->dev_addr, ETH_ALEN);
++
++ dev->ibdev.dma_device = kzalloc(sizeof *dev->ibdev.dma_device, GFP_KERNEL);
++ if (!dev->ibdev.dma_device) {
++ printk(KERN_ALERT PFX "%s: fail to allocate dma_device\n", __func__);
++ goto out_free_ibdev;
++ }
++
++ snprintf(dev->name, IBSCIF_NAME_SIZE, "scif_dma_%d", dev_cnt);
++ snprintf(dev->ibdev.name, IB_DEVICE_NAME_MAX, "scif%d", dev_cnt++);
++ dev->ibdev.dma_device->release = ibscif_dev_release;
++ dev->ibdev.dma_device->init_name = dev->name;
++ dev->ibdev.dma_device->dma_mask = &dma_mask;
++ ret = device_register(dev->ibdev.dma_device);
++ if (ret) {
++ printk(KERN_ALERT PFX "%s: fail to register dma_device, ret=%d\n", __func__, ret);
++ kfree(dev->ibdev.dma_device);
++ goto out_free_ibdev;
++ }
++
++ /* Note: set up the listen endpoint before inserting into the devlist */
++
++ dev->listen_ep = scif_open();
++ if (!dev->listen_ep || IS_ERR(dev->listen_ep)) {
++ printk(KERN_ALERT PFX "%s: scif_open returns %ld\n", __func__, PTR_ERR(dev->listen_ep));
++ goto out_unreg_dmadev;
++ }
++
++ ret = scif_get_nodeIDs(node_ids, IBSCIF_MAX_DEVICES, &my_node_id);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_get_nodeIDS returns %d\n",
++ __func__, ret);
++ goto out_close_ep;
++ }
++
++ node_cnt = ret;
++ dev->node_id = my_node_id;
++ printk(KERN_ALERT PFX "%s: my node_id is %d\n", __func__, dev->node_id);
++
++ ret = scif_bind(dev->listen_ep, SCIF_OFED_PORT_0);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_bind returns %d, port=%d\n",
++ __func__, ret, SCIF_OFED_PORT_0);
++ goto out_close_ep;
++ }
++
++ ret = scif_listen(dev->listen_ep, IBSCIF_MAX_DEVICES);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_listen returns %d\n", __func__, ret);
++ goto out_close_ep;
++ }
++
++ down(&devlist_mutex);
++ list_add_tail(&dev->entry, &devlist);
++ up(&devlist_mutex);
++
++ if (ibscif_register_device(dev))
++ ibscif_remove_dev(dev);
++ else
++ ibscif_procfs_add_dev(dev);
++
++ ibscif_refresh_pollep_list();
++
++ return;
++
++out_close_ep:
++ scif_close(dev->listen_ep);
++
++out_unreg_dmadev:
++ device_unregister(dev->ibdev.dma_device); /* it will free the memory, too */
++
++out_free_ibdev:
++ ib_dealloc_device(&dev->ibdev);
++}
++
++static int ibscif_notifier(struct notifier_block *nb, unsigned long event, void *ptr)
++{
++ struct net_device *netdev = (struct net_device *)ptr;
++
++ if (strcmp(netdev->name, "mic0"))
++ return NOTIFY_DONE;
++
++ switch(event) {
++ case NETDEV_REGISTER:
++ ibscif_add_one(netdev);
++ ibscif_protocol_init_post();
++ break;
++
++ case NETDEV_UNREGISTER:
++ ibscif_remove_one(netdev);
++ break;
++
++ default:
++ /* only the MAC address is of interest; ignore all other notifications */
++ break;
++ }
++
++ return NOTIFY_DONE;
++}
++
++static struct notifier_block ibscif_notifier_block = {
++ .notifier_call = ibscif_notifier,
++};
++
++int ibscif_dev_init(void)
++{
++ int err = 0;
++
++ ibscif_protocol_init_pre();
++
++ err = register_netdevice_notifier(&ibscif_notifier_block);
++ if (err)
++ ibscif_protocol_cleanup();
++
++ return err;
++}
++
++void ibscif_dev_cleanup(void)
++{
++ struct ibscif_dev *dev, *next;
++
++ ibscif_protocol_cleanup();
++ unregister_netdevice_notifier(&ibscif_notifier_block);
++ list_for_each_entry_safe(dev, next, &devlist, entry)
++ ibscif_remove_dev(dev);
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_qp.c a8/drivers/infiniband/hw/scif/ibscif_qp.c
+--- a7/drivers/infiniband/hw/scif/ibscif_qp.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_qp.c 2015-02-23 10:14:37.488809663 -0800
+@@ -0,0 +1,868 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static struct ibscif_wr *ibscif_alloc_wr(struct ibscif_wq *wq, int new_size, int bytes)
++{
++ if (new_size && (new_size != wq->size)) {
++ struct ibscif_wr *new_wr = vzalloc(bytes);
++ return new_wr ? new_wr : ERR_PTR(-ENOMEM);
++ }
++ return NULL;
++}
++
++static void ibscif_move_wr(struct ibscif_wq *wq, struct ibscif_wr *new_wr, int new_size)
++{
++ int i;
++
++ if (wq->size == new_size)
++ return;
++
++ for (i = 0; i < wq->depth; i++) {
++ memcpy(&new_wr[i], &wq->wr[wq->head], wq->wr_size);
++ wq->head = (wq->head + 1) % wq->size;
++ }
++
++ if (wq->wr) {
++ vfree(wq->wr);
++ }
++
++ wq->wr = new_wr;
++ wq->head = 0;
++ wq->tail = wq->depth;
++ wq->size = new_size;
++}
++
++/* Caller must provide proper synchronization. */
++static int ibscif_resize_qp(struct ibscif_qp *qp, int sq_size, int rq_size, int iq_size)
++{
++ struct ibscif_wr *new_sq, *new_rq, *new_iq;
++ int sq_bytes, rq_bytes, iq_bytes;
++ int old_npages, new_npages, err;
++
++ sq_bytes = PAGE_ALIGN(sq_size * qp->sq.wr_size);
++ rq_bytes = PAGE_ALIGN(rq_size * qp->rq.wr_size);
++ iq_bytes = PAGE_ALIGN(iq_size * qp->iq.wr_size);
++
++ sq_size = sq_bytes / qp->sq.wr_size;
++ rq_size = rq_bytes / qp->rq.wr_size;
++ iq_size = iq_bytes / qp->iq.wr_size;
++
++ if ((sq_size == qp->sq.size) &&
++ (rq_size == qp->rq.size) &&
++ (iq_size == qp->iq.size))
++ return 0;
++
++ if ((sq_size < qp->sq.depth) ||
++ (rq_size < qp->rq.depth) ||
++ (iq_size < qp->iq.depth))
++ return -EINVAL;
++
++ /* Calculate the number of new pages required for this allocation. */
++ new_npages = (sq_bytes + rq_bytes + iq_bytes) >> PAGE_SHIFT;
++ old_npages = (PAGE_ALIGN(qp->sq.size * qp->sq.wr_size) +
++ PAGE_ALIGN(qp->rq.size * qp->rq.wr_size) +
++ PAGE_ALIGN(qp->iq.size * qp->iq.wr_size)) >> PAGE_SHIFT;
++ new_npages -= old_npages;
++
++ if (new_npages > 0) {
++ err = ibscif_reserve_quota(&new_npages);
++ if (err)
++ return err;
++ }
++
++ new_sq = ibscif_alloc_wr(&qp->sq, sq_size, sq_bytes);
++ new_rq = ibscif_alloc_wr(&qp->rq, rq_size, rq_bytes);
++ new_iq = ibscif_alloc_wr(&qp->iq, iq_size, iq_bytes);
++ if (IS_ERR(new_sq) || IS_ERR(new_rq) || IS_ERR(new_iq))
++ goto out;
++
++ ibscif_move_wr(&qp->sq, new_sq, sq_size);
++ ibscif_move_wr(&qp->rq, new_rq, rq_size);
++ ibscif_move_wr(&qp->iq, new_iq, iq_size);
++
++ if (new_npages < 0)
++ ibscif_release_quota(-new_npages);
++
++ return 0;
++out:
++ if (new_sq && !IS_ERR(new_sq))
++ vfree(new_sq);
++ if (new_rq && !IS_ERR(new_rq))
++ vfree(new_rq);
++ if (new_iq && !IS_ERR(new_iq))
++ vfree(new_iq);
++
++ return -ENOMEM;
++}
++
++static int ibscif_init_wqs(struct ibscif_qp *qp, struct ib_qp_init_attr *attr)
++{
++ spin_lock_init(&qp->sq.lock);
++ spin_lock_init(&qp->rq.lock);
++ spin_lock_init(&qp->iq.lock);
++
++ qp->sq.qp = qp;
++ qp->rq.qp = qp;
++ qp->iq.qp = qp;
++
++ qp->sq.wirestate = &qp->wire.sq;
++ qp->iq.wirestate = &qp->wire.iq;
++
++ qp->sq.max_sge = attr->cap.max_send_sge;
++ qp->rq.max_sge = attr->cap.max_recv_sge;
++ qp->iq.max_sge = 1;
++
++ qp->sq.wr_size = sizeof *qp->sq.wr + (sizeof *qp->sq.wr->ds_list * qp->sq.max_sge);
++ qp->rq.wr_size = sizeof *qp->rq.wr + (sizeof *qp->rq.wr->ds_list * qp->rq.max_sge);
++ qp->iq.wr_size = sizeof *qp->iq.wr + (sizeof *qp->iq.wr->ds_list * qp->iq.max_sge);
++
++ return ibscif_resize_qp(qp, attr->cap.max_send_wr, attr->cap.max_recv_wr, (rma_threshold==0x7FFFFFFF)?0:attr->cap.max_send_wr);
++}
++
++static void ibscif_reset_tx_state(struct ibscif_tx_state *tx)
++{
++ tx->next_seq = 1;
++ tx->last_ack_seq_recvd = 0;
++ tx->next_msg_id = 0;
++}
++
++static void ibscif_reset_rx_state(struct ibscif_rx_state *rx)
++{
++ rx->last_in_seq = 0;
++ rx->last_seq_acked = 0;
++ rx->defer_in_process = 0;
++}
++
++static void ibscif_reset_wirestate(struct ibscif_wirestate *wirestate)
++{
++ ibscif_reset_tx_state(&wirestate->tx);
++ ibscif_reset_rx_state(&wirestate->rx);
++}
++
++static void ibscif_reset_wire(struct ibscif_wire *wire)
++{
++ ibscif_reset_wirestate(&wire->sq);
++ ibscif_reset_wirestate(&wire->iq);
++}
++
++static void ibscif_init_wire(struct ibscif_wire *wire)
++{
++ ibscif_reset_wire(wire);
++}
++
++static void ibscif_query_qp_cap(struct ibscif_qp *qp, struct ib_qp_cap *cap)
++{
++ memset(cap, 0, sizeof *cap);
++ cap->max_send_wr = qp->sq.size;
++ cap->max_recv_wr = qp->rq.size;
++ cap->max_send_sge = qp->sq.max_sge;
++ cap->max_recv_sge = qp->rq.max_sge;
++}
++
++struct ib_qp *ibscif_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata)
++{
++ struct ibscif_dev *dev = to_dev(ibpd->device);
++ struct ibscif_qp *qp;
++ int err;
++
++ if ((attr->qp_type != IB_QPT_RC && attr->qp_type != IB_QPT_UD) ||
++ (attr->cap.max_send_wr > MAX_QP_SIZE) ||
++ (attr->cap.max_recv_wr > MAX_QP_SIZE) ||
++ (attr->cap.max_send_sge > MAX_SGES) ||
++ (attr->cap.max_recv_sge > MAX_SGES) ||
++ (attr->cap.max_send_wr && !attr->send_cq) ||
++ (attr->cap.max_recv_wr && !attr->recv_cq))
++ return ERR_PTR(-EINVAL);
++
++ if (!atomic_add_unless(&dev->qp_cnt, 1, MAX_QPS))
++ return ERR_PTR(-EAGAIN);
++
++ qp = kzalloc(sizeof *qp, GFP_KERNEL);
++ if (!qp) {
++ atomic_dec(&dev->qp_cnt);
++ return ERR_PTR(-ENOMEM);
++ }
++
++ qp->local_node_id = dev->node_id;
++
++ kref_init(&qp->ref);
++ init_completion(&qp->done);
++ init_MUTEX(&qp->modify_mutex);
++ spin_lock_init(&qp->lock);
++ ibscif_init_wire(&qp->wire);
++ qp->sq_policy = attr->sq_sig_type;
++ qp->dev = dev;
++ qp->mtu = IBSCIF_MTU; /* FIXME */
++ qp->state = QP_IDLE;
++
++ err = ibscif_init_wqs(qp, attr);
++ if (err)
++ goto out;
++
++ ibscif_query_qp_cap(qp, &attr->cap);
++
++ err = ibscif_wiremap_add(qp, &qp->ibqp.qp_num);
++ if (err)
++ goto out;
++
++ qp->magic = QP_MAGIC;
++
++ ibscif_scheduler_add_qp(qp);
++ qp->in_scheduler = 1;
++
++ return &qp->ibqp;
++out:
++ ibscif_destroy_qp(&qp->ibqp);
++ return ERR_PTR(err);
++}
++
++static inline enum ib_qp_state to_ib_qp_state(enum ibscif_qp_state state)
++{
++ switch (state) {
++ case QP_IDLE: return IB_QPS_INIT;
++ case QP_CONNECTED: return IB_QPS_RTS;
++ case QP_DISCONNECT: return IB_QPS_SQD;
++ case QP_ERROR: return IB_QPS_ERR;
++ case QP_RESET: return IB_QPS_RESET;
++ default: return -1;
++ }
++}
++
++static inline enum ibscif_qp_state to_ibscif_qp_state(enum ib_qp_state state)
++{
++ switch (state) {
++ case IB_QPS_INIT: return QP_IDLE;
++ case IB_QPS_RTS: return QP_CONNECTED;
++ case IB_QPS_SQD: return QP_DISCONNECT;
++ case IB_QPS_ERR: return QP_ERROR;
++ case IB_QPS_RESET: return QP_RESET;
++ case IB_QPS_RTR: return QP_IGNORE;
++ default: return -1;
++ }
++}
++
++/* Caller must provide proper synchronization. */
++static void __ibscif_query_qp(struct ibscif_qp *qp, struct ib_qp_attr *attr, struct ib_qp_init_attr *init_attr)
++{
++ struct ib_qp_cap cap;
++
++ ibscif_query_qp_cap(qp, &cap);
++
++ if (attr) {
++ attr->qp_state = to_ib_qp_state(qp->state);
++ attr->cur_qp_state = attr->qp_state;
++ attr->port_num = 1;
++ attr->path_mtu = qp->mtu;
++ attr->dest_qp_num = qp->remote_qpn;
++ attr->qp_access_flags = qp->access;
++ attr->max_rd_atomic = qp->max_or;
++ attr->max_dest_rd_atomic = qp->iq.size;
++ attr->cap = cap;
++ }
++
++ if (init_attr) {
++ init_attr->qp_type = qp->ibqp.qp_type;
++ init_attr->sq_sig_type = qp->sq_policy;
++ init_attr->cap = cap;
++ }
++}
++
++int ibscif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr)
++{
++ struct ibscif_qp *qp = to_qp(ibqp);
++
++ memset(attr, 0, sizeof *attr);
++ memset(init_attr, 0, sizeof *init_attr);
++
++ spin_lock_bh(&qp->lock);
++ __ibscif_query_qp(qp, attr, init_attr);
++ spin_unlock_bh(&qp->lock);
++
++ return 0;
++}
++
++static int ibscif_flush_wq(struct ibscif_wq *wq, struct ibscif_cq *cq)
++{
++ struct ibscif_wr *wr;
++ struct ibscif_wc *wc;
++ int i, num_wr, err;
++
++ /* Prevent divide by zero traps on wrap math. */
++ if (!wq->size)
++ return 0;
++
++ spin_lock_bh(&wq->lock);
++ for (i = (wq->head + wq->completions) % wq->size, num_wr = 0;
++ wq->depth && (wq->completions != wq->depth);
++ i = (i + 1) % wq->size, num_wr++) {
++
++ wr = ibscif_get_wr(wq, i);
++
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++
++ if (!cq) {
++ wq->completions++;
++ continue;
++ }
++
++ err = ibscif_reserve_cqe(cq, &wc);
++ if (err) {
++ num_wr = err;
++ break;
++ }
++
++ wc->ibwc.qp = &wq->qp->ibqp;
++ wc->ibwc.src_qp = wq->qp->remote_qpn;
++ wc->ibwc.wr_id = wr->id;
++ wc->ibwc.opcode = is_rq(wq) ? IB_WC_RECV : to_ib_wc_opcode(wr->opcode);
++ wc->ibwc.status = IB_WC_WR_FLUSH_ERR;
++ wc->ibwc.ex.imm_data = 0;
++ wc->ibwc.byte_len = 0;
++ wc->ibwc.port_num = 1;
++
++ wc->wq = wq;
++ wc->reap = wq->reap + 1;
++ wq->reap = 0;
++ wq->completions++;
++
++ ibscif_append_cqe(cq, wc, 0);
++ }
++ spin_unlock_bh(&wq->lock);
++
++ if (num_wr && cq)
++ ibscif_notify_cq(cq);
++
++ return num_wr;
++}
++
++static void ibscif_flush_wqs(struct ibscif_qp *qp)
++{
++ int ret;
++
++ ret = ibscif_flush_wq(&qp->sq, to_cq(qp->ibqp.send_cq));
++ if (ret) /* A clean SQ flush should have done nothing. */
++ qp->state = QP_ERROR;
++
++ ret = ibscif_flush_wq(&qp->rq, to_cq(qp->ibqp.recv_cq));
++ if (ret < 0)
++ qp->state = QP_ERROR;
++
++ ibscif_flush_wq(&qp->iq, NULL);
++}
++
++static void ibscif_reset_wq(struct ibscif_wq *wq, struct ibscif_cq *cq)
++{
++ ibscif_clear_cqes(cq, wq);
++
++ wq->head = 0;
++ wq->tail = 0;
++ wq->depth = 0;
++ wq->reap = 0;
++ wq->next_wr = 0;
++ wq->next_msg_id = 0;
++ wq->completions = 0;
++}
++
++static void ibscif_reset_wqs(struct ibscif_qp *qp)
++{
++ ibscif_reset_wq(&qp->sq, to_cq(qp->ibqp.send_cq));
++ ibscif_reset_wq(&qp->rq, to_cq(qp->ibqp.recv_cq));
++ ibscif_reset_wq(&qp->iq, NULL);
++}
++
++static void ibscif_qp_event(struct ibscif_qp *qp, enum ib_event_type event)
++{
++ if (qp->ibqp.event_handler) {
++ struct ib_event record;
++ record.event = event;
++ record.device = qp->ibqp.device;
++ record.element.qp = &qp->ibqp;
++ qp->ibqp.event_handler(&record, qp->ibqp.qp_context);
++ }
++}
++
++/* Caller must provide proper synchronization. */
++static void ibscif_qp_error(struct ibscif_qp *qp)
++{
++ if (qp->state == QP_ERROR)
++ return;
++
++ if (qp->state == QP_CONNECTED)
++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT);
++
++ qp->state = QP_ERROR;
++
++ ibscif_flush_wqs(qp);
++
++ ibscif_cm_async_callback(qp->cm_context);
++ qp->cm_context = NULL;
++
++ /* don't generate the error event because transitioning to IB_QPS_ERR
++ * state is normal when a QP is disconnected */
++
++ //ibscif_qp_event(qp, IB_EVENT_QP_FATAL);
++}
++
++/* Caller must provide proper synchronization. */
++static void ibscif_qp_reset(struct ibscif_qp *qp)
++{
++ if (qp->state == QP_RESET)
++ return;
++
++ if (qp->state == QP_CONNECTED)
++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT);
++
++ ibscif_reset_wqs(qp);
++ ibscif_reset_wire(&qp->wire);
++
++ ibscif_cm_async_callback(qp->cm_context);
++ qp->cm_context = NULL;
++
++ qp->state = QP_RESET;
++}
++
++/* Caller must provide proper synchronization. */
++void ibscif_qp_idle(struct ibscif_qp *qp)
++{
++ if (qp->state == QP_IDLE)
++ return;
++
++ ibscif_reset_wqs(qp);
++ ibscif_reset_wire(&qp->wire);
++
++ qp->state = QP_IDLE;
++}
++
++/* Caller must provide proper synchronization. */
++static void ibscif_qp_connect(struct ibscif_qp *qp, enum ibscif_qp_state cur_state)
++{
++ if (cur_state == QP_CONNECTED)
++ return;
++
++ qp->loopback = (qp->ibqp.qp_type != IB_QPT_UD) && !scif_loopback && (qp->local_node_id == qp->remote_node_id);
++ qp->conn = NULL;
++
++ qp->state = QP_CONNECTED;
++}
++
++/* Caller must provide proper synchronization. */
++static void ibscif_qp_local_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ if (qp->state != QP_CONNECTED)
++ return;
++
++ if (reason != IBSCIF_REASON_DISCONNECT)
++ printk(KERN_NOTICE PFX "QP %u sending abnormal disconnect %d\n",
++ qp->ibqp.qp_num, reason);
++
++ qp->state = QP_DISCONNECT;
++ ibscif_send_disconnect(qp, reason);
++
++ ibscif_flush_wqs(qp);
++
++ ibscif_cm_async_callback(qp->cm_context);
++ qp->cm_context = NULL;
++
++ if (reason != IBSCIF_REASON_DISCONNECT) {
++ qp->state = QP_ERROR;
++ ibscif_qp_event(qp, IB_EVENT_QP_FATAL);
++ } else
++ ibscif_qp_idle(qp);
++}
++
++void ibscif_qp_internal_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ spin_lock_bh(&qp->lock);
++ ibscif_qp_local_disconnect(qp, reason);
++ spin_unlock_bh(&qp->lock);
++}
++
++void ibscif_qp_remote_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason)
++{
++ if (reason != IBSCIF_REASON_DISCONNECT)
++ printk(KERN_NOTICE PFX "QP %u received abnormal disconnect %d\n",
++ qp->ibqp.qp_num, reason);
++
++ if (qp->loopback) {
++ /*
++ * Prevent simultaneous loopback QP disconnect deadlocks.
++ * This is no worse than dropping a disconnect packet.
++ */
++ if (!spin_trylock_bh(&qp->lock))
++ return;
++ } else
++ spin_lock_bh(&qp->lock);
++
++ if (qp->state != QP_CONNECTED) {
++ spin_unlock_bh(&qp->lock);
++ return;
++ }
++
++ ibscif_flush_wqs(qp);
++
++ ibscif_cm_async_callback(qp->cm_context);
++ qp->cm_context = NULL;
++
++ if (reason != IBSCIF_REASON_DISCONNECT) {
++ qp->state = QP_ERROR;
++ ibscif_qp_event(qp, IB_EVENT_QP_FATAL);
++ } else
++ qp->state = QP_IDLE;
++
++ spin_unlock_bh(&qp->lock);
++}
++
++#define MODIFY_ALLOWED 1
++#define MODIFY_INVALID 0
++#define VALID_TRANSITION(next_state, modify_allowed) { 1, modify_allowed },
++#define INVAL_TRANSITION(next_state) { 0, MODIFY_INVALID },
++#define START_STATE(current_state) {
++#define CEASE_STATE(current_state) },
++
++static const struct {
++
++ int valid;
++ int modify_allowed;
++
++} qp_transition[NR_QP_STATES][NR_QP_STATES] = {
++
++ START_STATE(QP_IDLE)
++ VALID_TRANSITION( QP_IDLE, MODIFY_ALLOWED )
++ VALID_TRANSITION( QP_CONNECTED, MODIFY_ALLOWED )
++ INVAL_TRANSITION( QP_DISCONNECT )
++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID )
++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID )
++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED )
++ CEASE_STATE(QP_IDLE)
++
++ START_STATE(QP_CONNECTED)
++ INVAL_TRANSITION( QP_IDLE )
++ VALID_TRANSITION( QP_CONNECTED, MODIFY_INVALID )
++ VALID_TRANSITION( QP_DISCONNECT, MODIFY_INVALID )
++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID )
++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID )
++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED )
++ CEASE_STATE(QP_CONNECTED)
++
++ START_STATE(QP_DISCONNECT) /* Automatic transition to IDLE */
++ INVAL_TRANSITION( QP_IDLE )
++ INVAL_TRANSITION( QP_CONNECTED )
++ INVAL_TRANSITION( QP_DISCONNECT )
++ INVAL_TRANSITION( QP_ERROR )
++ INVAL_TRANSITION( QP_RESET )
++ INVAL_TRANSITION( QP_IGNORE )
++ CEASE_STATE(QP_DISCONNECT)
++
++ START_STATE(QP_ERROR)
++ VALID_TRANSITION( QP_IDLE, MODIFY_INVALID )
++ INVAL_TRANSITION( QP_CONNECTED )
++ INVAL_TRANSITION( QP_DISCONNECT )
++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID )
++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID )
++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED )
++ CEASE_STATE(QP_ERROR)
++
++ START_STATE(QP_RESET)
++ VALID_TRANSITION( QP_IDLE, MODIFY_ALLOWED )
++ INVAL_TRANSITION( QP_CONNECTED )
++ INVAL_TRANSITION( QP_DISCONNECT )
++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID )
++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID )
++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED )
++ CEASE_STATE(QP_RESET)
++};
++
++int ibscif_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata)
++{
++ struct ibscif_qp *qp = to_qp(ibqp);
++ enum ibscif_qp_state cur_state, new_state;
++ int sq_size, rq_size, max_or, max_ir;
++ int err = -EINVAL;
++
++ /*
++ * The mutex prevents simultaneous user-mode QP modifications.
++ */
++ down(&qp->modify_mutex);
++
++ cur_state = qp->state;
++
++ if ((attr_mask & IB_QP_CUR_STATE) && (to_ibscif_qp_state(attr->cur_qp_state) != cur_state))
++ goto out;
++ if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > 1))
++ goto out;
++
++ /* Validate any state transition. */
++ if (attr_mask & IB_QP_STATE) {
++ new_state = to_ibscif_qp_state(attr->qp_state);
++ if (new_state < 0 || new_state >= NR_QP_STATES)
++ goto out;
++
++ if (!qp_transition[cur_state][new_state].valid)
++ goto out;
++ } else
++ new_state = cur_state;
++
++ /* Validate any attribute modify request. */
++ if (attr_mask & (IB_QP_AV |
++ IB_QP_CAP |
++ IB_QP_DEST_QPN |
++ IB_QP_ACCESS_FLAGS |
++ IB_QP_MAX_QP_RD_ATOMIC |
++ IB_QP_MAX_DEST_RD_ATOMIC)) {
++
++ if (!qp_transition[cur_state][new_state].modify_allowed)
++ goto out;
++
++ if ((attr_mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH) && check_grh) {
++ int remote_node_id = IBSCIF_LID_TO_NODE_ID(attr->ah_attr.dlid);
++ struct ibscif_conn *conn;
++ union ib_gid *dgid;
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: %d-->%d, DGID=%llx:%llx\n",
++ __func__, qp->local_node_id, remote_node_id,
++ __be64_to_cpu(attr->ah_attr.grh.dgid.global.subnet_prefix),
++ __be64_to_cpu(attr->ah_attr.grh.dgid.global.interface_id));
++
++ if (remote_node_id == qp->local_node_id) {
++ dgid = &qp->dev->gid;
++ }
++ else {
++ spin_lock(&qp->lock);
++ conn = ibscif_get_conn(qp->local_node_id, remote_node_id, 0);
++ spin_unlock(&qp->lock);
++ if (!conn) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: failed to make SCIF connection %d-->%d.\n",
++ __func__, qp->local_node_id, remote_node_id);
++ goto out;
++ }
++ dgid = &conn->remote_gid;
++ ibscif_put_conn(conn);
++ }
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: local GID[%d]=%llx:%llx\n",
++ __func__, remote_node_id,
++ __be64_to_cpu(dgid->global.subnet_prefix),
++ __be64_to_cpu(dgid->global.interface_id));
++
++ if (memcmp(dgid, &attr->ah_attr.grh.dgid, sizeof(*dgid))) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: connecting to DGID outside the box is unsupported.\n",
++ __func__);
++ goto out;
++ }
++ }
++
++ if (attr_mask & IB_QP_CAP) {
++ sq_size = attr->cap.max_send_wr;
++ rq_size = attr->cap.max_recv_wr;
++ if ((sq_size > MAX_QP_SIZE) || (rq_size > MAX_QP_SIZE))
++ goto out;
++ } else {
++ sq_size = qp->sq.size;
++ rq_size = qp->rq.size;
++ }
++ if ((sq_size && !qp->ibqp.send_cq) || (rq_size && !qp->ibqp.recv_cq))
++ goto out;
++
++ max_or = (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) ?
++ attr->max_rd_atomic : qp->max_or;
++ max_ir = (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ?
++ attr->max_dest_rd_atomic : qp->iq.size;
++
++ if (rma_threshold<0x7FFFFFFF && max_ir>MAX_IR && max_ir>=qp->sq.size)
++ max_ir -= qp->sq.size;
++
++ if ((max_or > MAX_OR) || (max_ir > MAX_IR))
++ goto out;
++
++ /* Validation successful; resize the QP as needed. */
++ err = ibscif_resize_qp(qp, sq_size, rq_size, max_ir + ((rma_threshold==0x7FFFFFFF)?0:sq_size));
++ if (err)
++ goto out;
++
++ /* No failure paths below the QP resize. */
++
++ qp->max_or = max_or;
++
++ if (attr_mask & IB_QP_ACCESS_FLAGS)
++ qp->access = attr->qp_access_flags;
++
++ if (attr_mask & IB_QP_DEST_QPN)
++ qp->remote_qpn = attr->dest_qp_num;
++
++ if (attr_mask & IB_QP_AV)
++ qp->remote_node_id = IBSCIF_LID_TO_NODE_ID(attr->ah_attr.dlid);
++ }
++
++ err = 0;
++ if (attr_mask & IB_QP_STATE) {
++
++ /* Perform state change processing. */
++ spin_lock_bh(&qp->lock);
++ switch (new_state) {
++ case QP_IDLE:
++ ibscif_qp_idle(qp);
++ break;
++ case QP_CONNECTED:
++ ibscif_qp_connect(qp, cur_state);
++ break;
++ case QP_DISCONNECT:
++ ibscif_qp_local_disconnect(qp, IBSCIF_REASON_DISCONNECT);
++ break;
++ case QP_ERROR:
++ ibscif_qp_error(qp);
++ break;
++ case QP_RESET:
++ ibscif_qp_reset(qp);
++ break;
++ default:
++ break;
++ }
++ spin_unlock_bh(&qp->lock);
++
++ /* scif_connect() cannot be called with spin_lock_bh() held */
++ if (ibqp->qp_type != IB_QPT_UD && new_state == QP_CONNECTED && !qp->loopback) {
++ int flag = (qp->ibqp.qp_num > qp->remote_qpn);
++ spin_lock(&qp->lock);
++ qp->conn = ibscif_get_conn(qp->local_node_id, qp->remote_node_id, flag);
++ spin_unlock(&qp->lock);
++ }
++ }
++
++ __ibscif_query_qp(qp, attr, NULL);
++out:
++ up(&qp->modify_mutex);
++ return err;
++}
++
++void ibscif_complete_qp(struct kref *ref)
++{
++ struct ibscif_qp *qp = container_of(ref, struct ibscif_qp, ref);
++ complete(&qp->done);
++}
++
++int ibscif_destroy_qp(struct ib_qp *ibqp)
++{
++ struct ibscif_qp *qp = to_qp(ibqp);
++ struct ibscif_dev *dev = qp->dev;
++ int i, j;
++ struct ibscif_conn *conn[IBSCIF_MAX_DEVICES];
++
++ if (qp->cm_context) {
++ ibscif_cm_async_callback(qp->cm_context);
++ qp->cm_context = NULL;
++ }
++
++ if (ibqp->qp_num)
++ ibscif_wiremap_del(ibqp->qp_num);
++
++ if (qp->in_scheduler)
++ ibscif_scheduler_remove_qp(qp);
++
++ spin_lock_bh(&qp->lock);
++ if (qp->state == QP_CONNECTED)
++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT);
++ spin_unlock_bh(&qp->lock);
++
++ ibscif_put_qp(qp);
++ wait_for_completion(&qp->done);
++
++ ibscif_flush_wqs(qp);
++ ibscif_reset_wqs(qp);
++ ibscif_reset_wire(&qp->wire);
++
++ if (qp->sq.wr)
++ vfree(qp->sq.wr);
++ if (qp->rq.wr)
++ vfree(qp->rq.wr);
++ if (qp->iq.wr)
++ vfree(qp->iq.wr);
++
++ ibscif_release_quota((PAGE_ALIGN(qp->sq.size * qp->sq.wr_size) +
++ PAGE_ALIGN(qp->rq.size * qp->rq.wr_size) +
++ PAGE_ALIGN(qp->iq.size * qp->iq.wr_size)) >> PAGE_SHIFT);
++
++ atomic_dec(&dev->qp_cnt);
++
++ ibscif_put_conn(qp->conn);
++
++ if (qp->ibqp.qp_type == IB_QPT_UD) {
++ spin_lock_bh(&qp->lock);
++ for (i=0, j=0; i<IBSCIF_MAX_DEVICES; i++) {
++ if (qp->ud_conn[i]) {
++ conn[j++] = qp->ud_conn[i];
++ qp->ud_conn[i] = NULL;
++ }
++ }
++ spin_unlock_bh(&qp->lock);
++
++ /* ibscif_put_conn() may call scif_unregister(); do not hold a lock here */
++ for (i=0; i<j; i++)
++ ibscif_put_conn(conn[i]);
++ }
++
++ kfree(qp);
++ return 0;
++}
++
++void ibscif_qp_add_ud_conn(struct ibscif_qp *qp, struct ibscif_conn *conn)
++{
++ int i;
++
++ if (!qp || !conn)
++ return;
++
++ if (qp->ibqp.qp_type != IB_QPT_UD)
++ return;
++
++
++ spin_lock_bh(&qp->lock);
++
++ for (i=0; i<IBSCIF_MAX_DEVICES; i++) {
++ if (qp->ud_conn[i] == conn)
++ goto done;
++ }
++
++ for (i=0; i<IBSCIF_MAX_DEVICES; i++) {
++ if (qp->ud_conn[i] == NULL) {
++ atomic_inc(&conn->refcnt);
++ qp->ud_conn[i] = conn;
++ break;
++ }
++ }
++done:
++ spin_unlock_bh(&qp->lock);
++}
++
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_scheduler.c a8/drivers/infiniband/hw/scif/ibscif_scheduler.c
+--- a7/drivers/infiniband/hw/scif/ibscif_scheduler.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_scheduler.c 2015-02-23 10:14:37.488809663 -0800
+@@ -0,0 +1,195 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++static int ibscif_schedule_tx(struct ibscif_wq *wq, int max_send)
++{
++ struct ibscif_tx_state *tx = &wq->wirestate->tx;
++ struct ibscif_qp *qp = wq->qp;
++ struct ibscif_wr *wr;
++ int index, sent = 0;
++
++ while ((wq->next_wr != wq->tail) && ibscif_tx_window(tx) && max_send) {
++
++ index = wq->next_wr;
++ wr = ibscif_get_wr(wq, index);
++
++ /*
++ * Ack processing can reschedule a WR that is in retry; only process
++ * it if we are all caught up. Also, do not start a fenced WR until
++ * all prior RDMA read and atomic operations have completed.
++ */
++ if ((wr->flags & IB_SEND_FENCE) && atomic_read(&qp->or_depth) &&
++ (wr->state == WR_WAITING))
++ break;
++
++ switch (wr->opcode) {
++ case WR_RDMA_READ:
++ case WR_ATOMIC_CMP_AND_SWP:
++ case WR_ATOMIC_FETCH_AND_ADD:
++ /* Throttle IQ stream requests if needed. */
++ if (wr->state == WR_WAITING) {
++ if (atomic_read(&qp->or_depth) == qp->max_or)
++ return 0;
++ atomic_inc(&qp->or_depth);
++ }
++ /* Fall through. */
++ case WR_SEND:
++ case WR_SEND_WITH_IMM:
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ case WR_RDMA_READ_RSP:
++ case WR_ATOMIC_RSP:
++ case WR_RMA_RSP:
++ sent = ibscif_xmit_wr(wq, wr, min((u32)max_send, ibscif_tx_window(tx)),
++ 0, tx->next_seq, &tx->next_seq);
++ break;
++ case WR_UD:
++ sent = ibscif_xmit_wr(wq, wr, min((u32)max_send, ibscif_tx_window(tx)),
++ 0, 0, NULL);
++ break;
++ default:
++ printk(KERN_ERR PFX "%s() botch: found opcode %d on work queue\n",
++ __func__, wr->opcode);
++ return -EOPNOTSUPP;
++ }
++
++ /* If an IQ stream request did not get started we need to back off or_depth. */
++ if ((wr->state == WR_WAITING) &&
++ ((wr->opcode == WR_RDMA_READ) ||
++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) || (wr->opcode == WR_ATOMIC_FETCH_AND_ADD)))
++ atomic_dec(&qp->or_depth);
++
++ if (sent < 0)
++ return sent;
++
++ max_send -= sent;
++
++ /*
++ * The tx engine bumps next_wr when finished sending a whole WR.
++ * Bail if it didn't this time around.
++ */
++ if (wq->next_wr == index)
++ break;
++ }
++
++ return 0;
++}
++
++static int ibscif_schedule_wq(struct ibscif_wq *wq)
++{
++ int max_send, err = 0;
++ int need_call_sq_completions = 0;
++
++ /* Ignore loopback QPs that may be scheduled by retry processing. */
++ if (wq->qp->loopback)
++ return 0;
++
++ if (!(max_send = atomic_read(&wq->qp->dev->available)))
++ return -EBUSY;
++
++ spin_lock(&wq->lock);
++ err = ibscif_schedule_tx(wq, max_send);
++ need_call_sq_completions = wq->fast_rdma_completions;
++ wq->fast_rdma_completions = 0;
++ spin_unlock(&wq->lock);
++
++ if (unlikely(err))
++ ibscif_qp_internal_disconnect(wq->qp, IBSCIF_REASON_QP_FATAL);
++
++ if (fast_rdma && need_call_sq_completions)
++ ibscif_process_sq_completions(wq->qp);
++
++ return err;
++}
++
++void ibscif_schedule(struct ibscif_wq *wq)
++{
++ struct ibscif_dev *dev;
++ struct list_head processed;
++
++ if (wq->qp->loopback) {
++ ibscif_loopback(wq);
++ return;
++ }
++ dev = wq->qp->dev;
++
++ if (!ibscif_schedule_wq(wq))
++ goto out;
++
++ while (atomic_xchg(&dev->was_new, 0)) {
++ /* Bail if the device is busy. */
++ if (down_trylock(&dev->mutex))
++ goto out;
++
++ /*
++ * Schedule each WQ on the device and move it to the processed list.
++ * When complete, append the processed list to the device WQ list.
++ */
++ INIT_LIST_HEAD(&processed);
++ while (!list_empty(&dev->wq_list)) {
++ wq = list_entry(dev->wq_list.next, typeof(*wq), entry);
++ if (!ibscif_schedule_wq(wq)) {
++ DEV_STAT(dev, sched_exhaust++);
++ list_splice(&processed, dev->wq_list.prev);
++ up(&dev->mutex);
++ goto out;
++ }
++ list_move_tail(&wq->entry, &processed);
++ }
++ list_splice(&processed, dev->wq_list.prev);
++
++ up(&dev->mutex);
++ }
++ return;
++out:
++ atomic_inc(&dev->was_new);
++}
++
++void ibscif_scheduler_add_qp(struct ibscif_qp *qp)
++{
++ struct ibscif_dev *dev = qp->dev;
++
++ down(&dev->mutex);
++ list_add_tail(&qp->sq.entry, &dev->wq_list);
++ list_add_tail(&qp->iq.entry, &dev->wq_list);
++ up(&dev->mutex);
++}
++
++void ibscif_scheduler_remove_qp(struct ibscif_qp *qp)
++{
++ struct ibscif_dev *dev = qp->dev;
++
++ down(&dev->mutex);
++ list_del(&qp->sq.entry);
++ list_del(&qp->iq.entry);
++ up(&dev->mutex);
++}
+diff -urN a7/drivers/infiniband/hw/scif/ibscif_util.c a8/drivers/infiniband/hw/scif/ibscif_util.c
+--- a7/drivers/infiniband/hw/scif/ibscif_util.c 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/ibscif_util.c 2015-02-23 10:14:37.488809663 -0800
+@@ -0,0 +1,623 @@
++/*
++ * Copyright (c) 2008 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the
++ * GNU General Public License (GPL) Version 2, available from the
++ * file COPYING in the main directory of this source tree, or the
++ * OpenFabrics.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include "ibscif_driver.h"
++
++#define IBSCIF_CONN_IDLE 0
++#define IBSCIF_CONN_REQ_SENT 1
++#define IBSCIF_CONN_REQ_RCVD 2
++#define IBSCIF_CONN_ESTABLISHED 3
++#define IBSCIF_CONN_ACTIVE 4
++
++DEFINE_SPINLOCK(conn_state_lock);
++static int conn_state[IBSCIF_MAX_DEVICES][IBSCIF_MAX_DEVICES];
++
++#define IBSCIF_CONN_REP 1
++#define IBSCIF_CONN_REJ 2
++#define IBSCIF_CONN_ERR 3
++
++struct ibscif_conn_resp {
++ int cmd;
++ union ib_gid gid;
++};
++
++void ibscif_do_accept(struct ibscif_dev *dev)
++{
++ struct scif_portID peer;
++ scif_epd_t ep;
++ struct ibscif_conn *conn;
++ int ret;
++ struct ibscif_conn_resp resp;
++ int resp_size;
++
++ if (check_grh)
++ resp_size = sizeof(resp);
++ else
++ resp_size = sizeof(int);
++
++ ret = scif_accept(dev->listen_ep, &peer, &ep, SCIF_ACCEPT_SYNC);
++ if (ret) {
++ printk(KERN_ALERT PFX "%s: scif_accept returns %ld\n", __func__, PTR_ERR(ep));
++ return;
++ }
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: %d<--%d\n", __func__, dev->node_id, peer.node);
++
++ if (check_grh)
++ memcpy(&resp.gid, &dev->gid, sizeof(resp.gid));
++
++ spin_lock(&conn_state_lock);
++ switch (conn_state[dev->node_id][peer.node]) {
++ case IBSCIF_CONN_IDLE:
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD;
++ resp.cmd = IBSCIF_CONN_REP;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: no double connection, accepting\n", __func__);
++ break;
++
++ case IBSCIF_CONN_REQ_SENT:
++ /* A connection request has been sent, but no response yet. The node id is used to
++ * break the tie when both sides send connection requests. One side is allowed
++ * to accept the peer's request, and its own request will be rejected by the peer.
++ */
++ if (dev->node_id > peer.node) {
++ resp.cmd = IBSCIF_CONN_REJ;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: double connection, rejecting (peer will accept)\n", __func__);
++ }
++ else if (dev->node_id == peer.node) {
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD;
++ resp.cmd = IBSCIF_CONN_REP;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: loopback connection, accepting\n", __func__);
++ }
++ else {
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD;
++ resp.cmd = IBSCIF_CONN_REP;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: double connection, accepting (peer will reject)\n", __func__);
++ }
++ break;
++
++ case IBSCIF_CONN_REQ_RCVD:
++ if (verbose)
++ printk(KERN_INFO PFX "%s: duplicated connection request, rejecting\n", __func__);
++ resp.cmd = IBSCIF_CONN_REJ;
++ break;
++
++ case IBSCIF_CONN_ESTABLISHED:
++ case IBSCIF_CONN_ACTIVE:
++ if (verbose)
++ printk(KERN_INFO PFX "%s: already connected, rejecting\n", __func__);
++ resp.cmd = IBSCIF_CONN_REJ;
++ break;
++
++ default:
++ if (verbose)
++ printk(KERN_INFO PFX "%s: invalid state: %d\n", __func__, conn_state[dev->node_id][peer.node]);
++ resp.cmd = IBSCIF_CONN_ERR;
++ break;
++ }
++ spin_unlock(&conn_state_lock);
++
++ ret = scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_send returns %d\n", __func__, ret);
++ scif_close(ep);
++ return;
++ }
++
++ if (resp.cmd != IBSCIF_CONN_REP) {
++ /* one additional handshake to prevent the previous send from being trashed by the ep closing */
++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK);
++ scif_close(ep);
++ return;
++ }
++
++ if (check_grh) {
++ ret = scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_recv returns %d\n", __func__, ret);
++ scif_close(ep);
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_IDLE;
++ spin_unlock(&conn_state_lock);
++ return;
++ }
++ }
++
++ conn = kzalloc(sizeof (*conn), GFP_KERNEL);
++ if (!conn) {
++ printk(KERN_ALERT PFX "%s: cannot allocate connection context.\n", __func__);
++ scif_close(ep);
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_IDLE;
++ spin_unlock(&conn_state_lock);
++ return;
++ }
++
++ conn->ep = ep;
++ conn->remote_node_id = peer.node;
++ if (check_grh)
++ memcpy(&conn->remote_gid, &resp.gid, sizeof(conn->remote_gid));
++ conn->dev = dev;
++ atomic_set(&conn->refcnt, 0);
++
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_ESTABLISHED;
++ spin_unlock(&conn_state_lock);
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: connection established. ep=%p\n", __func__, ep);
++
++ ibscif_refresh_mreg(conn);
++
++ /* one additional sync to ensure the MRs are registered with the new ep on both sides */
++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK);
++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK);
++
++ list_add(&conn->entry, &dev->conn_list);
++ ibscif_refresh_pollep_list();
++
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_ACTIVE;
++ spin_unlock(&conn_state_lock);
++}
++
++struct ibscif_conn *ibscif_do_connect(struct ibscif_dev *dev, int remote_node_id)
++{
++ struct scif_portID dest;
++ struct ibscif_conn *conn = NULL;
++ int ret;
++ scif_epd_t ep;
++ struct ibscif_conn_resp resp;
++ union ib_gid peer_gid;
++ int resp_size;
++
++ if (check_grh)
++ resp_size = sizeof(resp);
++ else
++ resp_size = sizeof(int);
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: %d-->%d\n", __func__, dev->node_id, remote_node_id);
++
++ /* Validate remote_node_id for conn_state array check */
++ if ((remote_node_id < 0) || (remote_node_id >= IBSCIF_MAX_DEVICES))
++ return ERR_PTR(-EINVAL);
++
++ spin_lock(&conn_state_lock);
++ if (conn_state[dev->node_id][remote_node_id] != IBSCIF_CONN_IDLE) {
++ spin_unlock(&conn_state_lock);
++ if (verbose)
++ printk(KERN_INFO PFX "%s: connection already in progress, retry\n", __func__);
++ return ERR_PTR(-EAGAIN);
++ }
++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_REQ_SENT;
++ spin_unlock(&conn_state_lock);
++
++ ep = scif_open();
++ if (!ep) /* SCIF API semantics */
++ goto out_state;
++
++ if (IS_ERR(ep)) /* SCIF emulator semantics */
++ goto out_state;
++
++ dest.node = remote_node_id;
++ dest.port = SCIF_OFED_PORT_0;
++
++ ret = scif_connect(ep, &dest);
++ if (ret < 0)
++ goto out_close;
++
++ /* Now ret is the local port number that ep is bound to */
++
++ ret = scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_recv returns %d\n", __func__, ret);
++ goto out_close;
++ }
++
++ if (resp.cmd != IBSCIF_CONN_REP) {
++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK);
++ /* the peer has issued the connection request */
++ if (resp.cmd == IBSCIF_CONN_REJ) {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: rejected by peer due to double connection\n", __func__);
++ scif_close(ep);
++ /* don't reset the state because it's used for checking the connection state */
++ return ERR_PTR(-EAGAIN);
++ }
++ else {
++ if (verbose)
++ printk(KERN_INFO PFX "%s: rejected by peer due to invalid state\n", __func__);
++ goto out_close;
++ }
++ }
++
++ if (check_grh) {
++ memcpy(&peer_gid, &resp.gid, sizeof(peer_gid));
++ memcpy(&resp.gid, &dev->gid, sizeof(resp.gid));
++ ret = scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: scif_send returns %d\n", __func__, ret);
++ goto out_close;
++ }
++ }
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: connection established. ep=%p\n", __func__, ep);
++
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_ESTABLISHED;
++ spin_unlock(&conn_state_lock);
++
++ conn = kzalloc(sizeof *conn, GFP_KERNEL);
++ if (!conn) {
++ printk(KERN_ALERT PFX "%s: failed to allocate connection object\n", __func__);
++ goto out_close;
++ }
++
++ conn->ep = ep;
++ conn->remote_node_id = remote_node_id;
++ if (check_grh)
++ memcpy(&conn->remote_gid, &peer_gid, sizeof(conn->remote_gid));
++ conn->dev = dev;
++ atomic_set(&conn->refcnt, 0);
++
++ ibscif_refresh_mreg(conn);
++
++ /* one additional sync to ensure the MRs are registered with the new ep on both sides */
++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK);
++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK);
++
++ list_add_tail(&conn->entry, &dev->conn_list);
++ ibscif_refresh_pollep_list();
++
++ spin_lock(&conn_state_lock);
++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_ACTIVE;
++ spin_unlock(&conn_state_lock);
++
++ return conn;
++
++out_close:
++ scif_close(ep);
++
++out_state:
++ spin_lock(&conn_state_lock);
++ if (conn_state[dev->node_id][remote_node_id] == IBSCIF_CONN_REQ_SENT)
++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_IDLE;
++ spin_unlock(&conn_state_lock);
++ return conn;
++}
++
++struct ibscif_conn *ibscif_get_conn(int node_id, int remote_node_id, int find_local_peer)
++{
++ struct ibscif_dev *cur, *next, *dev = NULL;
++ struct ibscif_conn *conn, *conn1, *conn2;
++ int done=0, err=0, connect_tried=0;
++
++ down(&devlist_mutex);
++ list_for_each_entry_safe(cur, next, &devlist, entry) {
++ if (cur->node_id == node_id) {
++ dev = cur;
++ break;
++ }
++ }
++ up(&devlist_mutex);
++
++ if (!dev)
++ return NULL;
++
++again:
++ conn1 = NULL;
++ conn2 = NULL;
++ down(&dev->mutex);
++ list_for_each_entry(conn, &dev->conn_list, entry)
++ {
++ if (conn->remote_node_id == remote_node_id) {
++ if (node_id == remote_node_id) {
++ if (!conn1) {
++ conn1 = conn;
++ continue;
++ }
++ else {
++ conn2 = conn;
++ break;
++ }
++ }
++ up(&dev->mutex);
++ atomic_inc(&conn->refcnt);
++ if (conn->local_close) {
++ conn->local_close = 0;
++ ibscif_send_reopen(conn);
++ }
++ return conn;
++ }
++ }
++ up(&dev->mutex);
++
++ /* for loopback connections, we must wait for both endpoints to be in the list to ensure that
++ * different endpoints are assigned to the two sides
++ */
++ if (node_id == remote_node_id) {
++ if (conn1 && conn2) {
++ conn = find_local_peer ? conn2 : conn1;
++ atomic_inc(&conn->refcnt);
++ if (conn->local_close) {
++ conn->local_close = 0;
++ ibscif_send_reopen(conn);
++ }
++ return conn;
++ }
++ else if (conn1) {
++ schedule();
++ goto again;
++ }
++ }
++
++ if (connect_tried) {
++ printk(KERN_ALERT PFX "%s: ERROR: cannot get connection (%d-->%d) after waiting, state=%d\n",
++ __func__, dev->node_id, remote_node_id, err-1);
++ return NULL;
++ }
++
++ conn = ibscif_do_connect(dev, remote_node_id);
++
++ /* If a connection is in progress, wait for its finish */
++ if (conn == ERR_PTR(-EAGAIN)) {
++ while (!done && !err) {
++ spin_lock(&conn_state_lock);
++ switch (conn_state[node_id][remote_node_id]) {
++ case IBSCIF_CONN_REQ_SENT:
++ case IBSCIF_CONN_REQ_RCVD:
++ case IBSCIF_CONN_ESTABLISHED:
++ break;
++ case IBSCIF_CONN_ACTIVE:
++ done = 1;
++ break;
++ default:
++ err = 1 + conn_state[node_id][remote_node_id];
++ break;
++ }
++ spin_unlock(&conn_state_lock);
++ schedule();
++ }
++ }
++
++ connect_tried = 1;
++ goto again;
++}
++
++void ibscif_put_conn(struct ibscif_conn *conn)
++{
++ if (!conn)
++ return;
++
++ if (atomic_dec_and_test(&conn->refcnt)) {
++ // printk(KERN_INFO PFX "%s: local_close, conn=%p, remote_close=%d\n", __func__, conn, conn->remote_close);
++ ibscif_send_close(conn);
++ conn->local_close = 1;
++ }
++}
++
++void ibscif_get_pollep_list(struct scif_pollepd *polleps,
++ struct ibscif_dev **devs, int *types, struct ibscif_conn **conns, int *count)
++{
++ struct ibscif_dev *dev;
++ struct ibscif_conn *conn;
++ int i = 0;
++ int max = *count;
++
++ down(&devlist_mutex);
++ list_for_each_entry(dev, &devlist, entry) {
++ if (i >= max)
++ break;
++
++ polleps[i].epd = dev->listen_ep;
++ polleps[i].events = POLLIN;
++ polleps[i].revents = 0;
++ devs[i] = dev;
++ types[i] = IBSCIF_EP_TYPE_LISTEN;
++ conns[i] = NULL;
++ i++;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: ep=%p (%d:listen)\n", __func__, dev->listen_ep, dev->node_id);
++
++ down(&dev->mutex);
++ list_for_each_entry(conn, &dev->conn_list, entry)
++ {
++ if (i >= max)
++ break;
++ polleps[i].epd = conn->ep;
++ polleps[i].events = POLLIN;
++ polleps[i].revents = 0;
++ devs[i] = dev;
++ types[i] = IBSCIF_EP_TYPE_COMM;
++ conns[i] = conn;
++ i++;
++ if (verbose)
++ printk(KERN_INFO PFX "%s: ep=%p (%d<--->%d)\n", __func__, conn->ep, dev->node_id, conn->remote_node_id);
++ }
++ up(&dev->mutex);
++ }
++ up(&devlist_mutex);
++
++ if (verbose)
++ printk(KERN_INFO PFX "%s: count=%d\n", __func__, i);
++ *count = i;
++}
++
++void ibscif_get_ep_list(scif_epd_t *eps, int *count)
++{
++ struct ibscif_dev *dev;
++ struct ibscif_conn *conn;
++ int i = 0;
++ int max = *count;
++
++ down(&devlist_mutex);
++ list_for_each_entry(dev, &devlist, entry) {
++ if (i >= max)
++ break;
++
++ down(&dev->mutex);
++ list_for_each_entry(conn, &dev->conn_list, entry)
++ {
++ if (i >= max)
++ break;
++ eps[i] = conn->ep;
++ i++;
++ }
++ up(&dev->mutex);
++ }
++ up(&devlist_mutex);
++
++ *count = i;
++}
++
++void ibscif_remove_ep(struct ibscif_dev *dev, scif_epd_t ep)
++{
++ struct ibscif_conn *conn, *next;
++ down(&dev->mutex);
++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry)
++ {
++ if (conn->ep == ep) {
++ spin_lock(&conn_state_lock);
++ conn_state[conn->dev->node_id][conn->remote_node_id] = IBSCIF_CONN_IDLE;
++ spin_unlock(&conn_state_lock);
++ list_del(&conn->entry);
++ }
++ }
++ up(&dev->mutex);
++}
++
++
++void ibscif_free_conn(struct ibscif_conn *conn)
++{
++ scif_close(conn->ep);
++ kfree(conn);
++}
++
++int ibscif_cleanup_idle_conn(void)
++{
++ struct ibscif_dev *dev;
++ struct ibscif_conn *conn, *next;
++ struct ibscif_conn *idle_conns[IBSCIF_MAX_DEVICES];
++ int i, n=0;
++
++ down(&devlist_mutex);
++ list_for_each_entry(dev, &devlist, entry) {
++ down(&dev->mutex);
++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry)
++ {
++ if (conn->local_close && conn->remote_close) {
++ spin_lock(&conn_state_lock);
++ conn_state[conn->dev->node_id][conn->remote_node_id] = IBSCIF_CONN_IDLE;
++ spin_unlock(&conn_state_lock);
++ list_del(&conn->entry);
++ idle_conns[n++] = conn;
++ }
++ }
++ up(&dev->mutex);
++ }
++ up(&devlist_mutex);
++
++ for (i=0; i<n; i++)
++ ibscif_free_conn(idle_conns[i]);
++
++ if (n && verbose)
++ printk(KERN_ALERT PFX "%s: n=%d\n", __func__, n);
++
++ return n;
++}
++
++/*
++ * Simple routines to support performance profiling
++ */
++
++#include <linux/time.h>
++
++static uint32_t ibscif_time_passed(void)
++{
++ static int first = 1;
++ static struct timeval t0;
++ static struct timeval t;
++ uint32_t usec;
++
++ if (first) {
++ do_gettimeofday(&t0);
++ first = 0;
++ return 0;
++ }
++
++ do_gettimeofday(&t);
++ usec = (t.tv_sec - t0.tv_sec) * 1000000UL;
++ if (t.tv_usec >= t0.tv_usec)
++ usec += (t.tv_usec - t0.tv_usec);
++ else
++ usec -= (t0.tv_usec - t.tv_usec);
++
++ t0 = t;
++ return usec;
++}
++
++#define IBSCIF_PERF_MAX_SAMPLES 100
++#define IBSCIF_PERF_MAX_COUNTERS 10
++
++void ibscif_perf_sample(int counter, int next)
++{
++ static uint32_t T[IBSCIF_PERF_MAX_SAMPLES][IBSCIF_PERF_MAX_COUNTERS];
++ static int T_idx=0;
++ int i, j, sum;
++
++ if (counter>=0 && counter<IBSCIF_PERF_MAX_COUNTERS)
++ T[T_idx][counter] = ibscif_time_passed();
++
++ if (next) {
++ if (++T_idx < IBSCIF_PERF_MAX_SAMPLES)
++ return;
++
++ T_idx = 0;
++
++ /* batch output to minimize the impact on higher level timing */
++ for (i=0; i<IBSCIF_PERF_MAX_SAMPLES; i++) {
++ sum = 0;
++ printk(KERN_INFO PFX "%d: ", i);
++ for (j=0; j<IBSCIF_PERF_MAX_COUNTERS; j++) {
++ printk("T%d=%u ", j, T[i][j]);
++ if (j>0)
++ sum += T[i][j];
++ }
++ printk("SUM(T1..T%d)=%u\n", IBSCIF_PERF_MAX_COUNTERS-1, sum);
++ }
++ }
++}
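++
++/*
++ * Illustrative usage (not part of this file): a caller can bracket the
++ * stages of a code path with the counters above; T0 acts as the reference
++ * point and the sample row advances when next is non-zero.
++ * ibscif_xmit_stage1() and ibscif_xmit_stage2() are hypothetical names,
++ * and the snippet is not compiled.
++ */
++#if 0
++static void ibscif_xmit_profiled(struct ibscif_conn *conn)
++{
++	ibscif_perf_sample(0, 0);	/* T0: reference point */
++	ibscif_xmit_stage1(conn);	/* hypothetical stage */
++	ibscif_perf_sample(1, 0);	/* T1: time spent in stage 1 */
++	ibscif_xmit_stage2(conn);	/* hypothetical stage */
++	ibscif_perf_sample(2, 1);	/* T2: stage 2, advance to next sample */
++}
++#endif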
++
+diff -urN a7/drivers/infiniband/hw/scif/Kconfig a8/drivers/infiniband/hw/scif/Kconfig
+--- a7/drivers/infiniband/hw/scif/Kconfig 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/Kconfig 2015-02-23 10:14:37.489809663 -0800
+@@ -0,0 +1,4 @@
++config INFINIBAND_SCIF
++ tristate "SCIF RDMA driver support"
++ ---help---
++ RDMA over SCIF driver.
+diff -urN a7/drivers/infiniband/hw/scif/Makefile a8/drivers/infiniband/hw/scif/Makefile
+--- a7/drivers/infiniband/hw/scif/Makefile 1969-12-31 16:00:00.000000000 -0800
++++ a8/drivers/infiniband/hw/scif/Makefile 2015-02-23 10:14:37.489809663 -0800
+@@ -0,0 +1,41 @@
++ifneq ($(KERNELRELEASE),)
++
++# Original Make begins
++
++obj-$(CONFIG_INFINIBAND_SCIF) += ibscif.o
++
++ibscif-y := ibscif_main.o \
++ ibscif_ah.o \
++ ibscif_pd.o \
++ ibscif_cq.o \
++ ibscif_qp.o \
++ ibscif_mr.o \
++ ibscif_cm.o \
++ ibscif_post.o \
++ ibscif_procfs.o \
++ ibscif_loopback.o \
++ ibscif_provider.o \
++ ibscif_protocol.o \
++ ibscif_scheduler.o \
++ ibscif_util.o
++
++# Original Makefile ends
++
++else
++
++ifeq ($(KVER),)
++ ifeq ($(KDIR),)
++ KDIR := /lib/modules/$(shell uname -r)/build
++ endif
++else
++ KDIR := /lib/modules/$(KVER)/build
++endif
++
++all:
++ $(MAKE) -C $(KDIR) SUBDIRS=$(shell pwd) CONFIG_INFINIBAND_SCIF=m
++
++clean:
++ rm -rf *.o *.ko *.mod.c .*.cmd Module.* .tmp_versions
++
++endif
++
--- /dev/null
+From 4f27d323bd47563f40a663672a331c5b2c95138e Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Tue, 4 Feb 2014 12:25:45 -0800
+Subject: [PATCH 09/12] update drivers/infiniband's Kconfig and Makefile to
+ allow compilation of CCL-Direct (ibp)
+
+Signed-off-by: Phil Cayton <phil.cayton@intel.com>
+---
+diff -urN a8/drivers/infiniband/hw/Makefile a9/drivers/infiniband/hw/Makefile
+--- a8/drivers/infiniband/hw/Makefile 2015-01-05 15:04:13.993463721 -0800
++++ a9/drivers/infiniband/hw/Makefile 2015-01-05 15:09:10.056451249 -0800
+@@ -10,3 +10,4 @@
+ obj-$(CONFIG_INFINIBAND_NES) += nes/
+ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/
+ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/
++obj-$(CONFIG_INFINIBAND_SCIF) += scif/
+diff -urN a8/drivers/infiniband/Kconfig a9/drivers/infiniband/Kconfig
+--- a8/drivers/infiniband/Kconfig 2015-01-05 15:04:14.001463720 -0800
++++ a9/drivers/infiniband/Kconfig 2015-01-05 15:07:03.176456594 -0800
+@@ -55,6 +55,9 @@
+ source "drivers/infiniband/hw/nes/Kconfig"
+ source "drivers/infiniband/hw/ocrdma/Kconfig"
+ source "drivers/infiniband/hw/usnic/Kconfig"
++source "drivers/infiniband/hw/scif/Kconfig"
++
++source "drivers/infiniband/ibp/Kconfig"
+
+ source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+diff -urN a8/drivers/infiniband/Makefile a9/drivers/infiniband/Makefile
+--- a8/drivers/infiniband/Makefile 2015-01-05 15:04:14.001463720 -0800
++++ a9/drivers/infiniband/Makefile 2015-01-05 15:08:25.112453143 -0800
+@@ -1,3 +1,4 @@
+ obj-$(CONFIG_INFINIBAND) += core/
+ obj-$(CONFIG_INFINIBAND) += hw/
+ obj-$(CONFIG_INFINIBAND) += ulp/
++obj-$(CONFIG_IBP_SERVER) += ibp/
--- /dev/null
+IB/qib: Update qib for XEON PHI support
+
+From: Jubin John <jubin.john@intel.com>
+
+Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Jubin John <jubin.john@intel.com>
+---
+diff -urN a9/drivers/infiniband/hw/qib/Makefile a10/drivers/infiniband/hw/qib/Makefile
+--- a9/drivers/infiniband/hw/qib/Makefile 2015-01-05 15:05:04.280461602 -0800
++++ a10/drivers/infiniband/hw/qib/Makefile 2015-01-05 15:10:58.250446692 -0800
+@@ -14,3 +14,8 @@
+ ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o
+ ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o
+ ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o
++
++ifeq ($(CONFIG_INFINIBAND_SCIF),m)
++ib_qib-y += qib_knx.o
++ccflags-y += -DQIB_CONFIG_KNX
++endif
+diff -urN a9/drivers/infiniband/hw/qib/qib_common.h a10/drivers/infiniband/hw/qib/qib_common.h
+--- a9/drivers/infiniband/hw/qib/qib_common.h 2015-01-05 15:05:04.281461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib_common.h 2015-01-05 15:10:58.250446692 -0800
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+@@ -337,8 +338,12 @@
+ * Should be set to QIB_USER_SWVERSION.
+ */
+ __u32 spu_userversion;
+-
++#ifdef QIB_CONFIG_KNX
++ __u16 spu_knx_node_id;
++ __u16 _spu_unused2;
++#else
+ __u32 _spu_unused2;
++#endif
+
+ /* size of struct base_info to write to */
+ __u32 spu_base_info_size;
+diff -urN a9/drivers/infiniband/hw/qib/qib_file_ops.c a10/drivers/infiniband/hw/qib/qib_file_ops.c
+--- a9/drivers/infiniband/hw/qib/qib_file_ops.c 2015-01-05 15:05:04.280461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib_file_ops.c 2015-01-05 15:10:58.251446692 -0800
+@@ -48,6 +48,7 @@
+ #include "qib.h"
+ #include "qib_common.h"
+ #include "qib_user_sdma.h"
++#include "qib_knx.h"
+
+ #undef pr_fmt
+ #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+@@ -59,6 +60,9 @@
+ unsigned long, loff_t);
+ static unsigned int qib_poll(struct file *, struct poll_table_struct *);
+ static int qib_mmapf(struct file *, struct vm_area_struct *);
++static int subctxt_search_ctxts(struct qib_devdata *, struct file *,
++ const struct qib_user_info *);
++
+
+ static const struct file_operations qib_file_ops = {
+ .owner = THIS_MODULE,
+@@ -89,6 +93,64 @@
+ return paddr;
+ }
+
++#ifdef QIB_CONFIG_KNX
++/*
++ * Fills in only a few of the fields in the qib_base_info structure so the
++ * module on the KNX side can allocate all necessary memory locally.
++ */
++static int qib_get_early_base_info(struct file *fp, void __user *ubase,
++ size_t ubase_size) {
++ struct qib_ctxtdata *rcd = ctxt_fp(fp);
++ int ret = 0;
++ struct qib_devdata *dd = rcd->dd;
++ struct qib_base_info *kinfo = NULL;
++ size_t sz;
++ int local_node = (numa_node_id() == pcibus_to_node(dd->pcidev->bus));
++
++ sz = sizeof(*kinfo);
++ if (!rcd->subctxt_cnt)
++ sz -= 7 * sizeof(u64);
++ if (ubase_size < sz) {
++ ret = -EINVAL;
++ goto bail;
++ }
++
++ kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
++ if (kinfo == NULL) {
++ ret = -ENOMEM;
++ goto bail;
++ }
++
++ ret = dd->f_get_base_info(rcd, kinfo);
++ if (ret < 0)
++ goto bail_free;
++
++ if (rcd->subctxt_cnt && !subctxt_fp(fp))
++ kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER;
++
++ kinfo->spi_unit = dd->unit;
++ kinfo->spi_port = rcd->ppd->port;
++ kinfo->spi_ctxt = rcd->ctxt;
++ kinfo->spi_subctxt = subctxt_fp(fp);
++ kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt;
++ kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize;
++ kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize;
++ kinfo->spi_rcv_egrbuftotlen =
++ rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
++ kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk;
++ kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
++ rcd->rcvegrbuf_chunks;
++
++ sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
++ if (copy_to_user(ubase, kinfo, sz))
++ ret = -EFAULT;
++bail_free:
++ kfree(kinfo);
++bail:
++ return ret;
++}
++#endif
++
+ static int qib_get_base_info(struct file *fp, void __user *ubase,
+ size_t ubase_size)
+ {
+@@ -177,14 +239,43 @@
+ */
+ kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys;
+ kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys;
++ /*
++ * In the case of KNX, qib_do_user_init() would call into the
++ * KNX-specific memory allocation/registration functions. These
++ * functions will write the registered memory offsets in the
++ * qib_base_info structure. Those are the addresses that need to be
++	 * handed to user level.
++ */
++ kinfo->spi_uregbase = knx_node_fp(fp) ?
++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_UREG, fp) :
++ (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
++
++ if (knx_node_fp(fp))
++ kinfo->spi_runtime_flags =
++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_FLAGS, fp);
+ kinfo->spi_rhf_offset = dd->rhf_offset;
+ kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys;
+- kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
++
++ /* see comment for spi_uregbase above */
++ if (knx_node_fp(fp))
++ kinfo->spi_pioavailaddr =
++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_PIOAVAIL, fp);
++ else
++ kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
++
+ /* setup per-unit (not port) status area for user programs */
+- kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+- (char *) ppd->statusp -
+- (char *) dd->pioavailregs_dma;
+- kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
++ kinfo->spi_status = (knx_node_fp(fp) ?
++ qib_knx_ctxt_info(
++ rcd, QIB_KNX_CTXTINFO_STATUS, fp) :
++ (u64) dd->pioavailregs_phys) +
++ (char *) ppd->statusp - (char *) dd->pioavailregs_dma;
++
++ /*
++ * Do not set spi_piobufbase to KNX offset here as it is used in
++ * PIO index calculations below. For KNX contexts, the value of
++ * spi_piobufbase is not the physical address but the offset of
++ * the registered memory.
++ */
+ if (!shared) {
+ kinfo->spi_piocnt = rcd->piocnt;
+ kinfo->spi_piobufbase = (u64) rcd->piobufs;
+@@ -204,7 +295,11 @@
+ dd->palign * kinfo->spi_piocnt * slave;
+ }
+
+- if (shared) {
++ /*
++	 * In the case of KNX contexts, shared context memory is set up and
++	 * handled on the KNX.
++ */
++ if (shared && !knx_node_fp(fp)) {
+ kinfo->spi_sendbuf_status =
+ cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]);
+ /* only spi_subctxt_* fields should be set in this block! */
+@@ -225,6 +320,11 @@
+ kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) /
+ dd->palign;
+ kinfo->spi_pioalign = dd->palign;
++ /* Update spi_piobufbase after all calculations are done. */
++ if (knx_node_fp(fp))
++ kinfo->spi_piobufbase =
++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_PIOBUFBASE, fp);
++
+ kinfo->spi_qpair = QIB_KD_QP;
+ /*
+ * user mode PIO buffers are always 2KB, even when 4KB can
+@@ -1261,6 +1361,17 @@
+ goto bail;
+ }
+
++#ifdef QIB_CONFIG_KNX
++ if (uinfo->spu_knx_node_id)
++ /*
++ * When setting up a context for a KNX process, setup of
++		 * the subcontext memory is done on the KNX side and
++ * mapped into user level. Therefore, the host driver never
++ * has to worry about it unless we are setting up a context
++ * on the host.
++ */
++ goto no_subctxt_mem;
++#endif
+ rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts);
+ if (!rcd->subctxt_uregbase) {
+ ret = -ENOMEM;
+@@ -1283,6 +1394,9 @@
+ goto bail_rhdr;
+ }
+
++#ifdef QIB_CONFIG_KNX
++no_subctxt_mem:
++#endif
+ rcd->subctxt_cnt = uinfo->spu_subctxt_cnt;
+ rcd->subctxt_id = uinfo->spu_subctxt_id;
+ rcd->active_slaves = 1;
+@@ -1317,6 +1431,14 @@
+
+ rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
+
++#ifdef QIB_CONFIG_KNX
++ if (uinfo->spu_knx_node_id)
++ /*
++ * Skip allocation of page pointer list for TID
++ * receives. This will be done on the KNX.
++ */
++ goto no_page_list;
++#endif
+ /*
+ * Allocate memory for use in qib_tid_update() at open to
+ * reduce cost of expected send setup per message segment
+@@ -1332,7 +1454,11 @@
+ ret = -ENOMEM;
+ goto bailerr;
+ }
++#ifdef QIB_CONFIG_KNX
++no_page_list:
++#endif
+ rcd->userversion = uinfo->spu_userversion;
++
+ ret = init_subctxts(dd, rcd, uinfo);
+ if (ret)
+ goto bailerr;
+@@ -1489,43 +1615,68 @@
+ static int find_shared_ctxt(struct file *fp,
+ const struct qib_user_info *uinfo)
+ {
+- int devmax, ndev, i;
++ int devmax, ndev;
+ int ret = 0;
++ struct qib_devdata *dd;
+
++#ifdef QIB_CONFIG_KNX
++ /*
++	 * In the case where we are allocating a context for a KNX process,
++	 * don't loop over all devices but use the one associated with the
++ * requesting KNX.
++ */
++ if (uinfo->spu_knx_node_id) {
++ dd = qib_knx_node_to_dd(uinfo->spu_knx_node_id);
++ if (dd && dd->num_knx)
++ ret = subctxt_search_ctxts(dd, fp, uinfo);
++ goto done;
++ }
++#endif
+ devmax = qib_count_units(NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+- struct qib_devdata *dd = qib_lookup(ndev);
+-
++ dd = qib_lookup(ndev);
+ /* device portion of usable() */
+ if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase))
+ continue;
+- for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
+- struct qib_ctxtdata *rcd = dd->rcd[i];
++ ret = subctxt_search_ctxts(dd, fp, uinfo);
++ if (ret)
++ break;
++ }
++#ifdef QIB_CONFIG_KNX
++done:
++#endif
++ return ret;
++}
+
+- /* Skip ctxts which are not yet open */
+- if (!rcd || !rcd->cnt)
+- continue;
+- /* Skip ctxt if it doesn't match the requested one */
+- if (rcd->subctxt_id != uinfo->spu_subctxt_id)
+- continue;
+- /* Verify the sharing process matches the master */
+- if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
+- rcd->userversion != uinfo->spu_userversion ||
+- rcd->cnt >= rcd->subctxt_cnt) {
+- ret = -EINVAL;
+- goto done;
+- }
+- ctxt_fp(fp) = rcd;
+- subctxt_fp(fp) = rcd->cnt++;
+- rcd->subpid[subctxt_fp(fp)] = current->pid;
+- tidcursor_fp(fp) = 0;
+- rcd->active_slaves |= 1 << subctxt_fp(fp);
+- ret = 1;
++static int subctxt_search_ctxts(struct qib_devdata *dd, struct file *fp,
++ const struct qib_user_info *uinfo)
++{
++ int ret = 0, i;
++ for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
++ struct qib_ctxtdata *rcd = dd->rcd[i];
++
++ /* Skip ctxts which are not yet open */
++ if (!rcd || !rcd->cnt)
++ continue;
++ /* Skip ctxt if it doesn't match the requested one */
++ if (rcd->subctxt_id != uinfo->spu_subctxt_id)
++ continue;
++ /* Verify the sharing process matches the master */
++ if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
++ rcd->userversion != uinfo->spu_userversion ||
++ rcd->cnt >= rcd->subctxt_cnt) {
++ ret = -EINVAL;
+ goto done;
+ }
++ ctxt_fp(fp) = rcd;
++ subctxt_fp(fp) = rcd->cnt++;
++ rcd->subpid[subctxt_fp(fp)] = current->pid;
++ tidcursor_fp(fp) = 0;
++ rcd->active_slaves |= 1 << subctxt_fp(fp);
++ ret = 1;
++ break;
+ }
+-
+ done:
+ return ret;
+ }
+@@ -1617,6 +1768,13 @@
+
+ if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT)
+ alg = uinfo->spu_port_alg;
++#ifdef QIB_CONFIG_KNX
++ /* Make sure we have a connection to the KNX module on the right node */
++ if (uinfo->spu_knx_node_id && !qib_knx_get(uinfo->spu_knx_node_id)) {
++ ret = -ENODEV;
++ goto done;
++ }
++#endif
+
+ mutex_lock(&qib_mutex);
+
+@@ -1624,13 +1782,38 @@
+ uinfo->spu_subctxt_cnt) {
+ ret = find_shared_ctxt(fp, uinfo);
+ if (ret > 0) {
+- ret = do_qib_user_sdma_queue_create(fp);
++#ifdef QIB_CONFIG_KNX
++ if (uinfo->spu_knx_node_id) {
++ ret = qib_knx_sdma_queue_create(fp);
++ } else
++#endif
++ ret = do_qib_user_sdma_queue_create(fp);
+ if (!ret)
+ assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd);
+ goto done_ok;
+ }
+ }
+
++#ifdef QIB_CONFIG_KNX
++ /*
++ * If there is a KNX node set, we pick the device that is
++	 * associated with that KNX node
++ */
++ if (uinfo->spu_knx_node_id) {
++ struct qib_devdata *dd =
++ qib_knx_node_to_dd(uinfo->spu_knx_node_id);
++ if (dd) {
++ ret = find_free_ctxt(dd->unit, fp, uinfo);
++ if (!ret)
++ ret = qib_knx_alloc_ctxt(
++ uinfo->spu_knx_node_id,
++ ctxt_fp(fp)->ctxt);
++ } else
++ ret = -ENXIO;
++ goto done_chk_sdma;
++ }
++
++#endif
+ i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE;
+ if (i_minor)
+ ret = find_free_ctxt(i_minor - 1, fp, uinfo);
+@@ -1639,7 +1822,6 @@
+ const unsigned int cpu = cpumask_first(¤t->cpus_allowed);
+ const unsigned int weight =
+ cpumask_weight(¤t->cpus_allowed);
+-
+ if (weight == 1 && !test_bit(cpu, qib_cpulist))
+ if (!find_hca(cpu, &unit) && unit >= 0)
+ if (!find_free_ctxt(unit, fp, uinfo)) {
+@@ -1650,9 +1832,21 @@
+ }
+
+ done_chk_sdma:
+- if (!ret)
++ if (!ret) {
++#ifdef QIB_CONFIG_KNX
++ if (uinfo->spu_knx_node_id) {
++ ret = qib_knx_sdma_queue_create(fp);
++ /*if (!ret)
++ ret = qib_knx_setup_tidrcv(fp);*/
++ goto done_ok;
++ }
++#endif
+ ret = do_qib_user_sdma_queue_create(fp);
++ }
+ done_ok:
++#ifdef QIB_CONFIG_KNX
++ knx_node_fp(fp) = uinfo->spu_knx_node_id;
++#endif
+ mutex_unlock(&qib_mutex);
+
+ done:
+@@ -1667,11 +1861,25 @@
+ struct qib_ctxtdata *rcd = ctxt_fp(fp);
+ struct qib_devdata *dd;
+ unsigned uctxt;
++#ifdef QIB_CONFIG_KNX
++ struct qib_base_info *base_info = NULL;
++ void __user *ubase = (void __user *)(unsigned long)
++ uinfo->spu_base_info;
++#endif
+
+ /* Subctxts don't need to initialize anything since master did it. */
+ if (subctxt_fp(fp)) {
+ ret = wait_event_interruptible(rcd->wait,
+ !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag));
++#ifdef QIB_CONFIG_KNX
++ /*
++ * Subctxt pio buffers need to be registered after the
++ * master has set everything up.
++ */
++ if (uinfo->spu_knx_node_id)
++ ret = qib_knx_setup_piobufs(rcd->dd, rcd,
++ subctxt_fp(fp));
++#endif
+ goto bail;
+ }
+
+@@ -1722,6 +1930,41 @@
+ */
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+
++#ifdef QIB_CONFIG_KNX
++ if (uinfo->spu_knx_node_id) {
++ /*
++ * When setting up rcvhdr Q and eager buffers for a KNX, the
++ * memory comes from the KNX side encoded in the qib_base_info
++ * structure.
++ */
++ if (uinfo->spu_base_info_size < (sizeof(*base_info) -
++ 7 * sizeof(u64))) {
++ ret = -EINVAL;
++ goto bail_pio;
++ }
++ base_info = kzalloc(sizeof(*base_info), GFP_KERNEL);
++ if (!base_info) {
++ ret = -ENOMEM;
++ goto bail_pio;
++ }
++ if (copy_from_user(base_info, ubase,
++ uinfo->spu_base_info_size)) {
++ ret = -EFAULT;
++ goto bail_pio;
++ }
++ ret = qib_knx_setup_piobufs(dd, rcd, subctxt_fp(fp));
++ if (ret)
++ goto cont_init;
++ ret = qib_knx_setup_pioregs(dd, rcd, base_info);
++ if (ret)
++ goto cont_init;
++ ret = qib_knx_create_rcvhdrq(dd, rcd, base_info);
++ if (ret)
++ goto cont_init;
++ ret = qib_knx_setup_eagerbufs(rcd, base_info);
++ goto cont_init;
++ }
++#endif /* QIB_CONFIG_KNX */
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+ * array for time being. If rcd->ctxt > chip-supported,
+@@ -1731,6 +1974,9 @@
+ ret = qib_create_rcvhdrq(dd, rcd);
+ if (!ret)
+ ret = qib_setup_eagerbufs(rcd);
++#ifdef QIB_CONFIG_KNX
++cont_init:
++#endif
+ if (ret)
+ goto bail_pio;
+
+@@ -1828,6 +2074,13 @@
+
+ /* drain user sdma queue */
+ if (fd->pq) {
++#ifdef QIB_CONFIG_KNX
++ /*
++ * The thread should be stopped first before attempting
++		 * The thread should be stopped before attempting
++ */
++ qib_knx_sdma_queue_destroy(fd);
++#endif
+ qib_user_sdma_queue_drain(rcd->ppd, fd->pq);
+ qib_user_sdma_queue_destroy(fd->pq);
+ }
+@@ -1885,6 +2138,12 @@
+ }
+
+ mutex_unlock(&qib_mutex);
++#ifdef QIB_CONFIG_KNX
++ if (fd->knx_node_id) {
++ qib_knx_free_ctxtdata(dd, rcd);
++ goto bail;
++ }
++#endif
+ qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */
+
+ bail:
+@@ -2170,6 +2429,13 @@
+ ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
++#ifdef QIB_CONFIG_KNX
++ if (cmd.cmd.user_info.spu_knx_node_id)
++ ret = qib_get_early_base_info(
++ fp, (void __user *) (unsigned long)
++ cmd.cmd.user_info.spu_base_info,
++ cmd.cmd.user_info.spu_base_info_size);
++#endif
+ break;
+
+ case QIB_CMD_USER_INIT:
+diff -urN a9/drivers/infiniband/hw/qib/qib.h a10/drivers/infiniband/hw/qib/qib.h
+--- a9/drivers/infiniband/hw/qib/qib.h 2015-01-05 15:05:04.280461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib.h 2015-01-05 15:10:58.250446692 -0800
+@@ -234,6 +234,10 @@
+ u32 lookaside_qpn;
+ /* QPs waiting for context processing */
+ struct list_head qp_wait_list;
++#ifdef QIB_CONFIG_KNX
++ /* KNX Receive Context Data */
++ struct qib_knx_ctxt *krcd;
++#endif
+ #ifdef CONFIG_DEBUG_FS
+ /* verbs stats per CTX */
+ struct qib_opcode_stats_perctx *opstats;
+@@ -1106,6 +1110,11 @@
+ struct kthread_worker *worker;
+
+ int assigned_node_id; /* NUMA node closest to HCA */
++
++#ifdef QIB_CONFIG_KNX
++ /* number of KNx nodes using this device */
++ u16 num_knx;
++#endif
+ };
+
+ /* hol_state values */
+@@ -1134,6 +1143,9 @@
+ unsigned tidcursor;
+ struct qib_user_sdma_queue *pq;
+ int rec_cpu_num; /* for cpu affinity; -1 if none */
++#ifdef QIB_CONFIG_KNX
++ u16 knx_node_id;
++#endif
+ };
+
+ extern struct list_head qib_dev_list;
+@@ -1211,6 +1223,13 @@
+ (((struct qib_filedata *)(fp)->private_data)->tidcursor)
+ #define user_sdma_queue_fp(fp) \
+ (((struct qib_filedata *)(fp)->private_data)->pq)
++#ifdef QIB_CONFIG_KNX
++#define knx_node_fp(fp) \
++ (((struct qib_filedata *)(fp)->private_data)->knx_node_id)
++#else
++/* allow the use of knx_node_fp() outside of a #ifdef QIB_CONFIG_KNX */
++#define knx_node_fp(fp) 0
++#endif
+
+ static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd)
+ {
+diff -urN a9/drivers/infiniband/hw/qib/qib_init.c a10/drivers/infiniband/hw/qib/qib_init.c
+--- a9/drivers/infiniband/hw/qib/qib_init.c 2015-01-05 15:05:04.279461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib_init.c 2015-01-05 15:10:58.251446692 -0800
+@@ -51,6 +51,10 @@
+ #include "qib_verbs.h"
+ #endif
+
++#ifdef QIB_CONFIG_KNX
++#include "qib_knx.h"
++#endif
++
+ #undef pr_fmt
+ #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+@@ -1301,6 +1305,12 @@
+ /* not fatal if it doesn't work */
+ if (qib_init_qibfs())
+ pr_err("Unable to register ipathfs\n");
++
++#ifdef QIB_CONFIG_KNX
++ ret = qib_knx_server_init();
++ if (ret < 0)
++ pr_err(": Unable to start KNX listen thread\n");
++#endif
+ goto bail; /* all OK */
+
+ bail_dev:
+@@ -1325,6 +1335,9 @@
+ {
+ int ret;
+
++#ifdef QIB_CONFIG_KNX
++ qib_knx_server_exit();
++#endif
+ ret = qib_exit_qibfs();
+ if (ret)
+ pr_err(
+@@ -1568,6 +1581,9 @@
+ /* unregister from IB core */
+ qib_unregister_ib_device(dd);
+
++#ifdef QIB_CONFIG_KNX
++ qib_knx_remove_device(dd);
++#endif
+ /*
+ * Disable the IB link, disable interrupts on the device,
+ * clear dma engines, etc.
+diff -urN a9/drivers/infiniband/hw/qib/qib_knx.c a10/drivers/infiniband/hw/qib/qib_knx.c
+--- a9/drivers/infiniband/hw/qib/qib_knx.c 1969-12-31 16:00:00.000000000 -0800
++++ a10/drivers/infiniband/hw/qib/qib_knx.c 2015-01-05 15:10:58.252446692 -0800
+@@ -0,0 +1,1532 @@
++/*
++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include <linux/kernel.h>
++#include <linux/dma-mapping.h>
++#include <linux/scatterlist.h>
++#include <modules/scif.h>
++
++#include "qib.h"
++#include "qib_knx.h"
++#include "qib_user_sdma.h"
++#include "qib_knx_common.h"
++
++unsigned int qib_knx_nconns = 5;
++module_param_named(num_conns, qib_knx_nconns, uint, S_IRUGO);
++MODULE_PARM_DESC(num_conns, "Max number of pending connections");
++
++#define QIB_KNX_SCIF_PORT SCIF_OFED_PORT_9
++#define CLIENT_THREAD_NAME(x) "qib/mic" __stringify(x)
++
++#define knx_sdma_next(sdma) \
++ (sdma->head = ((sdma->head + 1) % sdma->desc_num))
++#define per_ctxt(ctxt, sub) ((ctxt * QLOGIC_IB_MAX_SUBCTXT) + sub)
++#define QIB_KNX_SDMA_STATUS(sdma, st) \
++ QIB_KNX_SDMA_SET(sdma->mflags->status, ((u64)st << 32) | 1)
++
++struct qib_knx_server {
++ struct task_struct *kthread;
++ struct scif_pollepd epd;
++ spinlock_t client_lock;
++ struct list_head clients;
++ unsigned int nclients;
++};
++
++struct qib_knx_rma {
++ /* SCIF registered offset */
++ off_t offset;
++ /* size of mapped memory (in bytes) */
++ size_t size;
++ /* kernel virtual address of ioremap'ed memory */
++ void *kvaddr;
++};
++
++struct qib_knx_mem_map {
++ /* physical address is DMA range */
++ dma_addr_t dma_mapped_addr;
++ /* DMA direction */
++ enum dma_data_direction dir;
++ /* size of remote memory area */
++ size_t size;
++ /* SCIF array of physical pages */
++ struct scif_range *pages;
++};
++
++struct qib_knx_mem_map_sg {
++ /* list of pages to map */
++ struct scatterlist *sglist;
++ /* DMA direction */
++ enum dma_data_direction dir;
++ /* total size of mapped memory */
++ size_t size;
++ struct scif_range *pages;
++};
++
++struct qib_knx_tidrcv {
++ struct qib_knx_rma tidmem;
++ u64 tidbase;
++ u32 tidcnt;
++};
++
++struct qib_knx_ctxt {
++ u16 ctxt;
++ struct qib_knx *knx;
++ struct qib_pportdata *ppd;
++ /* local registered memory for PIO buffers */
++ struct qib_knx_rma piobufs[QLOGIC_IB_MAX_SUBCTXT];
++ /* local registered memory for user registers */
++ struct qib_knx_rma uregs;
++ /* local registered memory for PIO avail registers */
++ struct qib_knx_rma pioavail;
++ /* remote registered memory for RcvHdr Q */
++ struct qib_knx_mem_map_sg rcvhdrq;
++ /* remote registered memory for SendBuf status */
++ struct qib_knx_mem_map sbufstatus;
++ /* remote registered memory for RcvHdrTail register */
++ struct qib_knx_mem_map rcvhdrqtailaddr;
++ /* remote registered memory for Eager buffers */
++ struct qib_knx_mem_map_sg eagerbufs;
++
++ /* Saved offsets for shared context processes */
++ __u64 uregbase;
++ __u64 pioavailaddr;
++ __u64 status;
++ __u64 piobufbase[QLOGIC_IB_MAX_SUBCTXT];
++ __u32 runtime_flags;
++
++ struct qib_user_sdma_queue *pq[QLOGIC_IB_MAX_SUBCTXT];
++};
++
++struct qib_knx_sdma {
++ /* KNX flags page */
++ struct scif_range *mflag_pages;
++ struct qib_knx_sdma_mflags *mflags;
++ /* KNX descriptor queue */
++ struct scif_range *queue_pages;
++ struct qib_knx_sdma_desc *queue;
++ u32 desc_num;
++ /* host flags (in host memory) */
++ struct qib_knx_rma hflags_mem;
++ struct qib_knx_sdma_hflags *hflags;
++ u32 head; /* shadow */
++ u32 complete;
++};
++
++struct qib_knx {
++ struct list_head list;
++ struct scif_pollepd epd;
++ struct scif_portID peer;
++ struct scif_pci_info pci_info;
++ int numa_node;
++ struct qib_devdata *dd;
++ struct qib_knx_ctxt **ctxts;
++ spinlock_t ctxt_lock;
++ resource_size_t bar;
++ u64 barlen;
++ struct qib_knx_sdma *sdma;
++ struct task_struct *sdma_poll;
++ atomic_t tref;
++ char tname[64];
++ struct qib_knx_rma tidmem;
++};
++
++static struct qib_knx_server *server;
++
++static int qib_knx_init(struct qib_knx_server *);
++static void qib_knx_free(struct qib_knx *, int);
++static int qib_knx_server_listen(void *);
++static off_t qib_knx_register_memory(struct qib_knx *, struct qib_knx_rma *,
++ void *, size_t, int, const char *);
++static int qib_knx_unregister_memory(struct qib_knx *, struct qib_knx_rma *,
++ const char *);
++static __always_inline void qib_knx_memcpy(void *, void __iomem *, size_t);
++static ssize_t qib_show_knx_node(struct device *, struct device_attribute *,
++ char *);
++static int qib_knx_sdma_init(struct qib_knx *);
++static void qib_knx_sdma_teardown(struct qib_knx *);
++static __always_inline struct page *
++qib_knx_phys_to_page(struct qib_knx *, unsigned long);
++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *,
++ struct qib_knx_sdma_desc *,
++ struct qib_user_sdma_queue *,
++ int *, struct list_head *);
++static int qib_knx_sdma_poll(void *);
++static int qib_knx_tidrcv_init(struct qib_knx *);
++static int qib_knx_tidrcv_teardown(struct qib_knx *);
++
++inline struct qib_knx *qib_knx_get(u16 nodeid)
++{
++ struct qib_knx *knx = NULL;
++
++ spin_lock(&server->client_lock);
++ if (!list_empty(&server->clients))
++ list_for_each_entry(knx, &server->clients, list)
++ if (knx->peer.node == nodeid)
++ break;
++ spin_unlock(&server->client_lock);
++ return knx;
++}
++
++inline struct qib_devdata *qib_knx_node_to_dd(u16 node)
++{
++ struct qib_knx *knx = qib_knx_get(node);
++ return knx ? knx->dd : NULL;
++}
++
++static int qib_knx_init(struct qib_knx_server *server)
++{
++ int ret = 0, num_devs = 0, i, seen = 0;
++ unsigned fewest = -1U;
++ struct qib_devdata *dd = NULL, *dd_no_numa = NULL;
++ struct qib_knx *knx;
++ struct qib_device_info info = { -1 };
++
++ knx = kzalloc(sizeof(*knx), GFP_KERNEL);
++ if (!knx) {
++ ret = -ENOMEM;
++ goto bail;
++ }
++ ret = scif_accept(server->epd.epd, &knx->peer, &knx->epd.epd, 0);
++ if (ret) {
++ kfree(knx);
++ goto bail;
++ }
++
++ INIT_LIST_HEAD(&knx->list);
++ spin_lock_init(&knx->ctxt_lock);
++ knx->numa_node = -1;
++ ret = scif_pci_info(knx->peer.node, &knx->pci_info);
++ if (!ret) {
++ knx->numa_node = pcibus_to_node(knx->pci_info.pdev->bus);
++ knx->bar = pci_resource_start(knx->pci_info.pdev, 0);
++ knx->barlen = pci_resource_len(knx->pci_info.pdev, 0);
++ }
++
++ if (knx->numa_node < 0)
++ knx->numa_node = numa_node_id();
++
++ num_devs = qib_count_units(NULL, NULL);
++ if (unlikely(!num_devs)) {
++ ret = -ENODEV;
++ /* we have to send this */
++ scif_send(knx->epd.epd, &info, sizeof(info),
++ SCIF_SEND_BLOCK);
++ goto done;
++ }
++
++ /*
++ * Attempt to find an HCA on the same NUMA node as the card. Save
++ * the first HCA that hasn't been associated with a card in case
++ * there is no HCA on the same NUMA node.
++ */
++ for (i = 0; seen < num_devs; i++) {
++ dd = qib_lookup(i);
++ if (dd) {
++ if (dd->assigned_node_id == knx->numa_node) {
++ knx->dd = dd;
++ break;
++ } else if (dd->num_knx < fewest)
++ dd_no_numa = dd;
++ seen++;
++ }
++ }
++ /*
++	 * We didn't find a QIB device on the same NUMA node, so
++ * use the "backup".
++ */
++ if (unlikely(!knx->dd)) {
++ if (!dd_no_numa) {
++ ret = -ENODEV;
++ /* we have to send this */
++ scif_send(knx->epd.epd, &info, sizeof(info),
++ SCIF_SEND_BLOCK);
++ goto done;
++ }
++ knx->dd = dd_no_numa;
++ }
++ knx->dd->num_knx++;
++
++ knx->ctxts = kzalloc_node(knx->dd->ctxtcnt * sizeof(*knx->ctxts),
++ GFP_KERNEL, knx->numa_node);
++ if (!knx->ctxts)
++ ret = -ENOMEM;
++ /* Give the KNX the associated device information. */
++ info.unit = knx->dd->unit;
++ ret = scif_send(knx->epd.epd, &info, sizeof(info),
++ SCIF_SEND_BLOCK);
++
++ ret = qib_knx_sdma_init(knx);
++ if (ret)
++ goto done;
++ atomic_set(&knx->tref, 0);
++ ret = qib_knx_tidrcv_init(knx);
++done:
++ spin_lock(&server->client_lock);
++ list_add_tail(&knx->list, &server->clients);
++ server->nclients++;
++ spin_unlock(&server->client_lock);
++ try_module_get(THIS_MODULE);
++bail:
++ return ret;
++}
++
++static void qib_knx_free(struct qib_knx *knx, int unload)
++{
++ struct qib_devdata *dd = knx->dd;
++ int i;
++
++ qib_knx_tidrcv_teardown(knx);
++ qib_knx_sdma_teardown(knx);
++ if (dd)
++ dd->num_knx--;
++ /*
++ * If this function is called with unload set, we can
++ * free the context data. Otherwise, we are here
++ * because the connection between the modules has broken.
++ */
++ if (knx->ctxts && unload && dd)
++ for (i = dd->first_user_ctxt; i < dd->ctxtcnt; i++)
++ qib_knx_free_ctxtdata(dd, dd->rcd[i]);
++
++ scif_close(knx->epd.epd);
++ module_put(THIS_MODULE);
++ if (unload)
++ kfree(knx->ctxts);
++}
++
++static int qib_knx_server_listen(void *data)
++{
++ struct qib_knx_server *server =
++ (struct qib_knx_server *)data;
++ struct qib_knx *client, *ptr;
++ int ret = 0;
++
++ server->epd.epd = scif_open();
++ if (!server->epd.epd) {
++ ret = -EIO;
++ goto done;
++ }
++ server->epd.events = POLLIN;
++ ret = scif_bind(server->epd.epd, QIB_KNX_SCIF_PORT);
++ if (ret < 0)
++ goto err_close;
++
++ ret = scif_listen(server->epd.epd, qib_knx_nconns);
++ if (ret)
++ goto err_close;
++
++ while (!kthread_should_stop()) {
++ schedule();
++
++		/* poll for incoming connections; 50 ms timeout */
++ ret = scif_poll(&server->epd, 1, 50);
++ if (ret > 0)
++ ret = qib_knx_init(server);
++
++ /*
++ * Check for any disconnected clients and clean them up.
++ * Since there is nothing anywhere else that can change the
++ * list, we only lock when we are deleting a client so
++		 * list, we only lock when we are deleting a client so that
++		 * querying functions operate on a "static" list.
++ list_for_each_entry_safe(client, ptr, &server->clients, list) {
++ client->epd.events = POLLIN;
++ if (scif_poll(&client->epd, 1, 1)) {
++ if (client->epd.revents & POLLHUP) {
++ spin_lock(&server->client_lock);
++ list_del(&client->list);
++ spin_unlock(&server->client_lock);
++ qib_knx_free(client, 0);
++ kfree(client);
++ }
++ }
++ }
++ }
++err_close:
++ scif_close(server->epd.epd);
++done:
++ return ret;
++}
++
++
++static off_t qib_knx_register_memory(struct qib_knx *knx,
++ struct qib_knx_rma *rma, void *kvaddr,
++ size_t size, int prot, const char *what)
++{
++ int ret = 0;
++ off_t regoffset;
++
++ if (!kvaddr || ((unsigned long)kvaddr & ~PAGE_MASK)) {
++ ret = -EINVAL;
++ goto bail;
++ }
++ rma->kvaddr = kvaddr;
++ rma->size = size;
++
++ regoffset = scif_register(knx->epd.epd, rma->kvaddr, rma->size,
++ 0, prot, SCIF_MAP_KERNEL);
++ if (IS_ERR_VALUE(regoffset)) {
++ ret = regoffset;
++ goto bail;
++ }
++ rma->offset = regoffset;
++ return regoffset;
++bail:
++ rma->kvaddr = NULL;
++ rma->size = 0;
++ return ret;
++}
++
++static int qib_knx_unregister_memory(struct qib_knx *knx,
++ struct qib_knx_rma *rma, const char *what)
++{
++ int ret = 0;
++
++ if (!rma) {
++ ret = -EINVAL;
++ goto done;
++ }
++ if (rma->offset)
++ ret = scif_unregister(knx->epd.epd, rma->offset, rma->size);
++ rma->kvaddr = NULL;
++ rma->size = 0;
++ rma->offset = 0;
++done:
++ return ret;
++}
++
++static __always_inline void qib_knx_memcpy(void *dst, void __iomem *src,
++ size_t size)
++{
++ memcpy_fromio(dst, src, size);
++}
++
++int qib_knx_alloc_ctxt(u16 node_id, unsigned ctxt)
++{
++ struct qib_knx *knx = qib_knx_get(node_id);
++ struct qib_devdata *dd = knx->dd;
++ struct qib_knx_ctxt *ptr;
++ int ret = 0;
++
++ if (ctxt >= dd->ctxtcnt) {
++ ret = -EINVAL;
++ goto bail;
++ }
++ if (unlikely(!knx->ctxts)) {
++ ret = -ENOMEM;
++ goto bail;
++ }
++ ptr = kzalloc_node(sizeof(*ptr), GFP_KERNEL, knx->numa_node);
++ if (unlikely(!ptr)) {
++ ret = -ENOMEM;
++ goto bail;
++ }
++ ptr->knx = knx;
++ ptr->ctxt = ctxt;
++ ptr->ppd = dd->rcd[ctxt]->ppd;
++
++ spin_lock(&knx->ctxt_lock);
++ knx->ctxts[ctxt] = ptr;
++ dd->rcd[ctxt]->krcd = ptr;
++ spin_unlock(&knx->ctxt_lock);
++bail:
++ return ret;
++}
++
++__u64 qib_knx_ctxt_info(struct qib_ctxtdata *rcd,
++ enum qib_knx_ctxtinfo_type type,
++ struct file *fp)
++{
++ struct qib_knx *knx = rcd->krcd->knx;
++ __u16 subctxt;
++ __u64 ret = 0;
++
++ spin_lock(&knx->ctxt_lock);
++ if (!knx || !knx->ctxts || !knx->ctxts[rcd->ctxt])
++ goto done;
++
++ switch (type) {
++ case QIB_KNX_CTXTINFO_UREG:
++ ret = knx->ctxts[rcd->ctxt]->uregbase;
++ break;
++ case QIB_KNX_CTXTINFO_PIOAVAIL:
++ ret = knx->ctxts[rcd->ctxt]->pioavailaddr;
++ break;
++ case QIB_KNX_CTXTINFO_STATUS:
++ ret = knx->ctxts[rcd->ctxt]->status;
++ break;
++ case QIB_KNX_CTXTINFO_PIOBUFBASE:
++ subctxt = fp ? subctxt_fp(fp) : 0;
++ ret = knx->ctxts[rcd->ctxt]->piobufbase[subctxt];
++ break;
++ case QIB_KNX_CTXTINFO_FLAGS:
++ ret = knx->ctxts[rcd->ctxt]->runtime_flags;
++ break;
++ }
++done:
++ spin_unlock(&knx->ctxt_lock);
++ return ret;
++}
++
++int qib_knx_setup_piobufs(struct qib_devdata *dd, struct qib_ctxtdata *rcd,
++ __u16 subctxt)
++{
++ unsigned piobufs, piocnt;
++ char buf[16];
++ off_t offset;
++ int ret = 0;
++ struct qib_knx *knx = rcd->krcd->knx;
++
++ if (unlikely(!knx)) {
++ ret = -ENODEV;
++ goto bail;
++ }
++ if (unlikely(!knx->ctxts[rcd->ctxt])) {
++ ret = -EINVAL;
++ goto bail;
++ }
++
++ /*
++ * We don't calculate piobufs based on the rcd->piobufs like
++ * everywhere else in the driver because rcd->piobufs is based
++ * on the 2K PIO buffer virtual address. We just need an offset.
++ */
++ piobufs = rcd->pio_base * dd->palign;
++ if (!rcd->subctxt_cnt)
++ piocnt = rcd->piocnt;
++ else if (!subctxt) {
++ piocnt = (rcd->piocnt / rcd->subctxt_cnt) +
++ (rcd->piocnt % rcd->subctxt_cnt);
++ piobufs += dd->palign * (rcd->piocnt - piocnt);
++ } else {
++ piocnt = rcd->piocnt / rcd->subctxt_cnt;
++ piobufs += dd->palign * piocnt * (subctxt - 1);
++ }
++
++ /* register PIO buffers */
++ snprintf(buf, sizeof(buf), "PIO bufs %u:%u", rcd->ctxt, subctxt);
++ offset = qib_knx_register_memory(
++ knx, &knx->ctxts[rcd->ctxt]->piobufs[subctxt],
++ dd->piobase + piobufs, piocnt * dd->palign,
++ SCIF_PROT_WRITE, buf);
++ if (IS_ERR_VALUE(offset)) {
++ ret = offset;
++ goto bail;
++ }
++ knx->ctxts[rcd->ctxt]->piobufbase[subctxt] = offset;
++bail:
++ return ret;
++}
++
++int qib_knx_setup_pioregs(struct qib_devdata *dd, struct qib_ctxtdata *rcd,
++ struct qib_base_info *binfo)
++{
++ int ret = 0;
++ off_t offset;
++ struct qib_knx *knx = rcd->krcd->knx;
++
++ if (unlikely(!knx)) {
++ ret = -ENODEV;
++ goto bail;
++ }
++ if (unlikely(!knx->ctxts[rcd->ctxt])) {
++ ret = -EINVAL;
++ goto bail;
++ }
++
++ /* register the user registers to remote mapping */
++ offset = qib_knx_register_memory(knx, &knx->ctxts[rcd->ctxt]->uregs,
++ (char *)dd->userbase +
++ (dd->ureg_align * rcd->ctxt),
++ dd->flags & QIB_HAS_HDRSUPP ?
++ 2 * PAGE_SIZE : PAGE_SIZE,
++ SCIF_PROT_READ|SCIF_PROT_WRITE,
++ "UserRegs");
++ if (IS_ERR_VALUE(offset)) {
++ ret = offset;
++ goto bail;
++ }
++ knx->ctxts[rcd->ctxt]->uregbase = offset;
++
++ /*
++	 * Register the PIO availability registers.  The user status 64-bit
++	 * values are part of the page containing the PIO availability
++	 * registers.
++ */
++ offset = qib_knx_register_memory(knx, &knx->ctxts[rcd->ctxt]->pioavail,
++ (void *)dd->pioavailregs_dma,
++ PAGE_SIZE, SCIF_PROT_READ,
++ "pioavail regs");
++ if (IS_ERR_VALUE(offset)) {
++ ret = offset;
++ goto bail_uregs;
++ }
++ knx->ctxts[rcd->ctxt]->pioavailaddr = offset;
++ /*
++ * User status bitmask is part of the same mapped page as the PIO
++ * availability bits and user level code should know that. Therefore,
++ * we just need to give it the offset into the mapped page where the
++ * status mask is located.
++ */
++ knx->ctxts[rcd->ctxt]->status = offset;
++ /* Record the run time flags that were passed in by the user. */
++ knx->ctxts[rcd->ctxt]->runtime_flags = binfo->spi_runtime_flags;
++ goto bail;
++bail_uregs:
++ qib_knx_unregister_memory(knx, &knx->ctxts[rcd->ctxt]->uregs,
++ "UserRegs");
++bail:
++ return ret;
++}
++
++int qib_knx_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd,
++ struct qib_base_info *binfo)
++{
++ struct qib_knx_mem_map_sg *mapsg;
++ struct qib_knx_mem_map *map;
++ struct qib_knx *knx = rcd->krcd->knx;
++ dma_addr_t offset;
++ struct scatterlist *sg;
++ unsigned num_pages;
++ size_t size;
++ int ret = 0, i;
++
++ if (unlikely(!knx)) {
++ ret = -ENODEV;
++ goto bail;
++ }
++ if (unlikely(!knx->ctxts[rcd->ctxt])) {
++ ret = -EINVAL;
++ goto bail;
++ }
++ if (unlikely(!binfo->spi_rcvhdr_base)) {
++ ret = -EIO;
++ goto bail;
++ }
++
++ size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
++ sizeof(u32), PAGE_SIZE);
++ mapsg = &knx->ctxts[rcd->ctxt]->rcvhdrq;
++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcvhdr_base,
++ size, &mapsg->pages);
++ if (ret)
++ goto bail;
++ if (!mapsg->pages->nr_pages) {
++ rcd->rcvhdrq = NULL;
++ ret = -ENOMEM;
++ goto bail_rcvq_pages;
++ }
++ num_pages = mapsg->pages->nr_pages;
++ if (num_pages * PAGE_SIZE != size) {
++ ret = -EINVAL;
++ goto bail_rcvq_pages;
++ }
++ rcd->rcvhdrq_size = size;
++ /* verify that rcvhdr q is contiguous */
++ offset = mapsg->pages->phys_addr[0];
++ for (i = 1; i < num_pages; i++) {
++ if (offset + PAGE_SIZE != mapsg->pages->phys_addr[i]) {
++ ret = -EFAULT;
++ goto bail_rcvq_pages;
++ }
++ offset += PAGE_SIZE;
++ }
++ memset(mapsg->pages->va[0], 0, size);
++ mapsg->size = size;
++ mapsg->dir = DMA_FROM_DEVICE;
++ /*
++	 * Streaming DMA mappings are supposed to be short-lived.
++ * The mappings here are not exactly short-lived and
++ * technically we might not even need them since SusieQ
++ * can use 64bit addresses for DMA but the CPU might not.
++ * (see pci_set_dma_mask() in qib_pcie.c).
++ */
++ mapsg->sglist = kzalloc_node(num_pages * sizeof(*mapsg->sglist),
++ GFP_KERNEL, knx->numa_node);
++ if (!mapsg->sglist) {
++ ret = -ENOMEM;
++ goto bail_rcvq_pages;
++ }
++ sg_init_table(mapsg->sglist, num_pages);
++ for_each_sg(mapsg->sglist, sg, num_pages, i)
++ sg_set_page(sg, vmalloc_to_page(mapsg->pages->va[i]), PAGE_SIZE,
++ 0);
++ ret = pci_map_sg(dd->pcidev, mapsg->sglist, num_pages, mapsg->dir);
++ if (!ret) {
++ rcd->rcvhdrq_phys = 0;
++ goto bail_free_sgtable;
++ }
++ /*
++ * pci_map_sg() will remap all 128 pages of the
++ * scatterlist separately (without coalescing them).
++ * However, since the buffer is contiguous, as long
++ * as the base address is mapped correctly, everything
++ * should work. In any case, check that the mapped
++ * addresses are contiguous anyway.
++ */
++ offset = sg_dma_address(mapsg->sglist);
++ for_each_sg(mapsg->sglist, sg, num_pages, i) {
++ dma_addr_t sgaddr;
++ sgaddr = sg_dma_address(sg);
++ if ((offset == sgaddr && i) ||
++ (offset != sgaddr && sgaddr != offset + PAGE_SIZE)) {
++ ret = -EINVAL;
++ goto bail_rcvhdrq;
++ }
++ offset = sgaddr;
++ }
++ rcd->rcvhdrq_phys = sg_dma_address(mapsg->sglist);
++ rcd->rcvhdrq = mapsg->pages->va[0];
++
++ map = &knx->ctxts[rcd->ctxt]->sbufstatus;
++ ret = scif_get_pages(knx->epd.epd, binfo->spi_sendbuf_status,
++ PAGE_SIZE, &map->pages);
++ if (ret)
++ goto bail_rcvhdrq;
++
++ map->size = PAGE_SIZE;
++ if (map->pages->nr_pages > 0) {
++ rcd->user_event_mask = map->pages->va[0];
++ /*
++ * clear the mapped page - this is important as it will cause
++ * user level to request "invalid" updates on every PIO send.
++ */
++ memset(rcd->user_event_mask, 0, PAGE_SIZE);
++ }
++ /*
++	 * Map the rcvhdrtailaddr page(s) if we are going to DMA the tail
++	 * register to memory; the chip will be programmed when
++ * qib_do_user_init() calls f_rcvctrl().
++ */
++ if (!(dd->flags & QIB_NODMA_RTAIL) && binfo->spi_rcvhdr_tailaddr) {
++ map = &knx->ctxts[rcd->ctxt]->rcvhdrqtailaddr;
++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcvhdr_tailaddr,
++ PAGE_SIZE, &map->pages);
++ if (ret)
++ goto bail_umask;
++ map->size = PAGE_SIZE;
++ map->dir = DMA_FROM_DEVICE;
++ /* don't reuse num_pages in case there is an error */
++ if (map->pages->nr_pages > 0) {
++ rcd->rcvhdrqtailaddr_phys =
++ pci_map_page(dd->pcidev,
++ vmalloc_to_page(map->pages->va[0]),
++ 0, map->size, map->dir);
++ if (pci_dma_mapping_error(dd->pcidev,
++ rcd->rcvhdrqtailaddr_phys)) {
++ rcd->rcvhdrqtailaddr_phys = 0;
++ ret = -ENOMEM;
++ goto bail_tail;
++ }
++ rcd->rcvhdrtail_kvaddr = map->pages->va[0];
++ /* clear, just in case... */
++ memset(rcd->rcvhdrtail_kvaddr, 0, map->size);
++ map->dma_mapped_addr =
++ rcd->rcvhdrqtailaddr_phys;
++ knx->ctxts[rcd->ctxt]->runtime_flags &=
++ ~QIB_RUNTIME_NODMA_RTAIL;
++ }
++ }
++ ret = 0;
++ goto bail;
++bail_tail:
++ scif_put_pages(knx->ctxts[rcd->ctxt]->rcvhdrqtailaddr.pages);
++bail_umask:
++ rcd->user_event_mask = NULL;
++ scif_put_pages(knx->ctxts[rcd->ctxt]->sbufstatus.pages);
++bail_rcvhdrq:
++ rcd->rcvhdrq = NULL;
++ pci_unmap_sg(dd->pcidev, knx->ctxts[rcd->ctxt]->rcvhdrq.sglist,
++ num_pages, knx->ctxts[rcd->ctxt]->rcvhdrq.dir);
++bail_free_sgtable:
++ kfree(knx->ctxts[rcd->ctxt]->rcvhdrq.sglist);
++bail_rcvq_pages:
++ scif_put_pages(knx->ctxts[rcd->ctxt]->rcvhdrq.pages);
++bail:
++ return ret;
++}
++
++int qib_knx_setup_eagerbufs(struct qib_ctxtdata *rcd,
++ struct qib_base_info *binfo)
++{
++ struct qib_knx_mem_map_sg *map;
++ struct scatterlist *sg;
++ struct qib_devdata *dd = rcd->dd;
++ struct qib_knx *knx = rcd->krcd->knx;
++ unsigned size, egrsize, egrcnt, num_pages, bufs_ppage,
++ egrbufcnt;
++ dma_addr_t dma_addr, page;
++ int ret = -ENOMEM, i, bufcnt;
++
++ if (unlikely(!knx)) {
++ ret = -ENODEV;
++ goto bail;
++ }
++ if (unlikely(!knx->ctxts[rcd->ctxt])) {
++ ret = -EINVAL;
++ goto bail;
++ }
++ if (unlikely(!binfo->spi_rcv_egrbufs)) {
++ ret = -ENOBUFS;
++ goto bail;
++ }
++ size = binfo->spi_rcv_egrbuftotlen;
++ egrsize = dd->rcvegrbufsize;
++ egrcnt = rcd->rcvegrcnt;
++
++ /*
++ * Check whether the total size of the buffer is enough for all
++ * Eager buffers.
++ */
++ if (size < egrsize * egrcnt) {
++ ret = -EINVAL;
++ goto bail;
++ }
++
++ /* number of pages required to fit all the eager buffers */
++ num_pages = (egrsize * egrcnt) / PAGE_SIZE;
++ /* number of buffers per page (depends on MTU) */
++ bufs_ppage = PAGE_SIZE / egrsize;
++ map = &knx->ctxts[rcd->ctxt]->eagerbufs;
++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcv_egrbufs,
++ size, &map->pages);
++ if (ret)
++ goto bail;
++
++ if (map->pages->nr_pages != num_pages) {
++ ret = -EINVAL;
++ goto bail_free_scif;
++ }
++
++ /*
++ * Allocate pointer to the pages from the KNX memory.
++ * In the case of KNX eager buffers, we are not dealing with
++ * 32K chunks of locally allocated memory. Therefore, we
++ * allocate num_pages pointers instead of rcd->rcvegrbuf_chunks.
++ */
++ if (likely(!rcd->rcvegrbuf)) {
++ rcd->rcvegrbuf = kzalloc_node(num_pages *
++ sizeof(rcd->rcvegrbuf[0]),
++ GFP_KERNEL, rcd->node_id);
++ if (!rcd->rcvegrbuf) {
++ ret = -ENOMEM;
++ goto bail_free_scif;
++ }
++ }
++
++ /*
++ * Allocate array of DMA addresses for each of the mapped
++ * pages.
++ */
++ if (likely(!rcd->rcvegrbuf_phys)) {
++ rcd->rcvegrbuf_phys =
++ kzalloc_node(num_pages * sizeof(rcd->rcvegrbuf_phys[0]),
++ GFP_KERNEL, rcd->node_id);
++ if (!rcd->rcvegrbuf_phys) {
++ ret = -ENOMEM;
++ goto bail_free_rcvegr;
++ }
++ }
++
++ map->size = size;
++ map->dir = DMA_BIDIRECTIONAL;
++ map->sglist = kzalloc_node(num_pages * sizeof(*map->sglist), GFP_KERNEL,
++ knx->numa_node);
++ if (!map->sglist) {
++ ret = -ENOMEM;
++ goto bail_free_rcvegr_phys;
++ }
++ sg_init_table(map->sglist, num_pages);
++ for_each_sg(map->sglist, sg, num_pages, i) {
++ memset(map->pages->va[i], 0, PAGE_SIZE);
++ sg_set_page(sg, vmalloc_to_page(map->pages->va[i]),
++ PAGE_SIZE, 0);
++ }
++ ret = pci_map_sg(dd->pcidev, map->sglist, num_pages, map->dir);
++ if (!ret) {
++ ret = -ENOMEM;
++ goto bail_free_rcvegr_phys;
++ }
++ for_each_sg(map->sglist, sg, num_pages, i) {
++ rcd->rcvegrbuf_phys[i] = sg_dma_address(sg);
++ rcd->rcvegrbuf[i] = map->pages->va[i];
++ }
++
++ for (egrbufcnt = i = 0; i < num_pages; i++) {
++ page = rcd->rcvegrbuf_phys[i];
++ dma_addr = page;
++ for (bufcnt = 0; egrbufcnt < egrcnt && bufcnt < bufs_ppage;
++ egrbufcnt++, bufcnt++) {
++ dd->f_put_tid(dd, rcd->rcvegr_tid_base +
++ egrbufcnt +
++ (u64 __iomem *)((char __iomem *)
++ dd->kregbase +
++ dd->rcvegrbase),
++ RCVHQ_RCV_TYPE_EAGER, dma_addr);
++ dma_addr += egrsize;
++ }
++ }
++ ret = 0;
++ goto bail;
++bail_free_rcvegr_phys:
++ kfree(map->sglist);
++ kfree(rcd->rcvegrbuf_phys);
++ rcd->rcvegrbuf_phys = NULL;
++bail_free_rcvegr:
++ kfree(rcd->rcvegrbuf);
++ rcd->rcvegrbuf = NULL;
++bail_free_scif:
++ scif_put_pages(map->pages);
++bail:
++ return ret;
++}
++
++void qib_knx_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
++{
++ struct qib_knx *knx = rcd->krcd->knx;
++ struct qib_knx_ctxt *ctxt;
++ char buf[16];
++ int i, ret = 0;
++
++ if (!rcd || !knx || !knx->ctxts)
++ return;
++
++ spin_lock(&knx->ctxt_lock);
++ ctxt = knx->ctxts[rcd->ctxt];
++ knx->ctxts[rcd->ctxt] = NULL;
++ spin_unlock(&knx->ctxt_lock);
++
++ if (!ctxt)
++ return;
++
++ if (rcd->rcvhdrq) {
++ /* Unmap the RcvHdr Q */
++ pci_unmap_sg(dd->pcidev, ctxt->rcvhdrq.sglist,
++ ctxt->rcvhdrq.pages->nr_pages,
++ ctxt->rcvhdrq.dir);
++ /* TODO: do something with return value */
++ ret = scif_put_pages(ctxt->rcvhdrq.pages);
++ kfree(ctxt->rcvhdrq.sglist);
++ }
++
++ if (rcd->user_event_mask)
++ /* TODO: do something with return value */
++ ret = scif_put_pages(ctxt->sbufstatus.pages);
++
++ if (rcd->rcvhdrtail_kvaddr) {
++ pci_unmap_page(dd->pcidev,
++ ctxt->rcvhdrqtailaddr.dma_mapped_addr,
++ ctxt->rcvhdrqtailaddr.size,
++ ctxt->rcvhdrqtailaddr.dir);
++ /* TODO: do something with return value */
++ ret = scif_put_pages(ctxt->rcvhdrqtailaddr.pages);
++ }
++
++ if (rcd->rcvegrbuf) {
++ pci_unmap_sg(dd->pcidev, ctxt->eagerbufs.sglist,
++ ctxt->eagerbufs.pages->nr_pages,
++ ctxt->eagerbufs.dir);
++ /* TODO: do something with return value */
++ ret = scif_put_pages(ctxt->eagerbufs.pages);
++ kfree(ctxt->eagerbufs.sglist);
++ kfree(rcd->rcvegrbuf);
++ kfree(rcd->rcvegrbuf_phys);
++ }
++
++ /* We are done with all remote memory, handle local */
++ qib_knx_unregister_memory(knx, &ctxt->pioavail, "pioavail regs");
++ qib_knx_unregister_memory(knx, &ctxt->uregs, "UserRegs");
++ for (i = 0; i < QLOGIC_IB_MAX_SUBCTXT; i++) {
++ snprintf(buf, sizeof(buf), "PIO bufs %u:%u", rcd->ctxt, i);
++ qib_knx_unregister_memory(knx, &ctxt->piobufs[i], buf);
++ }
++
++ kfree(ctxt);
++ kfree(rcd);
++}
++
++/*
++ * TID management for processes on the MIC happens on the MIC. Therefore,
++ * we only register the HW TID array here.
++ * The MIC will calculate TID array offsets using the same algorithm as
++ * the host. Therefore, it is OK that the entire HW TID array is mapped
++ * since neither side should step on the other.
++ */
++static int qib_knx_tidrcv_init(struct qib_knx *knx)
++{
++ struct qib_devdata *dd = knx->dd;
++ struct qib_knx_tid_info info;
++ void *tidbase;
++ int ret = 0;
++ off_t offset = 0;
++ size_t len;
++ char buf[64];
++
++ memset(&info, 0, sizeof(info));
++
++ info.tidcnt = dd->rcvtidcnt;
++ tidbase = ((char *)dd->kregbase + dd->rcvtidbase);
++ info.tidbase_len = dd->ctxtcnt * dd->rcvtidcnt * sizeof(tidbase);
++ info.tidtemplate = dd->tidtemplate;
++ info.invalidtid = dd->tidinvalid;
++ /* information needed to properly calculate DMA address to MIC pages */
++ info.bar_addr = knx->bar;
++ info.bar_len = knx->barlen;
++
++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node);
++ offset = qib_knx_register_memory(knx, &knx->tidmem, tidbase,
++ info.tidbase_len, SCIF_PROT_WRITE,
++ buf);
++ info.tidbase_offset = offset;
++ if (IS_ERR_VALUE(offset))
++ ret = offset;
++ len = scif_send(knx->epd.epd, &info, sizeof(info),
++ SCIF_SEND_BLOCK);
++ if (len < sizeof(info))
++ ret = -EFAULT;
++ return ret;
++}
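++
++/*
++ * Illustrative only: with the whole HW TID array registered above at
++ * info.tidbase_offset, the MIC side is expected to address an individual
++ * TID entry the same way the host indexes its array, roughly as sketched
++ * below.  "ctxt" and "tididx" are hypothetical variables, the exact
++ * MIC-side helper is not part of this patch, and the snippet is not
++ * compiled.
++ */
++#if 0
++	off_t tid_off = info.tidbase_offset +			/* assumed layout */
++			(ctxt * info.tidcnt + tididx) * sizeof(u64);
++#endif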
++
++static int qib_knx_tidrcv_teardown(struct qib_knx *knx)
++{
++ char buf[64];
++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node);
++ return qib_knx_unregister_memory(knx, &knx->tidmem, buf);
++}
++
++static int qib_knx_sdma_init(struct qib_knx *knx)
++{
++ struct qib_knx_host_mem flags;
++ struct qib_knx_knc_mem mflags;
++ struct qib_knx_sdma *sdma;
++ char buf[64];
++ int ret = 0;
++
++ sdma = kzalloc_node(sizeof(*sdma), GFP_KERNEL, knx->numa_node);
++ if (!sdma) {
++ ret = -ENOMEM;
++ goto done;
++ }
++ sdma->hflags = kzalloc_node(PAGE_SIZE, GFP_KERNEL, knx->numa_node);
++ if (!sdma->hflags) {
++ ret = -ENOMEM;
++ goto done_free;
++ }
++ snprintf(buf, sizeof(buf), "Host SDMA flags KNx%u", knx->peer.node);
++ flags.flags_offset = qib_knx_register_memory(knx, &sdma->hflags_mem,
++ sdma->hflags,
++ PAGE_SIZE,
++ SCIF_PROT_WRITE,
++ buf);
++ if (IS_ERR_VALUE(flags.flags_offset)) {
++ ret = flags.flags_offset;
++ goto free_flags;
++ }
++ sdma->desc_num = knx->dd->pport[0].sdma_descq_cnt;
++ flags.desc_num = sdma->desc_num;
++ ret = scif_send(knx->epd.epd, &flags, sizeof(flags),
++ SCIF_SEND_BLOCK);
++ if (ret < sizeof(flags))
++ goto unregister;
++ ret = scif_recv(knx->epd.epd, &mflags, sizeof(mflags),
++ SCIF_RECV_BLOCK);
++ if (ret < sizeof(mflags)) {
++ ret = -EINVAL;
++ goto unregister;
++ }
++ ret = scif_get_pages(knx->epd.epd, mflags.flags_offset,
++ PAGE_SIZE, &sdma->mflag_pages);
++ if (ret < 0 || !sdma->mflag_pages->nr_pages) {
++ ret = -EFAULT;
++ goto unregister;
++ }
++ sdma->mflags = sdma->mflag_pages->va[0];
++ ret = scif_get_pages(knx->epd.epd, mflags.queue_offset,
++ mflags.queue_len, &sdma->queue_pages);
++ if (ret < 0)
++ goto put_flags;
++ if ((sdma->queue_pages->nr_pages * PAGE_SIZE) !=
++ mflags.queue_len) {
++ ret = -EFAULT;
++ goto put_queue;
++ }
++ sdma->queue = sdma->queue_pages->va[0];
++ sdma->complete = -1;
++ sdma->head = -1;
++ /* set the initial trigger value */
++ QIB_KNX_SDMA_SET(sdma->hflags->trigger, -1);
++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete);
++ snprintf(knx->tname, sizeof(knx->tname), "qib/mic%u/poll",
++ knx->peer.node);
++ knx->sdma = sdma;
++ ret = 0;
++ goto done;
++put_queue:
++ scif_put_pages(sdma->queue_pages);
++put_flags:
++ scif_put_pages(sdma->mflag_pages);
++unregister:
++ qib_knx_unregister_memory(knx, &sdma->hflags_mem, buf);
++free_flags:
++ kfree(sdma->hflags);
++done_free:
++ kfree(sdma);
++done:
++ /*
++ * we have to respond to the MIC so it doesn't get stuck
++ * in the scif_recv call
++ */
++ scif_send(knx->epd.epd, &ret, sizeof(ret), SCIF_SEND_BLOCK);
++ return ret;
++}
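++
++/*
++ * For reference, a minimal sketch of the assumed MIC-side counterpart of
++ * the handshake above (it is not part of this patch): the peer receives
++ * the host offsets, registers its own flag page and descriptor queue,
++ * returns those offsets, and then waits for the final status word.
++ * mic_flags_page, mic_queue, queue_len and status are placeholder names,
++ * and the snippet is not compiled.
++ */
++#if 0
++	scif_recv(epd, &flags, sizeof(flags), SCIF_RECV_BLOCK);
++	/* register placeholder buffers on the MIC side */
++	mflags.flags_offset = scif_register(epd, mic_flags_page, PAGE_SIZE,
++					    0, SCIF_PROT_WRITE, SCIF_MAP_KERNEL);
++	mflags.queue_offset = scif_register(epd, mic_queue, queue_len,
++					    0, SCIF_PROT_WRITE, SCIF_MAP_KERNEL);
++	mflags.queue_len = queue_len;
++	scif_send(epd, &mflags, sizeof(mflags), SCIF_SEND_BLOCK);
++	scif_recv(epd, &status, sizeof(status), SCIF_RECV_BLOCK);
++#endif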
++
++static void qib_knx_sdma_teardown(struct qib_knx *knx)
++{
++ int ret;
++ if (knx->sdma_poll)
++ ret = kthread_stop(knx->sdma_poll);
++ if (knx->sdma) {
++ if (knx->sdma->queue_pages->nr_pages) {
++ knx->sdma->queue = NULL;
++ scif_put_pages(knx->sdma->queue_pages);
++ }
++ if (knx->sdma->mflag_pages->nr_pages) {
++ knx->sdma->mflags = NULL;
++ scif_put_pages(knx->sdma->mflag_pages);
++ }
++ kfree(knx->sdma->hflags);
++ kfree(knx->sdma);
++ knx->sdma = NULL;
++ }
++}
++
++int qib_knx_sdma_queue_create(struct file *fd)
++{
++ struct qib_ctxtdata *rcd = ctxt_fp(fd);
++ struct qib_devdata *dd = rcd->dd;
++ struct qib_knx *knx = rcd->krcd->knx;
++ struct qib_knx_ctxt *ctxt = knx->ctxts[rcd->ctxt];
++ u8 subctxt = subctxt_fp(fd);
++ int ret = 0;
++
++ if (!ctxt) {
++ ret = -EINVAL;
++ goto done;
++ }
++ ctxt->pq[subctxt] = qib_user_sdma_queue_create(&dd->pcidev->dev,
++ dd->unit, rcd->ctxt,
++ subctxt);
++ if (!ctxt->pq[subctxt])
++ ret = -ENOMEM;
++ user_sdma_queue_fp(fd) = ctxt->pq[subctxt];
++ /*
++ * We start the polling thread the first time a user SDMA
++ * queue is created. There is no reason to take up CPU
++ * cycles before then.
++ */
++ if (atomic_inc_return(&knx->tref) == 1) {
++ knx->sdma_poll = kthread_run(qib_knx_sdma_poll, knx,
++ knx->tname);
++ if (IS_ERR(knx->sdma_poll)) {
++ ret = -PTR_ERR(knx->sdma_poll);
++ atomic_dec(&knx->tref);
++ goto free_queue;
++ }
++ }
++ goto done;
++free_queue:
++ user_sdma_queue_fp(fd) = NULL;
++ qib_user_sdma_queue_destroy(ctxt->pq[subctxt]);
++ ctxt->pq[subctxt] = NULL;
++done:
++ return ret;
++}
++
++void qib_knx_sdma_queue_destroy(struct qib_filedata *fd)
++{
++ struct qib_ctxtdata *rcd = fd->rcd;
++ struct qib_knx *knx;
++ unsigned ctxt = rcd->ctxt, subctxt = fd->subctxt;
++
++ /* Host processes do not have a KNX rcd pointer. */
++ if (!rcd->krcd)
++ return;
++ knx = rcd->krcd->knx;
++ /* We still have the memory pointer through fd->pq */
++ spin_lock(&knx->ctxt_lock);
++ if (knx->ctxts[ctxt])
++ knx->ctxts[ctxt]->pq[subctxt] = NULL;
++ spin_unlock(&knx->ctxt_lock);
++ if (atomic_dec_and_test(&knx->tref)) {
++ int ret = kthread_stop(knx->sdma_poll);
++ knx->sdma_poll = NULL;
++ }
++}
++
++/*
++ * Convert a MIC physical address to the corresponding host page.
++ */
++static __always_inline struct page *
++qib_knx_phys_to_page(struct qib_knx *knx, unsigned long addr) {
++ unsigned long paddr;
++ if ((knx->bar + addr + PAGE_SIZE) >
++ (knx->bar + knx->barlen))
++ return NULL;
++ paddr = knx->bar + addr;
++ return pfn_to_page(paddr >> PAGE_SHIFT);
++}
++
++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *ctxt,
++ struct qib_knx_sdma_desc *desc,
++ struct qib_user_sdma_queue *pq,
++ int *ndesc, struct list_head *list)
++{
++ struct qib_knx *knx = ctxt->knx;
++ struct qib_user_sdma_pkt *pkt;
++ dma_addr_t pbc_dma_addr;
++ unsigned pktnw, pbcnw;
++ u32 counter;
++ u16 frag_size;
++ int ret = 0;
++ __le32 *pbc;
++
++ counter = pq->counter;
++
++ pbc = qib_user_sdma_alloc_header(pq, desc->pbclen, &pbc_dma_addr);
++ if (!pbc) {
++ ret = -ENOMEM;
++ goto done;
++ }
++ memcpy(pbc, desc->pbc, desc->pbclen);
++
++ pktnw = (le32_to_cpu(*pbc) & 0xFFFF);
++ /*
++ * This assignment is a bit strange. It's because the
++ * pbc counts the number of 32-bit words in the full
++ * packet _except_ the first word of the pbc itself...
++ */
++ pbcnw = (desc->pbclen >> 2) - 1;
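++ /*
++ * Worked example (illustrative): a 16B pbc gives pbcnw = 16/4 - 1 = 3,
++ * so a 64B payload needs pktnw = 64/4 + 3 = 19 to pass the checks below.
++ */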
++
++ if (pktnw < pbcnw) {
++ ret = -EINVAL;
++ goto free_pbc;
++ }
++
++ if (pktnw != ((desc->length >> 2) + pbcnw)) {
++ ret = -EINVAL;
++ goto free_pbc;
++ }
++
++ frag_size = (le32_to_cpu(*pbc)>>16) & 0xFFFF;
++ if (((frag_size ? frag_size : desc->length) + desc->pbclen) >
++ ctxt->ppd->ibmaxlen) {
++ ret = -EINVAL;
++ goto free_pbc;
++ }
++ if (frag_size) {
++ /* new SDMA "protocol" */
++ unsigned pktsize, n;
++
++ n = desc->npages * ((2 * PAGE_SIZE / frag_size) + 1);
++ pktsize = sizeof(*pkt) + sizeof(pkt->addr[0]) * n;
++
++ pkt = kzalloc(pktsize + desc->tidlen, GFP_KERNEL);
++ if (!pkt) {
++ ret = -ENOMEM;
++ goto free_pbc;
++ }
++ pkt->largepkt = 1;
++ pkt->frag_size = frag_size;
++ pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
++
++ if (desc->tidlen) {
++ char *tidsmptr = (char *)pkt + pktsize;
++ memcpy(tidsmptr, desc->tidsm, desc->tidlen);
++ pkt->tidsm =
++ (struct qib_tid_session_member *)tidsmptr;
++ pkt->tidsmcount = desc->tidlen /
++ sizeof(*desc->tidsm);
++ pkt->tidsmidx = 0;
++ }
++ *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF);
++ } else {
++ /* old SDMA */
++ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
++ if (!pkt) {
++ ret = -ENOMEM;
++ goto free_pbc;
++ }
++ pkt->largepkt = 0;
++ pkt->frag_size = desc->length;
++ pkt->addrlimit = ARRAY_SIZE(pkt->addr);
++ }
++ pkt->bytes_togo = desc->length;
++ pkt->payload_size = 0;
++ pkt->counter = counter;
++ pkt->tiddma = !!desc->tidlen;
++ /*
++ * The generic user SDMA code will use this as a flag to
++ * decide whether to call the KNx-specific pkt free
++ * function. However, it doesn't know what the value
++ * actually means.
++ */
++ pkt->remote = (u64)knx;
++
++ qib_user_sdma_init_frag(pkt, 0,
++ 0, desc->pbclen,
++ 1, 0,
++ 0, 0,
++ NULL, pbc,
++ pbc_dma_addr, desc->pbclen);
++ pkt->index = 0;
++ pkt->naddr = 1;
++
++ if (desc->npages) {
++ /* we have user data */
++ int i;
++ struct page *page;
++ unsigned plen = 0, len = desc->length;
++ for (i = 0; i < desc->npages; i++) {
++ unsigned long off = (i == 0 ? desc->offset : 0);
++ plen = (len > PAGE_SIZE ? PAGE_SIZE : len);
++ page = qib_knx_phys_to_page(knx, desc->pages[i]);
++ ret = qib_user_sdma_page_to_frags(knx->dd, pq,
++ pkt, page, 0, off,
++ (off + plen > PAGE_SIZE ?
++ PAGE_SIZE - off : plen),
++ NULL);
++ if (ret < 0)
++ goto free_sdma;
++ len -= plen - off;
++ }
++ } else {
++ pkt->addr[0].last_desc = 1;
++ if (pbc_dma_addr == 0) {
++ pbc_dma_addr = dma_map_single(&knx->dd->pcidev->dev,
++ pbc, desc->pbclen,
++ DMA_TO_DEVICE);
++ if (dma_mapping_error(&knx->dd->pcidev->dev,
++ pbc_dma_addr)) {
++ ret = -ENOMEM;
++ goto free_sdma;
++ }
++ pkt->addr[0].addr = pbc_dma_addr;
++ pkt->addr[0].dma_mapped = 1;
++ }
++ }
++ counter++;
++ pkt->pq = pq;
++ pkt->index = 0;
++ *ndesc = pkt->naddr;
++
++ list_add_tail(&pkt->list, list);
++ goto done;
++free_sdma:
++ if (pkt->largepkt)
++ kfree(pkt);
++ else
++ kmem_cache_free(pq->pkt_slab, pkt);
++free_pbc:
++ if (pbc_dma_addr)
++ dma_pool_free(pq->header_cache, pbc, pbc_dma_addr);
++ else
++ kfree(pbc);
++done:
++ return ret;
++}
++
++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt)
++{
++ struct qib_knx *knx = (struct qib_knx *)pkt->remote;
++ struct qib_knx_sdma *sdma = knx->sdma;
++ sdma_next(sdma, complete);
++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete);
++}
++
++static int qib_knx_sdma_poll(void *data)
++{
++ struct qib_knx *knx = (struct qib_knx *)data;
++ struct qib_knx_ctxt *ctxt;
++ struct qib_knx_sdma_desc desc;
++ struct qib_knx_sdma *sdma = knx->sdma;
++ struct qib_user_sdma_queue *pq;
++ struct list_head list;
++ u32 new_head;
++ int ret = 0, ndesc = 0, added;
++
++ if (!sdma)
++ return -EFAULT;
++
++ while (!kthread_should_stop()) {
++ added = 0;
++ new_head = QIB_KNX_SDMA_VALUE(sdma->hflags->trigger);
++ while (sdma->head != new_head) {
++ knx_sdma_next(sdma);
++ qib_knx_memcpy(&desc, sdma->queue + sdma->head,
++ sizeof(desc));
++ if (!desc.ctxt) {
++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL);
++ continue;
++ }
++ spin_lock(&knx->ctxt_lock);
++ ctxt = knx->ctxts[desc.ctxt];
++ if (!ctxt) {
++ /* we should never get here */
++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL);
++ goto done_unlock;
++ }
++ pq = ctxt->pq[desc.subctxt];
++ if (!pq) {
++ QIB_KNX_SDMA_STATUS(sdma, -EFAULT);
++ goto done_unlock;
++ }
++ mutex_lock(&pq->lock);
++ if (pq->added > ctxt->ppd->sdma_descq_removed)
++ qib_user_sdma_hwqueue_clean(ctxt->ppd);
++ if (pq->num_sending)
++ qib_user_sdma_queue_clean(ctxt->ppd, pq);
++
++ INIT_LIST_HEAD(&list);
++ ret = qib_knx_sdma_pkts_to_descs(ctxt, &desc, pq,
++ &ndesc, &list);
++ QIB_KNX_SDMA_STATUS(sdma, ret);
++ if (!list_empty(&list)) {
++ if (qib_sdma_descq_freecnt(ctxt->ppd) <
++ ndesc) {
++ qib_user_sdma_hwqueue_clean(
++ ctxt->ppd);
++ if (pq->num_sending)
++ qib_user_sdma_queue_clean(
++ ctxt->ppd, pq);
++ }
++ ret = qib_user_sdma_push_pkts(ctxt->ppd,
++ pq, &list, 1);
++ if (ret < 0)
++ goto free_pkts;
++ else {
++ pq->counter++;
++ added++;
++ }
++ }
++free_pkts:
++ if (!list_empty(&list))
++ qib_user_sdma_free_pkt_list(
++ &knx->dd->pcidev->dev, pq, &list);
++ mutex_unlock(&pq->lock);
++done_unlock:
++ spin_unlock(&knx->ctxt_lock);
++ }
++ if (!added) {
++ int i;
++ /*
++ * Push the queues along
++ * The polling thread will enter the inner loop only
++ * if the KNX has posted new descriptors to the queue.
++ * However, any packets that have been completed by
++ * the HW need to be cleaned and that won't happen
++ * unless we explicitly check.
++ */
++ for (i = 0;
++ i < knx->dd->ctxtcnt * QLOGIC_IB_MAX_SUBCTXT;
++ i++) {
++ int c = i / QLOGIC_IB_MAX_SUBCTXT,
++ s = i % QLOGIC_IB_MAX_SUBCTXT;
++ spin_lock(&knx->ctxt_lock);
++ ctxt = knx->ctxts[c];
++ if (!ctxt)
++ goto loop_unlock;
++ pq = ctxt->pq[s];
++ if (!pq)
++ goto loop_unlock;
++ mutex_lock(&pq->lock);
++ if (pq->num_sending)
++ qib_user_sdma_queue_clean(ctxt->ppd,
++ pq);
++ mutex_unlock(&pq->lock);
++loop_unlock:
++ spin_unlock(&knx->ctxt_lock);
++ }
++ might_sleep();
++ }
++ }
++ return ret;
++}
++
++void qib_knx_remove_device(struct qib_devdata *dd)
++{
++ if (server && dd->num_knx) {
++ struct qib_knx *knx, *knxp;
++ list_for_each_entry_safe(knx, knxp, &server->clients, list) {
++ if (knx->dd == dd) {
++ spin_lock(&server->client_lock);
++ list_del(&knx->list);
++ server->nclients--;
++ spin_unlock(&server->client_lock);
++ qib_knx_free(knx, 0);
++ kfree(knx);
++ }
++ }
++ }
++ return;
++}
++
++int __init qib_knx_server_init(void)
++{
++ server = kzalloc(sizeof(struct qib_knx_server), GFP_KERNEL);
++ if (!server)
++ return -ENOMEM;
++ INIT_LIST_HEAD(&server->clients);
++ spin_lock_init(&server->client_lock);
++ server->kthread = kthread_run(qib_knx_server_listen,
++ server, CLIENT_THREAD_NAME(0));
++ if (IS_ERR(server->kthread))
++ return PTR_ERR(server->kthread);
++ return 0;
++}
++
++void __exit qib_knx_server_exit(void)
++{
++ if (server) {
++ struct qib_knx *t, *tt;
++ /* Stop the thread so we don't accept any new connections. */
++ kthread_stop(server->kthread);
++ list_for_each_entry_safe(t, tt, &server->clients, list) {
++ spin_lock(&server->client_lock);
++ list_del(&t->list);
++ spin_unlock(&server->client_lock);
++ qib_knx_free(t, 1);
++ kfree(t);
++ }
++ kfree(server);
++ }
++}
+diff -urN a9/drivers/infiniband/hw/qib/qib_knx_common.h a10/drivers/infiniband/hw/qib/qib_knx_common.h
+--- a9/drivers/infiniband/hw/qib/qib_knx_common.h 1969-12-31 16:00:00.000000000 -0800
++++ a10/drivers/infiniband/hw/qib/qib_knx_common.h 2015-01-05 15:10:58.252446692 -0800
+@@ -0,0 +1,126 @@
++/*
++ * Copyright (c) 2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef _QIB_KNX_COMMON_H
++#define _QIB_KNX_COMMON_H
++
++struct qib_device_info {
++ u16 unit;
++};
++
++#define QIB_SDMA_MAX_NPAGES 33
++#define QIB_KNX_SDMA_VALUE(fld) (*(volatile u64 *)&(fld))
++#define QIB_KNX_SDMA_SET(fld, val) \
++ do { \
++ fld = (u64)(val); \
++ smp_mb(); \
++ } while (0)
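++/*
++ * Example usage: the host publishes completions with
++ * QIB_KNX_SDMA_SET(sdma->mflags->complete, ...) and polls the trigger with
++ * QIB_KNX_SDMA_VALUE(sdma->hflags->trigger); the smp_mb() keeps the store
++ * from being reordered past later accesses.
++ */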
++
++struct qib_knx_host_mem {
++ off_t flags_offset;
++ unsigned desc_num;
++};
++
++struct qib_knx_knc_mem {
++ off_t flags_offset;
++ off_t queue_offset;
++ size_t queue_len;
++};
++
++struct qib_tid_sm {
++ __u16 tid;
++ __u16 offset;
++ __u16 length;
++};
++
++/*
++ * SDMA transfer descriptor. This structure communicates the SDMA
++ * transfers from the MIC to the host. It is very important for
++ * performance reasons that its size is a multiple of 64B in order
++ * to guarantee proper alignment in the descriptor array.
++ */
++struct qib_knx_sdma_desc {
++ u16 ctxt;
++ u16 subctxt;
++ u32 pbclen;
++ __le32 pbc[16];
++ u64 length;
++ u32 npages;
++ unsigned tidlen;
++ off_t offset;
++ unsigned long pages[QIB_SDMA_MAX_NPAGES];
++ /* This array is 198B; the explicit 2B __padding0 below
++ * rounds it up to a multiple of 8B. */
++ struct qib_tid_sm tidsm[QIB_SDMA_MAX_NPAGES];
++ /*
++ * The two paddings below are included in order to
++ * make the size of the entire struct 576B (multiple
++ * of 64B). The goal is that all elements in an array
++ * of struct qib_knx_sdma_desc are 64B aligned.
++ */
++ u16 __padding0;
++ u64 __padding1[2];
++};
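++/*
++ * Illustrative size check (assuming 64-bit unsigned long/off_t):
++ * 8 (ctxt/subctxt/pbclen) + 64 (pbc) + 8 (length) + 8 (npages/tidlen) +
++ * 8 (offset) + 264 (pages) + 198 (tidsm) + 2 + 16 (paddings) = 576B.
++ */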
++
++/*
++ * The trigger, status, and complete fields are each padded to
++ * 8 u64s (64B) so that each one occupies a full cacheline.
++ */
++struct qib_knx_sdma_hflags {
++ u64 trigger;
++ u64 __padding[7];
++};
++
++#define sdma_next(s, fld) \
++ ((s)->fld = (((s)->fld + 1) == (s)->desc_num) ? 0 : ((s)->fld + 1))
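++/* e.g. with desc_num == 4, the index wraps 0, 1, 2, 3, 0, ... */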
++
++struct qib_knx_sdma_mflags {
++ u64 status;
++ u64 __padding1[7];
++ u64 complete;
++ u64 __padding2[7];
++};
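++/*
++ * Each flag plus its seven-u64 padding is 8 * 8B = 64B, i.e. one cacheline,
++ * so the host and the MIC polling different flags should not false-share.
++ */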
++
++struct qib_knx_tid_info {
++ /* This is the entire set of 512 entries (= 4K) so
++ * we can register it. Subctxt division will be done
++ * in the MIC driver. */
++ off_t tidbase_offset;
++ size_t tidbase_len;
++ u64 tidbase;
++ unsigned tidcnt;
++ u64 tidtemplate;
++ unsigned long invalidtid;
++ u64 bar_addr;
++ u64 bar_len;
++};
++
++#endif /* _QIB_KNX_COMMON_H */
+diff -urN a9/drivers/infiniband/hw/qib/qib_knx.h a10/drivers/infiniband/hw/qib/qib_knx.h
+--- a9/drivers/infiniband/hw/qib/qib_knx.h 1969-12-31 16:00:00.000000000 -0800
++++ a10/drivers/infiniband/hw/qib/qib_knx.h 2015-01-05 15:10:58.252446692 -0800
+@@ -0,0 +1,74 @@
++/*
++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses. You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ * Redistribution and use in source and binary forms, with or
++ * without modification, are permitted provided that the following
++ * conditions are met:
++ *
++ * - Redistributions of source code must retain the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above
++ * copyright notice, this list of conditions and the following
++ * disclaimer in the documentation and/or other materials
++ * provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++#ifndef _QIB_KNX_H
++#define _QIB_KNX_H
++
++#include "qib.h"
++
++enum qib_knx_ctxtinfo_type {
++ QIB_KNX_CTXTINFO_UREG,
++ QIB_KNX_CTXTINFO_PIOAVAIL,
++ QIB_KNX_CTXTINFO_STATUS,
++ QIB_KNX_CTXTINFO_PIOBUFBASE,
++ QIB_KNX_CTXTINFO_FLAGS
++};
++
++#ifdef QIB_CONFIG_KNX
++int __init qib_knx_server_init(void);
++void __exit qib_knx_server_exit(void);
++
++void qib_knx_remove_device(struct qib_devdata *);
++
++inline struct qib_knx *qib_knx_get(uint16_t);
++inline struct qib_devdata *qib_knx_node_to_dd(uint16_t);
++int qib_knx_alloc_ctxt(u16, unsigned);
++int qib_knx_setup_piobufs(struct qib_devdata *, struct qib_ctxtdata *, __u16);
++int qib_knx_setup_pioregs(struct qib_devdata *, struct qib_ctxtdata *,
++ struct qib_base_info *);
++int qib_knx_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *,
++ struct qib_base_info *);
++int qib_knx_setup_eagerbufs(struct qib_ctxtdata *, struct qib_base_info *);
++void qib_knx_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
++__u64 qib_knx_ctxt_info(struct qib_ctxtdata *, enum qib_knx_ctxtinfo_type,
++ struct file *);
++int qib_knx_sdma_queue_create(struct file *);
++void qib_knx_sdma_queue_destroy(struct qib_filedata *);
++#else
++static inline u64 qib_knx_ctxt_info(
++ struct qib_ctxtdata *rcd,
++ enum qib_knx_ctxtinfo_type type,
++ struct file *fp)
++{
++ return 0;
++}
++#endif
++#endif /* _QIB_KNX_H */
+diff -urN a9/drivers/infiniband/hw/qib/qib_user_sdma.c a10/drivers/infiniband/hw/qib/qib_user_sdma.c
+--- a9/drivers/infiniband/hw/qib/qib_user_sdma.c 2015-01-05 15:05:04.279461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib_user_sdma.c 2015-01-05 15:10:58.252446692 -0800
+@@ -63,80 +63,6 @@
+ pid_t pid;
+ };
+
+-struct qib_user_sdma_pkt {
+- struct list_head list; /* list element */
+-
+- u8 tiddma; /* if this is NEW tid-sdma */
+- u8 largepkt; /* this is large pkt from kmalloc */
+- u16 frag_size; /* frag size used by PSM */
+- u16 index; /* last header index or push index */
+- u16 naddr; /* dimension of addr (1..3) ... */
+- u16 addrlimit; /* addr array size */
+- u16 tidsmidx; /* current tidsm index */
+- u16 tidsmcount; /* tidsm array item count */
+- u16 payload_size; /* payload size so far for header */
+- u32 bytes_togo; /* bytes for processing */
+- u32 counter; /* sdma pkts queued counter for this entry */
+- struct qib_tid_session_member *tidsm; /* tid session member array */
+- struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */
+- u64 added; /* global descq number of entries */
+-
+- struct {
+- u16 offset; /* offset for kvaddr, addr */
+- u16 length; /* length in page */
+- u16 first_desc; /* first desc */
+- u16 last_desc; /* last desc */
+- u16 put_page; /* should we put_page? */
+- u16 dma_mapped; /* is page dma_mapped? */
+- u16 dma_length; /* for dma_unmap_page() */
+- u16 padding;
+- struct page *page; /* may be NULL (coherent mem) */
+- void *kvaddr; /* FIXME: only for pio hack */
+- dma_addr_t addr;
+- } addr[4]; /* max pages, any more and we coalesce */
+-};
+-
+-struct qib_user_sdma_queue {
+- /*
+- * pkts sent to dma engine are queued on this
+- * list head. the type of the elements of this
+- * list are struct qib_user_sdma_pkt...
+- */
+- struct list_head sent;
+-
+- /*
+- * Because above list will be accessed by both process and
+- * signal handler, we need a spinlock for it.
+- */
+- spinlock_t sent_lock ____cacheline_aligned_in_smp;
+-
+- /* headers with expected length are allocated from here... */
+- char header_cache_name[64];
+- struct dma_pool *header_cache;
+-
+- /* packets are allocated from the slab cache... */
+- char pkt_slab_name[64];
+- struct kmem_cache *pkt_slab;
+-
+- /* as packets go on the queued queue, they are counted... */
+- u32 counter;
+- u32 sent_counter;
+- /* pending packets, not sending yet */
+- u32 num_pending;
+- /* sending packets, not complete yet */
+- u32 num_sending;
+- /* global descq number of entry of last sending packet */
+- u64 added;
+-
+- /* dma page table */
+- struct rb_root dma_pages_root;
+-
+- struct qib_user_sdma_rb_node *sdma_rb_node;
+-
+- /* protect everything above... */
+- struct mutex lock;
+-};
+-
+ static struct qib_user_sdma_rb_node *
+ qib_user_sdma_rb_search(struct rb_root *root, pid_t pid)
+ {
+@@ -254,12 +180,12 @@
+ return pq;
+ }
+
+-static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
+- int i, u16 offset, u16 len,
+- u16 first_desc, u16 last_desc,
+- u16 put_page, u16 dma_mapped,
+- struct page *page, void *kvaddr,
+- dma_addr_t dma_addr, u16 dma_length)
++void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
++ int i, u16 offset, u16 len,
++ u16 first_desc, u16 last_desc,
++ u16 put_page, u16 dma_mapped,
++ struct page *page, void *kvaddr,
++ dma_addr_t dma_addr, u16 dma_length)
+ {
+ pkt->addr[i].offset = offset;
+ pkt->addr[i].length = len;
+@@ -273,7 +199,7 @@
+ pkt->addr[i].dma_length = dma_length;
+ }
+
+-static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
+ size_t len, dma_addr_t *dma_addr)
+ {
+ void *hdr;
+@@ -295,11 +221,11 @@
+ return hdr;
+ }
+
+-static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
+- struct qib_user_sdma_queue *pq,
+- struct qib_user_sdma_pkt *pkt,
+- struct page *page, u16 put,
+- u16 offset, u16 len, void *kvaddr)
++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
++ struct qib_user_sdma_queue *pq,
++ struct qib_user_sdma_pkt *pkt,
++ struct page *page, u16 put,
++ u16 offset, u16 len, void *kvaddr)
+ {
+ __le16 *pbc16;
+ void *pbcvaddr;
+@@ -314,21 +240,27 @@
+ int ret = 0;
+
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+- /*
+- * dma mapping error, pkt has not managed
+- * this page yet, return the page here so
+- * the caller can ignore this page.
+- */
+- if (put) {
+- put_page(page);
+- } else {
+- /* coalesce case */
+- kunmap(page);
+- __free_page(page);
++#ifdef QIB_CONFIG_KNX
++ if (!pkt->remote) {
++#endif
++ /*
++ * dma mapping error, pkt has not managed
++ * this page yet, return the page here so
++ * the caller can ignore this page.
++ */
++ if (put) {
++ put_page(page);
++ } else {
++ /* coalesce case */
++ kunmap(page);
++ __free_page(page);
++ }
++ ret = -ENOMEM;
++ goto done;
+ }
+- ret = -ENOMEM;
+- goto done;
++#ifdef QIB_CONFIG_KNX
+ }
++#endif
+ offset = 0;
+ dma_mapped = 1;
+
+@@ -630,13 +562,19 @@
+ pkt->addr[i].dma_length,
+ DMA_TO_DEVICE);
+
+- if (pkt->addr[i].kvaddr)
+- kunmap(pkt->addr[i].page);
++#ifdef QIB_CONFIG_KNX
++ if (!pkt->remote) {
++#endif
++ if (pkt->addr[i].kvaddr)
++ kunmap(pkt->addr[i].page);
+
+- if (pkt->addr[i].put_page)
+- put_page(pkt->addr[i].page);
+- else
+- __free_page(pkt->addr[i].page);
++ if (pkt->addr[i].put_page)
++ put_page(pkt->addr[i].page);
++ else
++ __free_page(pkt->addr[i].page);
++#ifdef QIB_CONFIG_KNX
++ }
++#endif
+ } else if (pkt->addr[i].kvaddr) {
+ /* for headers */
+ if (pkt->addr[i].dma_mapped) {
+@@ -775,9 +713,9 @@
+ }
+
+ /* free a packet list -- return counter value of last packet */
+-static void qib_user_sdma_free_pkt_list(struct device *dev,
+- struct qib_user_sdma_queue *pq,
+- struct list_head *list)
++void qib_user_sdma_free_pkt_list(struct device *dev,
++ struct qib_user_sdma_queue *pq,
++ struct list_head *list)
+ {
+ struct qib_user_sdma_pkt *pkt, *pkt_next;
+
+@@ -787,6 +725,10 @@
+ for (i = 0; i < pkt->naddr; i++)
+ qib_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
++#ifdef QIB_CONFIG_KNX
++ if (pkt->remote)
++ qib_knx_sdma_free_pkt(pkt);
++#endif
+ if (pkt->largepkt)
+ kfree(pkt);
+ else
+@@ -970,6 +912,9 @@
+ pkt->payload_size = 0;
+ pkt->counter = counter;
+ pkt->tiddma = tiddma;
++#ifdef QIB_CONFIG_KNX
++ pkt->remote = 0;
++#endif
+
+ /* setup the first header */
+ qib_user_sdma_init_frag(pkt, 0, /* index */
+@@ -1045,8 +990,8 @@
+ }
+
+ /* try to clean out queue -- needs pq->lock */
+-static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
+- struct qib_user_sdma_queue *pq)
++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
++ struct qib_user_sdma_queue *pq)
+ {
+ struct qib_devdata *dd = ppd->dd;
+ struct list_head free_list;
+@@ -1110,7 +1055,7 @@
+ }
+
+ /* clean descriptor queue, returns > 0 if some elements cleaned */
+-static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd)
++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd)
+ {
+ int ret;
+ unsigned long flags;
+@@ -1321,9 +1266,9 @@
+ }
+
+ /* pq->lock must be held, get packets on the wire... */
+-static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
+- struct qib_user_sdma_queue *pq,
+- struct list_head *pktlist, int count)
++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
++ struct qib_user_sdma_queue *pq,
++ struct list_head *pktlist, int count)
+ {
+ unsigned long flags;
+
+diff -urN a9/drivers/infiniband/hw/qib/qib_user_sdma.h a10/drivers/infiniband/hw/qib/qib_user_sdma.h
+--- a9/drivers/infiniband/hw/qib/qib_user_sdma.h 2015-01-05 15:05:04.280461602 -0800
++++ a10/drivers/infiniband/hw/qib/qib_user_sdma.h 2015-01-05 15:10:58.253446692 -0800
+@@ -31,12 +31,108 @@
+ */
+ #include <linux/device.h>
+
+-struct qib_user_sdma_queue;
++struct qib_user_sdma_pkt {
++ struct list_head list; /* list element */
++
++ u8 tiddma; /* if this is NEW tid-sdma */
++ u8 largepkt; /* this is large pkt from kmalloc */
++ u16 frag_size; /* frag size used by PSM */
++ u16 index; /* last header index or push index */
++ u16 naddr; /* dimension of addr (1..3) ... */
++ u16 addrlimit; /* addr array size */
++ u16 tidsmidx; /* current tidsm index */
++ u16 tidsmcount; /* tidsm array item count */
++ u16 payload_size; /* payload size so far for header */
++ u32 bytes_togo; /* bytes for processing */
++ u32 counter; /* sdma pkts queued counter for this entry */
++ struct qib_tid_session_member *tidsm; /* tid session member array */
++ struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */
++ u64 added; /* global descq number of entries */
++#ifdef QIB_CONFIG_KNX
++ u64 remote; /* non-zero (qib_knx pointer) if the packet originates on the MIC */
++#endif
++
++ struct {
++ u16 offset; /* offset for kvaddr, addr */
++ u16 length; /* length in page */
++ u16 first_desc; /* first desc */
++ u16 last_desc; /* last desc */
++ u16 put_page; /* should we put_page? */
++ u16 dma_mapped; /* is page dma_mapped? */
++ u16 dma_length; /* for dma_unmap_page() */
++ u16 padding;
++ struct page *page; /* may be NULL (coherent mem) */
++ void *kvaddr; /* FIXME: only for pio hack */
++ dma_addr_t addr;
++ } addr[4]; /* max pages, any more and we coalesce */
++};
++
++struct qib_user_sdma_queue {
++ /*
++ * pkts sent to dma engine are queued on this
++ * list head. the type of the elements of this
++ * list are struct qib_user_sdma_pkt...
++ */
++ struct list_head sent;
++
++ /*
++ * Because above list will be accessed by both process and
++ * signal handler, we need a spinlock for it.
++ */
++ spinlock_t sent_lock ____cacheline_aligned_in_smp;
++
++ /* headers with expected length are allocated from here... */
++ char header_cache_name[64];
++ struct dma_pool *header_cache;
++
++ /* packets are allocated from the slab cache... */
++ char pkt_slab_name[64];
++ struct kmem_cache *pkt_slab;
++
++ /* as packets go on the queued queue, they are counted... */
++ u32 counter;
++ u32 sent_counter;
++ /* pending packets, not sending yet */
++ u32 num_pending;
++ /* sending packets, not complete yet */
++ u32 num_sending;
++ /* global descq number of entry of last sending packet */
++ u64 added;
++
++ /* dma page table */
++ struct rb_root dma_pages_root;
++
++ struct qib_user_sdma_rb_node *sdma_rb_node;
++
++ /* protect everything above... */
++ struct mutex lock;
++};
+
+ struct qib_user_sdma_queue *
+ qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+ void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq);
+-
++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
++ size_t len, dma_addr_t *dma_addr);
++void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
++ int i, u16 offset, u16 len,
++ u16 first_desc, u16 last_desc,
++ u16 put_page, u16 dma_mapped,
++ struct page *page, void *kvaddr,
++ dma_addr_t dma_addr, u16 dma_length);
++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
++ struct qib_user_sdma_queue *pq,
++ struct qib_user_sdma_pkt *pkt,
++ struct page *page, u16 put,
++ u16 offset, u16 len, void *kvaddr);
++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd);
++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
++ struct qib_user_sdma_queue *pq);
++void qib_user_sdma_free_pkt_list(struct device *dev,
++ struct qib_user_sdma_queue *pq,
++ struct list_head *list);
++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
++ struct qib_user_sdma_queue *pq,
++ struct list_head *pktlist, int count);
+ int qib_user_sdma_writev(struct qib_ctxtdata *pd,
+ struct qib_user_sdma_queue *pq,
+ const struct iovec *iov,
+@@ -50,3 +146,9 @@
+
+ u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq);
+ u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq);
++
++/*
++ * This function prototype somewhat pollutes this header file
++ * but I don't want to create a new header file just for it.
++ */
++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt);
--- /dev/null
+From 536a8d5b5c68ecd2ca73446f25443fe8bb234a46 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Thu, 29 May 2014 14:35:13 -0700
+Subject: [PATCH 11/12] correct ib_addr.h for older kernels
+
+Signed-off-by: Phil Cayton <phil.cayton@intel.com>
+---
+diff -urN a10/include/rdma/ib_addr.h a11/include/rdma/ib_addr.h
+--- a10/include/rdma/ib_addr.h 2015-01-05 15:10:42.263447365 -0800
++++ a11/include/rdma/ib_addr.h 2015-01-05 15:12:36.058442572 -0800
+@@ -239,6 +239,27 @@
+ return 0;
+ }
+
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
++static inline int iboe_get_rate(struct net_device *dev)
++{
++ struct ethtool_cmd cmd;
++
++ if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings ||
++ dev->ethtool_ops->get_settings(dev, &cmd))
++ return IB_RATE_PORT_CURRENT;
++
++ if (cmd.speed >= 40000)
++ return IB_RATE_40_GBPS;
++ else if (cmd.speed >= 30000)
++ return IB_RATE_30_GBPS;
++ else if (cmd.speed >= 20000)
++ return IB_RATE_20_GBPS;
++ else if (cmd.speed >= 10000)
++ return IB_RATE_10_GBPS;
++ else
++ return IB_RATE_PORT_CURRENT;
++}
++#else
+ static inline int iboe_get_rate(struct net_device *dev)
+ {
+ struct ethtool_cmd cmd;
+@@ -263,6 +284,7 @@
+ else
+ return IB_RATE_PORT_CURRENT;
+ }
++#endif
+
+ static inline int rdma_link_local_addr(struct in6_addr *addr)
+ {
--- /dev/null
+From 6d88a748ca017a22c08d25e29144dd392c988eb9 Mon Sep 17 00:00:00 2001
+From: Phil Cayton <phil.cayton@intel.com>
+Date: Thu, 5 Jun 2014 09:44:42 -0700
+Subject: [PATCH 12/12] add mlx4 cq_comp locking, as already done in the event handler
+
+---
+diff -urN a11/drivers/net/ethernet/mellanox/mlx4/cq.c a12/drivers/net/ethernet/mellanox/mlx4/cq.c
+--- a11/drivers/net/ethernet/mellanox/mlx4/cq.c 2015-01-05 15:12:24.028443079 -0800
++++ a12/drivers/net/ethernet/mellanox/mlx4/cq.c 2015-01-05 15:14:27.994437857 -0800
+@@ -54,10 +54,17 @@
+
+ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
+ {
++ struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
+ struct mlx4_cq *cq;
+
+- cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
+- cqn & (dev->caps.num_cqs - 1));
++ spin_lock(&cq_table->lock);
++
++ cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
++ if (cq)
++ atomic_inc(&cq->refcount);
++
++ spin_unlock(&cq_table->lock);
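++ /*
++ * Holding a reference (taken under cq_table->lock, as mlx4_cq_event()
++ * already does) keeps the CQ from going away while cq->comp() runs;
++ * the complete(&cq->free) below then lets the presumed waiter in
++ * mlx4_cq_free() proceed once the last reference is dropped.
++ */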
++
+ if (!cq) {
+ mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
+ return;
+@@ -66,6 +73,9 @@
+ ++cq->arm_sn;
+
+ cq->comp(cq);
++
++ if (atomic_dec_and_test(&cq->refcount))
++ complete(&cq->free);
+ }
+
+ void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)