From: Vladimir Sokolovsky Date: Tue, 3 Mar 2015 09:10:26 +0000 (+0200) Subject: Added XEON Phi X-Git-Tag: vofed-3.18~32 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=5e4a77ee6847ce1f94296947e3051ee2409fe27d;p=~emulex%2Ffor-vlad%2Fcompat-rdma.git Added XEON Phi Signed-off-by: Phil Cayton Signed-off-by: Vladimir Sokolovsky --- diff --git a/ofed_scripts/ofed-mic b/ofed_scripts/ofed-mic index 238b46d..9132e70 100755 --- a/ofed_scripts/ofed-mic +++ b/ofed_scripts/ofed-mic @@ -60,6 +60,7 @@ foreach_card() names=(${host}-${card} ${card} ${host}-${card}.${domn}) else eval $_failure + errors+=1 continue fi diff --git a/tech-preview/xeon-phi/0001-ib_core-add-mic-node-and-scif-transport-types.patch b/tech-preview/xeon-phi/0001-ib_core-add-mic-node-and-scif-transport-types.patch new file mode 100644 index 0000000..c3b2c55 --- /dev/null +++ b/tech-preview/xeon-phi/0001-ib_core-add-mic-node-and-scif-transport-types.patch @@ -0,0 +1,82 @@ +From c01faf2a8053f8968b9bac84a4cbd54a9952d472 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Tue, 21 Jan 2014 08:59:29 -0800 +Subject: [PATCH 01/12] ib_core add mic node and scif transport types + +The OFED SCIF driver implements a software-emulated RDMA device to allow OFED +based applications, such as Intel MPI, to run on Intel(R) MIC Architecture +without the presence of a physical HCA. OFED SCIF is only targeted for inter- +node communication within a single platform, where a node is a coprocessor +or the host processor. This patch adds new node and transport types to the +ib_core kernel module to distinguish this new RDMA interface type. +--- +diff -urN a0/drivers/infiniband/core/sysfs.c a1/drivers/infiniband/core/sysfs.c +--- a0/drivers/infiniband/core/sysfs.c 2015-01-05 13:35:35.692687746 -0800 ++++ a1/drivers/infiniband/core/sysfs.c 2015-01-05 13:46:38.792659814 -0800 +@@ -253,6 +253,8 @@ + return sprintf(buf, "%s\n", "InfiniBand"); + case IB_LINK_LAYER_ETHERNET: + return sprintf(buf, "%s\n", "Ethernet"); ++ case IB_LINK_LAYER_SCIF: ++ return sprintf(buf, "%s\n", "SCIF"); + default: + return sprintf(buf, "%s\n", "Unknown"); + } +@@ -623,6 +625,7 @@ + case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); + case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); ++ case RDMA_NODE_MIC: return sprintf(buf, "%d: MIC\n", dev->node_type); + default: return sprintf(buf, "%d: \n", dev->node_type); + } + } +diff -urN a0/drivers/infiniband/core/verbs.c a1/drivers/infiniband/core/verbs.c +--- a0/drivers/infiniband/core/verbs.c 2015-01-05 13:35:35.693687746 -0800 ++++ a1/drivers/infiniband/core/verbs.c 2015-01-05 13:49:08.470653509 -0800 +@@ -121,6 +121,8 @@ + return RDMA_TRANSPORT_USNIC; + case RDMA_NODE_USNIC_UDP: + return RDMA_TRANSPORT_USNIC_UDP; ++ case RDMA_NODE_MIC: ++ return RDMA_TRANSPORT_SCIF; + default: + BUG(); + return 0; +@@ -140,6 +142,8 @@ + case RDMA_TRANSPORT_USNIC: + case RDMA_TRANSPORT_USNIC_UDP: + return IB_LINK_LAYER_ETHERNET; ++ case RDMA_TRANSPORT_SCIF: ++ return IB_LINK_LAYER_SCIF; + default: + return IB_LINK_LAYER_UNSPECIFIED; + } +diff -urN a0/include/rdma/ib_verbs.h a1/include/rdma/ib_verbs.h +--- a0/include/rdma/ib_verbs.h 2015-01-05 13:45:40.299662278 -0800 ++++ a1/include/rdma/ib_verbs.h 2015-01-05 13:50:57.590648913 -0800 +@@ -75,13 +75,15 @@ + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, + RDMA_NODE_USNIC_UDP, ++ RDMA_NODE_MIC, + }; + + enum rdma_transport_type { + RDMA_TRANSPORT_IB, + 
RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_USNIC, +- RDMA_TRANSPORT_USNIC_UDP ++ RDMA_TRANSPORT_USNIC_UDP, ++ RDMA_TRANSPORT_SCIF, + }; + + __attribute_const__ enum rdma_transport_type +@@ -91,6 +93,7 @@ + IB_LINK_LAYER_UNSPECIFIED, + IB_LINK_LAYER_INFINIBAND, + IB_LINK_LAYER_ETHERNET, ++ IB_LINK_LAYER_SCIF + }; + + enum ib_device_cap_flags { +Binary files a0/include/rdma/.ib_verbs.h.rej.swp and a1/include/rdma/.ib_verbs.h.rej.swp differ diff --git a/tech-preview/xeon-phi/0002-rdma_cm-add-mic-node-and-scif-transport-types.patch b/tech-preview/xeon-phi/0002-rdma_cm-add-mic-node-and-scif-transport-types.patch new file mode 100644 index 0000000..27e90b2 --- /dev/null +++ b/tech-preview/xeon-phi/0002-rdma_cm-add-mic-node-and-scif-transport-types.patch @@ -0,0 +1,117 @@ +From faf3b3f931806d4f044068c4e9b2ca4482a9177a Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Tue, 3 Jun 2014 09:50:57 -0700 +Subject: [PATCH 02/12] rdma_cm add mic node and scif transport types + +The OFED SCIF driver can leverage the iWARP cm calls to establish connections. +This patch utilizes the new node and transport types in the rdma cm to call +the underlying driver as needed. +--- +diff -urN a1/drivers/infiniband/core/cma.c a2/drivers/infiniband/core/cma.c +--- a1/drivers/infiniband/core/cma.c 2015-01-05 13:46:27.953660271 -0800 ++++ a2/drivers/infiniband/core/cma.c 2015-01-05 14:05:11.897612926 -0800 +@@ -747,6 +747,7 @@ + qp_attr->rq_psn = id_priv->seq_num; + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + if (!id_priv->cm_id.iw) { + qp_attr->qp_access_flags = 0; + *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; +@@ -1043,6 +1044,7 @@ + ib_destroy_cm_id(id_priv->cm_id.ib); + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + if (id_priv->cm_id.iw) + iw_destroy_cm_id(id_priv->cm_id.iw); + break; +@@ -1994,6 +1996,7 @@ + } + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = cma_resolve_iw_route(id_priv, timeout_ms); + break; + default: +@@ -2184,6 +2187,25 @@ + return ret; + } + ++static int cma_resolve_scif(struct rdma_id_private *id_priv) ++{ ++ struct cma_work *work; ++ ++ work = kzalloc(sizeof *work, GFP_KERNEL); ++ if (!work) ++ return -ENOMEM; ++ ++ /* we probably can leave it empty here */ ++ ++ work->id = id_priv; ++ INIT_WORK(&work->work, cma_work_handler); ++ work->old_state = RDMA_CM_ADDR_QUERY; ++ work->new_state = RDMA_CM_ADDR_RESOLVED; ++ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; ++ queue_work(cma_wq, &work->work); ++ return 0; ++} ++ + static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, + struct sockaddr *dst_addr) + { +@@ -2225,9 +2247,12 @@ + if (cma_any_addr(dst_addr)) { + ret = cma_resolve_loopback(id_priv); + } else { +- if (dst_addr->sa_family == AF_IB) { ++ if (dst_addr->sa_family == AF_IB) + ret = cma_resolve_ib_addr(id_priv); +- } else { ++ else if ((id_priv->id.device != NULL) && ++ (rdma_node_get_transport(id_priv->id.device->node_type) == RDMA_TRANSPORT_SCIF)) ++ ret = cma_resolve_scif(id_priv); ++ else { + ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + dst_addr, &id->route.addr.dev_addr, + timeout_ms, addr_handler, id_priv); +@@ -2598,6 +2623,7 @@ + goto err; + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; +@@ -2946,6 +2972,7 @@ + ret = cma_connect_ib(id_priv, conn_param); + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = cma_connect_iw(id_priv, conn_param); + break; + default: +@@ 
-3073,6 +3100,7 @@ + } + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = cma_accept_iw(id_priv, conn_param); + break; + default: +@@ -3133,6 +3161,7 @@ + 0, private_data, private_data_len); + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = iw_cm_reject(id_priv->cm_id.iw, + private_data, private_data_len); + break; +@@ -3163,6 +3192,7 @@ + ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + break; + case RDMA_TRANSPORT_IWARP: ++ case RDMA_TRANSPORT_SCIF: + ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); + break; + default: diff --git a/tech-preview/xeon-phi/0003-add-context-based-udata-support.patch b/tech-preview/xeon-phi/0003-add-context-based-udata-support.patch new file mode 100644 index 0000000..113a3e9 --- /dev/null +++ b/tech-preview/xeon-phi/0003-add-context-based-udata-support.patch @@ -0,0 +1,100 @@ +From 2ddd9c09050d6f74a2ea9e3e21a76510bbdff155 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Thu, 6 Feb 2014 14:23:36 -0800 +Subject: [PATCH 03/12] add context based udata support + +Normally the copy_to_user and copy_from_user calls are used to access vendor +private data when allocating resources from processes. However, when the +processes are running on MIC, this communication is proxied to the host kernel +via SCIF. This patch allows setup of context-based udata access routines. +--- +diff -urN a2/drivers/infiniband/core/uverbs_cmd.c a3/drivers/infiniband/core/uverbs_cmd.c +--- a2/drivers/infiniband/core/uverbs_cmd.c 2015-01-05 13:59:55.217626266 -0800 ++++ a3/drivers/infiniband/core/uverbs_cmd.c 2015-01-05 14:30:40.647548530 -0800 +@@ -57,6 +57,21 @@ + static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; + static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; + ++static int uverbs_copy_from_udata(void *dst, struct ib_udata *udata, size_t len) ++{ ++ return copy_from_user(dst, udata->inbuf, len) ? -EFAULT : 0; ++} ++ ++static int uverbs_copy_to_udata(struct ib_udata *udata, void *src, size_t len) ++{ ++ return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; ++} ++ ++struct ib_udata_ops uverbs_copy = { ++ .copy_from = uverbs_copy_from_udata, ++ .copy_to = uverbs_copy_to_udata ++}; ++ + /* + * The ib_uobject locking scheme is as follows: + * +@@ -330,6 +345,7 @@ + goto err; + } + ++ ucontext->umem_ops = NULL; + ucontext->device = ibdev; + INIT_LIST_HEAD(&ucontext->pd_list); + INIT_LIST_HEAD(&ucontext->mr_list); +Binary files a2/drivers/infiniband/core/.uverbs_cmd.c.rej.swp and a3/drivers/infiniband/core/.uverbs_cmd.c.rej.swp differ +diff -urN a2/drivers/infiniband/core/uverbs.h a3/drivers/infiniband/core/uverbs.h +--- a2/drivers/infiniband/core/uverbs.h 2015-01-05 13:59:55.216626266 -0800 ++++ a3/drivers/infiniband/core/uverbs.h 2015-01-05 14:29:27.559551609 -0800 +@@ -47,8 +47,11 @@ + #include + #include + ++extern struct ib_udata_ops uverbs_copy; ++ + #define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ ++ (udata)->ops = &uverbs_copy; \ + (udata)->inbuf = (const void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ +@@ -57,6 +60,7 @@ + + #define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \ + do { \ ++ (udata)->ops = &uverbs_copy; \ + (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \ + (udata)->outbuf = (olen) ? 
(void __user *) (obuf) : NULL; \ + (udata)->inlen = (ilen); \ +diff -urN a2/include/rdma/ib_verbs.h a3/include/rdma/ib_verbs.h +--- a2/include/rdma/ib_verbs.h 2015-01-05 13:59:55.219626266 -0800 ++++ a3/include/rdma/ib_verbs.h 2015-01-05 14:18:48.871578512 -0800 +@@ -1147,7 +1147,14 @@ + int live; + }; + ++struct ib_udata; ++struct ib_udata_ops { ++ int (*copy_from)(void *dest, struct ib_udata *udata, size_t len); ++ int (*copy_to)(struct ib_udata *udata, void *src, size_t len); ++}; ++ + struct ib_udata { ++ struct ib_udata_ops *ops; + const void __user *inbuf; + void __user *outbuf; + size_t inlen; +@@ -1664,12 +1671,12 @@ + + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) + { +- return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; ++ return udata->ops->copy_from(dest, udata, len); + } + + static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) + { +- return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; ++ return udata->ops->copy_to(udata, src, len); + } + + /** diff --git a/tech-preview/xeon-phi/0004-add-context-based-umem-support.patch b/tech-preview/xeon-phi/0004-add-context-based-umem-support.patch new file mode 100644 index 0000000..41970be --- /dev/null +++ b/tech-preview/xeon-phi/0004-add-context-based-umem-support.patch @@ -0,0 +1,353 @@ +From 8b06f1090da0e12c6012d0d13d8b48c69640a6a7 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Thu, 6 Feb 2014 14:08:02 -0800 +Subject: [PATCH 04/12] add context based umem support + +The ib_umem_get routine calls get_user_pages to pin pages and create the +ib_umem structure. Memory on MIC, however, must be mapped through SCIF for +access across PCI. This patch allows setup of context-based ib_umem mapping +routines. + +Also update mthca to support these changes +--- +diff -urN a3/drivers/infiniband/core/umem.c a4/drivers/infiniband/core/umem.c +--- a3/drivers/infiniband/core/umem.c 2015-01-05 14:12:52.117593540 -0800 ++++ a4/drivers/infiniband/core/umem.c 2015-01-05 14:41:51.927520253 -0800 +@@ -57,6 +57,10 @@ + for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { + + page = sg_page(sg); ++ ++ if (!pfn_valid(page_to_pfn(page))) ++ continue; ++ + if (umem->writable && dirty) + set_page_dirty_lock(page); + put_page(page); +@@ -68,14 +72,71 @@ + } + + /** +- * ib_umem_get - Pin and DMA map userspace memory. ++ * get_remap_pages() - get pages remapped to user virtual space ++ * @mm: mm struct of target mm ++ * @start: starting user address ++ * @nr_pages: number of pages to lookup ++ * @write flag to verify if vma is writable ++ * @pages: array that receives pointers to the pages. Should ++ * be at least nr_pages long. Or NULL, if caller only ++ * intends to ensure the pages are valid. ++ * @vmas: array of pointers to vmas corresponding to each page. ++ * Or NULL if the caller does not require them. ++ * ++ * Pages may be system ram or io space mmapped to user virtual ++ * space via remap_pfn_range or io_remap_page_range, respectively. ++ * ++ * Returns number of pages found, which may be less than the number ++ * requested. Returns 0 if nr_pages is 0. ++ * ++ * Must be called with mmap_sem held for read or write. ++ */ ++static long get_remap_pages(struct mm_struct *mm, unsigned long start, ++ unsigned long nr_pages, int write, ++ struct page **pages, struct vm_area_struct **vmas) ++{ ++ struct vm_area_struct *vma; ++ unsigned long pfn; ++ long i = 0; ++ int ret; ++ ++ while (nr_pages) { ++ if (!(vma = find_vma(mm, start))) ++ return i ? 
: -EFAULT; ++ if (write && !(vma->vm_flags & VM_WRITE)) ++ return i ? : -EFAULT; ++ ++ do { ++ ret = follow_pfn(vma, start, &pfn); ++ if (ret) ++ return i ? : ret; ++ ++ if (pages) { ++ pages[i] = pfn_to_page(pfn); ++ if (pfn_valid(pfn)) ++ get_page(pages[i]); ++ } ++ if (vmas) ++ vmas[i] = vma; ++ ++ start += PAGE_SIZE; ++ nr_pages--; ++ i++; ++ } while (nr_pages && start < vma->vm_end); ++ } ++ ++ return i; ++} ++ ++/** ++ * ib_get_umem - Pin and DMA map userspace memory. + * @context: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmasync: flush in-flight DMA when the memory region is written + */ +-struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ++struct ib_umem *ib_get_umem(struct ib_ucontext *context, unsigned long addr, + size_t size, int access, int dmasync) + { + struct ib_umem *umem; +@@ -101,7 +162,6 @@ + if (!umem) + return ERR_PTR(-ENOMEM); + +- umem->context = context; + umem->length = size; + umem->offset = addr & ~PAGE_MASK; + umem->page_size = PAGE_SIZE; +@@ -163,11 +223,18 @@ + sg_list_start = umem->sg_head.sgl; + + while (npages) { ++ + ret = get_user_pages(current, current->mm, cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof (struct page *)), + 1, !umem->writable, page_list, vma_list); + ++ if (ret == -EFAULT) /* may be a remapped area; try again */ ++ ret = get_remap_pages(current->mm, cur_base, ++ min_t(unsigned long, npages, ++ PAGE_SIZE / sizeof (struct page *)), ++ !umem->writable, page_list, vma_list); ++ + if (ret < 0) + goto out; + +@@ -219,7 +286,6 @@ + + return ret < 0 ? ERR_PTR(ret) : umem; + } +-EXPORT_SYMBOL(ib_umem_get); + + static void ib_umem_account(struct work_struct *work) + { +@@ -237,10 +303,10 @@ + } + + /** +- * ib_umem_release - release memory pinned with ib_umem_get ++ * ib_release_umem - release memory pinned with ib_umem_get + * @umem: umem struct to release + */ +-void ib_umem_release(struct ib_umem *umem) ++void ib_release_umem(struct ib_umem *umem) + { + struct ib_ucontext *context = umem->context; + struct mm_struct *mm; +@@ -290,9 +356,8 @@ + out: + kfree(umem); + } +-EXPORT_SYMBOL(ib_umem_release); + +-int ib_umem_page_count(struct ib_umem *umem) ++int ib_page_count_umem(struct ib_umem *umem) + { + int shift; + int i; +@@ -307,4 +372,40 @@ + + return n; + } ++ ++struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, ++ size_t size, int access, int dmasync) ++{ ++ struct ib_umem_ops *ops = context->umem_ops; ++ struct ib_umem *umem; ++ ++ umem = (ops && ops->get) ? ++ ops->get(context, addr, size, access, dmasync) : ++ ib_get_umem(context, addr, size, access, dmasync); ++ ++ if (!IS_ERR(umem)) ++ umem->context = context; ++ ++ return umem; ++} ++EXPORT_SYMBOL(ib_umem_get); ++ ++void ib_umem_release(struct ib_umem *umem) ++{ ++ struct ib_umem_ops *ops = umem->context->umem_ops; ++ ++ if (ops && ops->release) ++ ops->release(umem); ++ else ++ ib_release_umem(umem); ++} ++EXPORT_SYMBOL(ib_umem_release); ++ ++int ib_umem_page_count(struct ib_umem *umem) ++{ ++ struct ib_umem_ops *ops = umem->context->umem_ops; ++ ++ return (ops && ops->page_count) ? 
++ ops->page_count(umem) : ib_page_count_umem(umem); ++} + EXPORT_SYMBOL(ib_umem_page_count); +diff -urN a3/drivers/infiniband/hw/mthca/mthca_memfree.c a4/drivers/infiniband/hw/mthca/mthca_memfree.c +--- a3/drivers/infiniband/hw/mthca/mthca_memfree.c 2015-01-05 14:12:52.112593540 -0800 ++++ a4/drivers/infiniband/hw/mthca/mthca_memfree.c 2015-01-05 14:36:00.825535043 -0800 +@@ -39,6 +39,12 @@ + + #include + ++/* Must use the ib_umem routines to support the IB proxy server. */ ++#define MTHCA_IB_UMEM ++#ifdef MTHCA_IB_UMEM ++#include ++#endif ++ + #include "mthca_memfree.h" + #include "mthca_dev.h" + #include "mthca_cmd.h" +@@ -56,7 +62,11 @@ + struct mutex mutex; + struct { + u64 uvirt; ++#ifdef MTHCA_IB_UMEM ++ struct ib_umem *umem; ++#else + struct scatterlist mem; ++#endif + int refcount; + } page[0]; + }; +@@ -446,7 +456,12 @@ + int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index, u64 uaddr) + { ++#ifdef MTHCA_IB_UMEM ++ struct mthca_ucontext *context; ++ struct ib_umem_chunk *chunk; ++#else + struct page *pages[1]; ++#endif + int ret = 0; + int i; + +@@ -472,6 +487,22 @@ + goto out; + } + ++#ifdef MTHCA_IB_UMEM ++ context = container_of(uar, struct mthca_ucontext, uar); ++ ++ db_tab->page[i].umem = ib_umem_get(&context->ibucontext, ++ uaddr & PAGE_MASK, PAGE_SIZE, 0, 0); ++ if (IS_ERR(db_tab->page[i].umem)) { ++ ret = PTR_ERR(db_tab->page[i].umem); ++ goto out; ++ } ++ ++ chunk = list_entry(db_tab->page[i].umem->chunk_list.next, ++ struct ib_umem_chunk, list); ++ ++ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&chunk->page_list[0]), ++ mthca_uarc_virt(dev, uar, i)); ++#else + ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0, + pages, NULL); + if (ret < 0) +@@ -488,9 +519,14 @@ + + ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem), + mthca_uarc_virt(dev, uar, i)); ++#endif + if (ret) { ++#ifdef MTHCA_IB_UMEM ++ ib_umem_release(db_tab->page[i].umem); ++#else + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); ++#endif + goto out; + } + +@@ -505,6 +541,9 @@ + void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar, + struct mthca_user_db_table *db_tab, int index) + { ++#ifdef MTHCA_IB_UMEM ++ int i; ++#endif + if (!mthca_is_memfree(dev)) + return; + +@@ -515,7 +554,16 @@ + + mutex_lock(&db_tab->mutex); + ++#ifdef MTHCA_IB_UMEM ++ i = index / MTHCA_DB_REC_PER_PAGE; ++ if (!--db_tab->page[i].refcount) { ++ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); ++ ib_umem_release(db_tab->page[i].umem); ++ db_tab->page[i].uvirt = 0; ++ } ++#else + --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount; ++#endif + + mutex_unlock(&db_tab->mutex); + } +@@ -538,7 +586,11 @@ + for (i = 0; i < npages; ++i) { + db_tab->page[i].refcount = 0; + db_tab->page[i].uvirt = 0; ++#ifdef MTHCA_IB_UMEM ++ db_tab->page[i].umem = NULL; ++#else + sg_init_table(&db_tab->page[i].mem, 1); ++#endif + } + + return db_tab; +@@ -555,8 +607,12 @@ + for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) { + if (db_tab->page[i].uvirt) { + mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); ++#ifdef MTHCA_IB_UMEM ++ ib_umem_release(db_tab->page[i].umem); ++#else + pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); + put_page(sg_page(&db_tab->page[i].mem)); ++#endif + } + } + +diff -urN a3/include/rdma/ib_verbs.h a4/include/rdma/ib_verbs.h +--- a3/include/rdma/ib_verbs.h 2015-01-05 14:18:48.871578512 -0800 ++++ 
a4/include/rdma/ib_verbs.h 2015-01-05 14:36:00.826535043 -0800 +@@ -1122,7 +1122,18 @@ + u8 page_shift; + }; + ++struct ib_ucontext; ++struct ib_umem_ops { ++ struct ib_umem *(*get)(struct ib_ucontext *context, ++ unsigned long addr, size_t size, ++ int access, int dmasync); ++ void (*release)(struct ib_umem *umem); ++ int (*page_count)(struct ib_umem *umem); ++}; ++ + struct ib_ucontext { ++ struct ib_umem_ops *umem_ops; /* set to NULL for default ops */ ++ void *umem_private_data; + struct ib_device *device; + struct list_head pd_list; + struct list_head mr_list; diff --git a/tech-preview/xeon-phi/0005-allow-mic-ipoib-qp-creation.patch b/tech-preview/xeon-phi/0005-allow-mic-ipoib-qp-creation.patch new file mode 100644 index 0000000..7c97357 --- /dev/null +++ b/tech-preview/xeon-phi/0005-allow-mic-ipoib-qp-creation.patch @@ -0,0 +1,94 @@ +From 8e3cff460efe00954b4c99ea23e42527c234c3f9 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Tue, 4 Feb 2014 12:22:38 -0800 +Subject: [PATCH 05/12] allow mic ipoib qp creation + +From the host point of view, each MIC kernel appears as a "user-mode process" +to allow address translation to access the correct coprocessor mapped across +PCI. To enable the IPoIB driver in MIC kernel, some QP creation flags must +be checked regardless of whether the call originates from kernel or user +space. Because these create_flags cannot be set by normal user-mode calls +through ib_uverbs, moving the check is not an issue. This patch allows the +IPoIB driver on MIC to create QPs correctly. +--- +diff -urN a4/drivers/infiniband/hw/mlx4/qp.c a5/drivers/infiniband/hw/mlx4/qp.c +--- a4/drivers/infiniband/hw/mlx4/qp.c 2015-01-05 14:35:38.055536002 -0800 ++++ a5/drivers/infiniband/hw/mlx4/qp.c 2015-01-08 09:50:29.971123797 -0800 +@@ -692,6 +692,12 @@ + + qp->mlx4_ib_qp_type = qp_type; + ++ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) ++ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; ++ ++ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) ++ qp->flags |= MLX4_IB_QP_LSO; ++ + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); +@@ -744,13 +750,7 @@ + } + } else { + qp->sq_no_prefetch = 0; +- +- if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) +- qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; +- +- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) +- qp->flags |= MLX4_IB_QP_LSO; +- ++/* + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { + if (dev->steering_support == + MLX4_STEERING_MODE_DEVICE_MANAGED) +@@ -758,7 +758,7 @@ + else + goto err; + } +- ++*/ + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; +@@ -1060,6 +1060,7 @@ + + gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ? + GFP_NOIO : GFP_KERNEL; ++#if 0 /* Removed to allow Xeon Phi's use of ib_ipoib via CCL-Direct (ibp) */ + /* + * We only support LSO, vendor flag1, and multicast loopback blocking, + * and only for kernel UD QPs. 
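/*
 * Editorial sketch, not part of the patch above: with the create_flags
 * handling moved into create_qp_common(), a UD QP requested on behalf of the
 * MIC IPoIB driver may carry the multicast-loopback-blocking and UD LSO
 * flags even when the request arrives through the proxied (udata) path.
 * The helper name and capability sizes below are hypothetical placeholders.
 */
#include <rdma/ib_verbs.h>

static struct ib_qp *example_create_ipoib_ud_qp(struct ib_pd *pd,
						struct ib_cq *send_cq,
						struct ib_cq *recv_cq)
{
	struct ib_qp_init_attr init_attr = {
		.send_cq      = send_cq,
		.recv_cq      = recv_cq,
		.cap          = {
			.max_send_wr  = 64,	/* placeholder queue depths */
			.max_recv_wr  = 256,
			.max_send_sge = 1,
			.max_recv_sge = 1,
		},
		.sq_sig_type  = IB_SIGNAL_ALL_WR,
		.qp_type      = IB_QPT_UD,
		.create_flags = IB_QP_CREATE_IPOIB_UD_LSO |
				IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
	};

	return ib_create_qp(pd, &init_attr);	/* ERR_PTR on failure */
}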
+@@ -1084,6 +1085,7 @@ + ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) + return ERR_PTR(-EINVAL); ++#endif /* if 0 */ + + switch (init_attr->qp_type) { + case IB_QPT_XRC_TGT: +@@ -1120,9 +1122,11 @@ + case IB_QPT_SMI: + case IB_QPT_GSI: + { ++#if 0 /* Removed to allow Xeon Phi's use of ib_ipoib via CCL-Direct (ibp) */ + /* Userspace is not allowed to create special QPs: */ + if (udata) + return ERR_PTR(-EINVAL); ++#endif /* if 0 */ + + err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, + get_sqp_num(to_mdev(pd->device), init_attr), +diff -urN a4/drivers/infiniband/hw/mlx5/qp.c a5/drivers/infiniband/hw/mlx5/qp.c +--- a4/drivers/infiniband/hw/mlx5/qp.c 2015-01-05 14:35:38.065536002 -0800 ++++ a5/drivers/infiniband/hw/mlx5/qp.c 2015-01-05 14:46:41.322508063 -0800 +@@ -852,6 +852,9 @@ + } + + if (pd) { ++ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) ++ qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; ++ + if (pd->uobject) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); + if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || diff --git a/tech-preview/xeon-phi/0006-add-scif.h-to-the-include-directory-matching-the-loc.patch b/tech-preview/xeon-phi/0006-add-scif.h-to-the-include-directory-matching-the-loc.patch new file mode 100644 index 0000000..ca0529c --- /dev/null +++ b/tech-preview/xeon-phi/0006-add-scif.h-to-the-include-directory-matching-the-loc.patch @@ -0,0 +1,1760 @@ +From 129a1e301d8567b8d79abe19fd2d998738951cda Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Tue, 4 Feb 2014 12:23:56 -0800 +Subject: [PATCH 06/12] add scif.h to the include directory matching the + location that is in the mpss installation + +Signed-off-by: Phil Cayton +--- +diff -urN a5/include/modules/scif.h a6/include/modules/scif.h +--- a5/include/modules/scif.h 1969-12-31 16:00:00.000000000 -0800 ++++ a6/include/modules/scif.h 2015-01-05 14:59:07.370476637 -0800 +@@ -0,0 +1,1748 @@ ++/* ++ * Copyright 2010-2013 Intel Corporation. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License, version 2, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Disclaimer: The codes contained in these modules may be specific to ++ * the Intel Software Development Platform codenamed Knights Ferry, ++ * and the Intel product codenamed Knights Corner, and are not backward ++ * compatible with other Intel products. Additionally, Intel will NOT ++ * support the codes or instruction set in future products. ++ * ++ * Intel offers no warranty of any kind regarding the code. This code is ++ * licensed on an "AS IS" basis and Intel is not obligated to provide ++ * any support, assistance, installation, training, or other services ++ * of any kind. Intel is also not obligated to provide any updates, ++ * enhancements or extensions. Intel specifically disclaims any warranty ++ * of merchantability, non-infringement, fitness for any particular ++ * purpose, and any other warranty. 
++ * ++ * Further, Intel disclaims all liability of any kind, including but ++ * not limited to liability for infringement of any proprietary rights, ++ * relating to the use of the code, even if Intel is notified of the ++ * possibility of such liability. Except as expressly stated in an Intel ++ * license agreement provided with this code and agreed upon with Intel, ++ * no license, express or implied, by estoppel or otherwise, to any ++ * intellectual property rights is granted herein. ++ */ ++ ++/* ++ * Revised 15:05 11/24/2010 ++ * Derived from SCIF SAS v0.41 with additional corrections ++ */ ++ ++#ifndef __SCIF_H__ ++#define __SCIF_H__ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef __cplusplus ++extern "C" { ++#endif ++ ++#define SCIF_ACCEPT_SYNC 1 ++#define SCIF_SEND_BLOCK 1 ++#define SCIF_RECV_BLOCK 1 ++ ++/** ++ * The purpose of SCIF_VERSION is to check for compatibility between host and ++ * card SCIF modules. This version should be incremented whenever any changes ++ * are made to the SCIF driver code that is common to both card and the host. ++ * Whenever this version is incremented, SCIF_LIB_VERSION in user mode libscif ++ * scif.h file should be incremented and vice versa. Both the versions should ++ * always match. ++ */ ++#define SCIF_VERSION 1 ++ ++/* Start: Deprecated Temporary definition for compatability */ ++#define ACCEPT_SYNC SCIF_ACCEPT_SYNC ++#define SEND_BLOCK SCIF_SEND_BLOCK ++#define RECV_BLOCK SCIF_RECV_BLOCK ++/* End: Deprecated Temporary definition for compatability */ ++ ++enum { ++ SCIF_PROT_READ = (1<<0), ++ SCIF_PROT_WRITE = (1<<1) ++}; ++ ++enum { ++ SCIF_MAP_FIXED = 0x10, ++ SCIF_MAP_KERNEL = 0x20 ++}; ++ ++enum { ++ SCIF_FENCE_INIT_SELF = (1<<0), ++ SCIF_FENCE_INIT_PEER = (1<<1) ++}; ++ ++enum { ++ SCIF_FENCE_RAS_SELF = (1<<2), ++ SCIF_FENCE_RAS_PEER = (1<<3) ++}; ++ ++enum { ++ SCIF_SIGNAL_LOCAL = (1<<4), ++ SCIF_SIGNAL_REMOTE = (1<<5) ++}; ++ ++#define SCIF_RMA_USECPU 1 ++#define SCIF_RMA_USECACHE (1<<1) ++#define SCIF_RMA_SYNC (1<<2) ++#define SCIF_RMA_ORDERED (1<<3) ++//! 
@cond (Prevent doxygen from including these) ++#define SCIF_POLLIN POLLIN ++#define SCIF_POLLOUT POLLOUT ++#define SCIF_POLLERR POLLERR ++#define SCIF_POLLHUP POLLHUP ++#define SCIF_POLLNVAL POLLNVAL ++ ++/* SCIF Reserved Ports */ ++/* COI */ ++#define SCIF_COI_PORT_0 40 ++#define SCIF_COI_PORT_1 41 ++#define SCIF_COI_PORT_2 42 ++#define SCIF_COI_PORT_3 43 ++#define SCIF_COI_PORT_4 44 ++#define SCIF_COI_PORT_5 45 ++#define SCIF_COI_PORT_6 46 ++#define SCIF_COI_PORT_7 47 ++#define SCIF_COI_PORT_8 48 ++#define SCIF_COI_PORT_9 49 ++ ++/* OFED */ ++#define SCIF_OFED_PORT_0 60 ++#define SCIF_OFED_PORT_1 61 ++#define SCIF_OFED_PORT_2 62 ++#define SCIF_OFED_PORT_3 63 ++#define SCIF_OFED_PORT_4 64 ++#define SCIF_OFED_PORT_5 65 ++#define SCIF_OFED_PORT_6 66 ++#define SCIF_OFED_PORT_7 67 ++#define SCIF_OFED_PORT_8 68 ++#define SCIF_OFED_PORT_9 69 ++ ++/* NETDEV */ ++#define SCIF_NETDEV_PORT_0 80 ++#define SCIF_NETDEV_PORT_1 81 ++#define SCIF_NETDEV_PORT_2 82 ++#define SCIF_NETDEV_PORT_3 83 ++#define SCIF_NETDEV_PORT_4 84 ++#define SCIF_NETDEV_PORT_5 85 ++#define SCIF_NETDEV_PORT_6 86 ++#define SCIF_NETDEV_PORT_7 87 ++#define SCIF_NETDEV_PORT_8 88 ++#define SCIF_NETDEV_PORT_9 89 ++ ++/* RAS */ ++#define SCIF_RAS_PORT_0 100 ++#define SCIF_RAS_PORT_1 101 ++#define SCIF_RAS_PORT_2 102 ++#define SCIF_RAS_PORT_3 103 ++#define SCIF_RAS_PORT_4 104 ++#define SCIF_RAS_PORT_5 105 ++#define SCIF_RAS_PORT_6 106 ++#define SCIF_RAS_PORT_7 107 ++#define SCIF_RAS_PORT_8 108 ++#define SCIF_RAS_PORT_9 109 ++ ++/* Power Management */ ++#define SCIF_PM_PORT_0 120 ++#define SCIF_PM_PORT_1 121 ++#define SCIF_PM_PORT_2 122 ++#define SCIF_PM_PORT_3 123 ++#define SCIF_PM_PORT_4 124 ++#define SCIF_PM_PORT_5 125 ++#define SCIF_PM_PORT_6 126 ++#define SCIF_PM_PORT_7 127 ++#define SCIF_PM_PORT_8 128 ++#define SCIF_PM_PORT_9 129 ++ ++/* Board Tools */ ++#define SCIF_BT_PORT_0 130 ++#define SCIF_BT_PORT_1 131 ++#define SCIF_BT_PORT_2 132 ++#define SCIF_BT_PORT_3 133 ++#define SCIF_BT_PORT_4 134 ++#define SCIF_BT_PORT_5 135 ++#define SCIF_BT_PORT_6 136 ++#define SCIF_BT_PORT_7 137 ++#define SCIF_BT_PORT_8 138 ++#define SCIF_BT_PORT_9 139 ++ ++/* MIC Boot/Configuration support */ ++#define MPSSD_DOWNLOAD 160 ++#define MIC_NOTIFY 161 ++ ++#define SCIF_ADMIN_PORT_END 1024 ++ ++/* MYO */ ++#define SCIF_MYO_PORT_0 1025 ++#define SCIF_MYO_PORT_1 1026 ++#define SCIF_MYO_PORT_2 1027 ++#define SCIF_MYO_PORT_3 1028 ++#define SCIF_MYO_PORT_4 1029 ++#define SCIF_MYO_PORT_5 1030 ++#define SCIF_MYO_PORT_6 1031 ++#define SCIF_MYO_PORT_7 1032 ++#define SCIF_MYO_PORT_8 1033 ++#define SCIF_MYO_PORT_9 1034 ++ ++/* SSG Tools */ ++#define SCIF_ST_PORT_0 1044 ++#define SCIF_ST_PORT_1 1045 ++#define SCIF_ST_PORT_2 1046 ++#define SCIF_ST_PORT_3 1047 ++#define SCIF_ST_PORT_4 1048 ++#define SCIF_ST_PORT_5 1049 ++#define SCIF_ST_PORT_6 1050 ++#define SCIF_ST_PORT_7 1051 ++#define SCIF_ST_PORT_8 1052 ++#define SCIF_ST_PORT_9 1053 ++ ++/* End of SCIF Reserved Ports */ ++#define SCIF_PORT_RSVD 1088 ++//! @endcond ++ ++typedef struct endpt *scif_epd_t; ++ ++typedef struct scif_pinned_pages *scif_pinned_pages_t; ++ ++struct scif_range { ++ void *cookie; /* cookie */ ++ int nr_pages; /* Number of Pages */ ++ int prot_flags; /* R/W protection */ ++ /* Arrays phys_addr/va below are virtually contiguous */ ++ dma_addr_t *phys_addr; /* Array of physical addresses */ ++ void **va; /* Array of virtual addresses ++ * and populated only when called ++ * on the host for a remote SCIF ++ * connection on MIC. 
++ */ ++}; ++ ++struct scif_pollepd { ++ scif_epd_t epd; /* endpoint descriptor */ ++ short events; /* requested events */ ++ short revents; /* returned events */ ++}; ++enum scif_event_type { ++ SCIF_NODE_ADDED = 1<<0, ++ SCIF_NODE_REMOVED = 1<<1 ++}; ++ ++union eventd { ++ uint16_t scif_node_added; ++ uint16_t scif_node_removed; ++}; ++ ++typedef void (*scif_callback_t)(enum scif_event_type event, union eventd ++data); ++ ++struct scif_callback { ++ struct list_head list_member; ++ scif_callback_t callback_handler; ++}; ++ ++#define SCIF_OPEN_FAILED ((scif_epd_t)-1) ++#define SCIF_REGISTER_FAILED ((off_t)-1) ++#define SCIF_MMAP_FAILED ((void *)-1) ++ ++struct scif_portID { ++ uint16_t node; /* node on which port resides */ ++ uint16_t port; /* Local port number */ ++}; ++ ++/* Start: Deprecated Temporary definition for compatability */ ++#define portID scif_portID ++typedef struct portID portID_t; ++/* End: Deprecated Temporary definition for compatability */ ++ ++/** ++ * scif_open - Create an endpoint ++ * ++ *\return ++ * The scif_open() function creates a new endpoint. ++ * ++ * Upon successful completion, scif_open() returns an endpoint descriptor to ++ * be used in subsequent SCIF functions calls to refer to that endpoint; ++ * otherwise: in user mode SCIF_OPEN_FAILED (that is ((scif_epd_t)-1)) is ++ * returned and errno is set to indicate the error; in kernel mode a NULL ++ * scif_epd_t is returned. ++ * ++ *\par Errors: ++ *- ENOMEM ++ * - Insufficient kernel memory was available. ++ *- ENXIO ++ * - Version mismatch between micscif driver and libscif. ++ */ ++scif_epd_t scif_open(void); ++ ++/** ++ * scif _bind - Bind an endpoint to a port ++ * \param epd endpoint descriptor ++ * \param pn port number ++ * ++ * scif_bind() binds endpoint epd to port pn, where pn is a port number on the ++ * local node. If pn is zero, a port number greater than or equal to ++ * SCIF_PORT_RSVD is assigned and returned. Each endpoint may be bound to ++ * exactly one local port. Ports less than 1024 when requested can only be bound ++ * by system (or root) processes or by processes executed by privileged users. ++ * ++ *\return ++ * Upon successful completion, scif_bind() returns the port number to which epd ++ * is bound; otherwise: in user mode -1 is returned and errno is set to ++ * indicate the error; in kernel mode the negative of one of the following ++ * errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - The endpoint or the port are already bound. ++ *- EISCONN ++ * - The endpoint is already connected. ++ *- ENOSPC ++ * - No port number available for assignment (when pn==0). ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- EACCES ++ * - The port requested is protected and the user is not the superuser. ++*/ ++int scif_bind(scif_epd_t epd, uint16_t pn); ++ ++/** ++ * scif_listen - Listen for connections on an endpoint ++ * ++ * \param epd endpoint descriptor ++ * \param backlog maximum pending connection requests ++ * ++ * scif_listen() marks the endpoint epd as a listening endpoint - that is, as ++ * an endpoint that will be used to accept incoming connection requests. Once ++ * so marked, the endpoint is said to be in the listening state and may not be ++ * used as the endpoint of a connection. ++ * ++ * The endpoint, epd, must have been bound to a port. 
++ * ++ * The backlog argument defines the maximum length to which the queue of ++ * pending connections for epd may grow. If a connection request arrives when ++ * the queue is full, the client may receive an error with an indication that ++ * the connection was refused. ++ * ++ *\return ++ * Upon successful completion, scif_listen() returns 0; otherwise: in user mode ++ * -1 is returned and errno is set to indicate the error; in kernel mode the ++ * negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - The endpoint is not bound to a port ++ *- EISCONN ++ * - The endpoint is already connected or listening ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++*/ ++int scif_listen(scif_epd_t epd, int backlog); ++ ++/** ++ * scif_connect - Initiate a connection on a port ++ * \param epd endpoint descriptor ++ * \param dst global id of port to which to connect ++ * ++ * The scif_connect() function requests the connection of endpoint epd to remote ++ * port dst. If the connection is successful, a peer endpoint, bound to dst, is ++ * created on node dst.node. On successful return, the connection is complete. ++ * ++ * If the endpoint epd has not already been bound to a port, scif_connect() ++ * will bind it to an unused local port. ++ * ++ * A connection is terminated when an endpoint of the connection is closed, ++ * either explicitly by scif_close(), or when a process that owns one of the ++ * endpoints of a connection is terminated. ++ * ++ *\return ++ * Upon successful completion, scif_connect() returns the port ID to which the ++ * endpoint, epd, is bound; otherwise: in user mode -1 is returned and errno is ++ * set to indicate the error; in kernel mode the negative of one of the ++ * following errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNREFUSED ++ * - The destination was not listening for connections or refused the ++ * connection request. ++ *- EINTR ++ * - Interrupted function ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - dst.port is not a valid port ID ++ *- EISCONN ++ * - The endpoint is already connected ++ *- ENOBUFS ++ * - No buffer space is available ++ *- ENODEV ++ * - The destination node does not exist, or ++ * - The node is lost. ++ *- ENOSPC ++ * - No port number available for assignment (when pn==0). ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- EOPNOTSUPP ++ * - The endpoint is listening and cannot be connected ++*/ ++int scif_connect(scif_epd_t epd, struct scif_portID *dst); ++ ++/** ++ * scif_accept - Accept a connection on an endpoint ++ * \param epd endpoint descriptor ++ * \param peer global id of port to which connected ++ * \param newepd new connected endpoint descriptor ++ * \param flags flags ++ * ++ * The scif_accept() call extracts the first connection request on the queue of ++ * pending connections for the port on which epd is listening. scif_accept() ++ * creates a new endpoint, bound to the same port as epd, and allocates a new ++ * SCIF endpoint descriptor, returned in newepd, for the endpoint. The new ++ * endpoint is connected to the endpoint through which the connection was ++ * requested. epd is unaffected by this call, and remains in the listening ++ * state. 
++ * ++ * On successful return, peer holds the global port identifier (node id and ++ * local port number) of the port which requested the connection. ++ * ++ * If the peer endpoint which requested the connection is closed, the endpoint ++ * returned by scif_accept() is closed. ++ * ++ * The number of connections that can (subsequently) be accepted on epd is only ++ * limited by system resources (memory). ++ * ++ * The flags argument is formed by OR'ing together zero or more of the ++ * following values: ++ *- SCIF_ACCEPT_SYNC: block until a connection request is presented. If ++ * SCIF_ACCEPT_SYNC is not in flags, and no pending ++ * connections are present on the queue, scif_accept()fails ++ * with an EAGAIN error ++ * ++ * On Linux in user mode, the select() and poll() functions can be used to ++ * determine when there is a connection request. On Microsoft Windows* and on ++ * Linux in kernel mode, the scif_poll() function may be used for this purpose. ++ * A readable event will be delivered when a connection is requested. ++ * ++ *\return ++ * Upon successful completion, scif_accept() returns 0; otherwise: in user mode ++ * -1 is returned and errno is set to indicate the error; in kernel mode the ++ * negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EAGAIN ++ * - SCIF_ACCEPT_SYNC is not set and no connections are present to be accepted, or ++ * - SCIF_ACCEPT_SYNC is not set and remote node failed to complete its ++ * connection request ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- EINTR ++ * - Interrupted function ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - epd is not a listening endpoint ++ * - flags is invalid ++ * - peer is NULL ++ * - newepd is NULL ++ *- ENOBUFS ++ * - No buffer space is available ++ *- ENODEV ++ * - The requesting node is lost. ++ *- ENOMEM ++ * - Not enough space ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENOENT ++ * - Secondary part of epd registeration failed. ++*/ ++int scif_accept(scif_epd_t epd, struct scif_portID *peer, scif_epd_t ++*newepd, int flags); ++ ++/** ++ * scif_close - Close an endpoint ++ * \param epd endpoint descriptor ++ * ++ * scif_close() closes an endpoint and performs necessary teardown of ++ * facilities associated with that endpoint. ++ * ++ * If epd is a listening endpoint then it will no longer accept connection ++ * requests on the port to which it is bound. Any pending connection requests ++ * are rejected. ++ * ++ * If epd is a connected endpoint, then its peer endpoint is also closed. RMAs ++ * which are in-process through epd or its peer endpoint will complete before ++ * scif_close() returns. Registered windows of the local and peer endpoints are ++ * released as if scif_unregister() was called against each window. ++ * ++ * Closing an endpoint does not affect mappings to remote memory. These remain ++ * until explicitly removed by calling scif_munmap(). ++ * ++ * If the peer endpoint's receive queue is not empty at the time that epd is ++ * closed, then the peer endpoint can be passed as the endpoint parameter to ++ * scif_recv() until the receive queue is empty. ++ * ++ * If epd is bound to a port, then the port is returned to the pool of ++ * available ports. ++ * ++ * epd is freed and may no longer be accessed. 
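/*
 * Editorial sketch (not part of the scif.h header being added): the
 * connection life cycle documented by scif_open()/scif_bind()/scif_listen()/
 * scif_connect()/scif_accept()/scif_close() above, using the user-mode
 * failure conventions.  The port number is arbitrary; node 0 denotes the
 * host in SCIF node numbering.
 */
#define EXAMPLE_PORT 2000

static int example_server(void)			/* listening side */
{
	scif_epd_t lep, cep;
	struct scif_portID peer;

	lep = scif_open();
	if (lep == SCIF_OPEN_FAILED)
		return -1;
	if (scif_bind(lep, EXAMPLE_PORT) < 0 ||
	    scif_listen(lep, 1) < 0 ||
	    scif_accept(lep, &peer, &cep, SCIF_ACCEPT_SYNC) < 0) {
		scif_close(lep);
		return -1;
	}
	/* ... exchange messages on cep ... */
	scif_close(cep);
	return scif_close(lep);
}

static int example_client(uint16_t node)	/* connecting side */
{
	struct scif_portID dst = { .node = node, .port = EXAMPLE_PORT };
	scif_epd_t epd = scif_open();

	if (epd == SCIF_OPEN_FAILED)
		return -1;
	if (scif_connect(epd, &dst) < 0) {
		scif_close(epd);
		return -1;
	}
	/* ... exchange messages on epd ... */
	return scif_close(epd);
}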
++ * ++ *\return ++ * Upon successful completion, scif_close() returns 0; otherwise: in user mode ++ * -1 is returned and errno is set to indicate the error; in kernel mode the ++ * negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor ++ */ ++int scif_close(scif_epd_t epd); ++ ++/** ++ * scif_send - Send a message ++ * \param epd endpoint descriptor ++ * \param msg message buffer address ++ * \param len message length ++ * \param flags blocking mode flags ++ * ++ * scif_send() sends data to the peer of endpoint epd. Up to len bytes of data ++ * are copied from memory starting at address msg. On successful execution the ++ * return value of scif_send() is the number of bytes that were sent, and is ++ * zero if no bytes were sent because len was zero. scif_send() may be called ++ * only when the endpoint is in a connected state. ++ * ++ * If a scif_send() call is non-blocking, then it sends only those bytes which ++ * can be sent without waiting, up to a maximum of len bytes. ++ * ++ * If a scif_send() call is blocking, then it normally returns after sending ++ * all len bytes. If a blocking call is interrupted or the connection is ++ * forcibly closed, the call is considered successful if some bytes were sent ++ * or len is zero, otherwise the call is considered unsuccessful. ++ * ++ * On Linux in user mode, the select() and poll() functions can be used to ++ * determine when the send queue is not full. On Microsoft Windows* and on ++ * Linux in kernel mode, the scif_poll() function may be used for this purpose. ++ * ++ * It is recommended that scif_send()/scif_recv() only be used for short ++ * control-type message communication between SCIF endpoints. The SCIF RMA ++ * APIs are expected to provide better performance for transfer sizes of ++ * 1024 bytes or longer. ++ * ++ * The flags argument is formed by ORing together zero or more of the following ++ * values: ++ *- SCIF_SEND_BLOCK: block until the entire message is sent. ++ * ++ *\return ++ * Upon successful completion, scif_send() returns the number of bytes sent; ++ * otherwise: in user mode -1 is returned and errno is set to indicate the ++ * error; in kernel mode the negative of one of the following errors is ++ * returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EFAULT ++ * - An invalid address was specified for a parameter. ++ *- EINTR ++ * - epd was closed by scif_close() ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - flags is invalid ++ * - len is negative ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOMEM ++ * - Not enough space ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ */ ++int scif_send(scif_epd_t epd, void *msg, int len, int flags); ++ ++/** ++ * scif_recv - Receive a message ++ * \param epd endpoint descriptor ++ * \param msg message buffer address ++ * \param len message buffer length ++ * \param flags blocking mode flags ++ * ++ * scif_recv() receives data from the peer of endpoint epd. Up to len bytes of ++ * data are copied to memory starting at address msg. On successful execution ++ * the return value of scif_recv() is the number of bytes that were received, ++ * and is zero if no bytes were received because len was zero. 
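/*
 * Editorial sketch: the blocking send/receive pattern recommended above for
 * short control-type messages.  Message contents and sizes are arbitrary.
 */
static int example_ping(scif_epd_t epd)
{
	char req[64] = "ping", rsp[64];
	int ret;

	ret = scif_send(epd, req, sizeof(req), SCIF_SEND_BLOCK);
	if (ret < 0)
		return ret;	/* kernel mode: -errno; user mode: -1 with errno set */

	ret = scif_recv(epd, rsp, sizeof(rsp), SCIF_RECV_BLOCK);
	return ret < 0 ? ret : 0;
}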
scif_recv() may ++ * be called only when the endpoint is in a connected state. ++ * ++ * If a scif_recv() call is non-blocking, then it receives only those bytes ++ * which can be received without waiting, up to a maximum of len bytes. ++ * ++ * If a scif_recv() call is blocking, then it normally returns after receiving ++ * all len bytes. If a blocking call is interrupted or the connection is ++ * forcibly closed, the call is considered successful if some bytes were ++ * received or len is zero, otherwise the call is considered unsuccessful; ++ * subsequent calls to scif_recv() will successfully receive all data sent ++ * through peer endpoint interruption or the connection was forcibly closed. ++ * ++ * On Linux in user mode, the select() and poll() functions can be used to ++ * determine when data is available to be received. On Microsoft Windows* and ++ * on Linux in kernel mode, the scif_poll() function may be used for this ++ * purpose. ++ * ++ * It is recommended that scif_send()/scif_recv() only be used for short ++ * control-type message communication between SCIF endpoints. The SCIF RMA ++ * APIs are expected to provide better performance for transfer sizes of ++ * 1024 bytes or longer. ++ * ++ * The flags argument is formed by ORing together zero or more of the following ++ * values: ++ *- SCIF_RECV_BLOCK: block until the entire message is received. ++ * ++ *\return ++ * Upon successful completion, scif_recv() returns the number of bytes ++ * received; otherwise: in user mode -1 is returned and errno is set to ++ * indicate the error; in kernel mode the negative of one of the following ++ * errors is returned. ++ * ++ *\par Errors: ++ *- EAGAIN ++ * - The destination node is returning from a low power state. ++ *- EBADF ++ * - epd is not a valid endpoint descriptor . ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EFAULT ++ * - An invalid address was specified for a parameter. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - flags is invalid, or ++ * - len is negative. ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOMEM ++ * - Not enough space. ++ *- ENOTCONN ++ * - The endpoint is not connected. ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ */ ++int scif_recv(scif_epd_t epd, void *msg, int len, int flags); ++ ++/** ++ * scif_register - Mark a memory region for remote access. ++ * \param epd endpoint descriptor ++ * \param addr starting virtual address ++ * \param len length of range ++ * \param offset offset of window ++ * \param prot_flags read/write protection flags ++ * \param map_flags mapping flags ++ * ++ * The scif_register() function opens a window, a range of whole pages of the ++ * registered address space of the endpoint epd, starting at offset po and ++ * continuing for len bytes. The value of po, further described below, is a ++ * function of the parameters offset and len, and the value of map_flags. Each ++ * page of the window represents the physical memory page which backs the ++ * corresponding page of the range of virtual address pages starting at addr ++ * and continuing for len bytes. addr and len are constrained to be multiples ++ * of the page size. addr is interpreted as a user space address. A successful ++ * scif_register() call returns po as the return value. ++ * ++ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset ++ * exactly, and offset is constrained to be a multiple of the page size. 
The ++ * mapping established by scif_register() will not replace any existing ++ * registration; an error is returned if any page within the range [offset, ++ * offset+len-1] intersects an existing window. ++ * Note: When SCIF_MAP_FIXED is set the current implementation limits ++ * offset to the range [0..2^62-1] and returns EADDRINUSE if the offset ++ * requested with SCIF_MAP_FIXED is in the range [2^62..2^63-1]. ++ * ++ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an ++ * implementation-defined manner to arrive at po. The po value so chosen will ++ * be an area of the registered address space that the implementation deems ++ * suitable for a mapping of len bytes. An offset value of 0 is interpreted as ++ * granting the implementation complete freedom in selecting po, subject to ++ * constraints described below. A non-zero value of offset is taken to be a ++ * suggestion of an offset near which the mapping should be placed. When the ++ * implementation selects a value for po, it does not replace any extant ++ * window. In all cases, po will be a multiple of the page size. ++ * ++ * The physical pages which are so represented by a window are available for ++ * access in calls to scif_mmap(), scif_readfrom(), scif_writeto(), ++ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the ++ * physical pages represented by the window will not be reused by the memory ++ * subsystem for any other purpose. Note that the same physical page may be ++ * represented by multiple windows. ++ * ++ * Subsequent operations which change the memory pages to which virtual ++ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and ++ * scif_munmap()) have no effect on existing windows. ++ * ++ * On Linux, if the process will fork(), it is recommended that the registered ++ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent ++ * problems due to copy-on-write semantics. ++ * ++ * The prot_flags argument is formed by OR'ing together one or more of the ++ * following values: ++ *- SCIF_PROT_READ: allow read operations from the window ++ *- SCIF_PROT_WRITE: allow write operations to the window ++ * ++ * The map_flags argument is formed by OR'ing together zero or more of ++ * the following values: ++ *- SCIF_MAP_FIXED: interpret offset exactly ++ * ++ *\return ++ * Upon successful completion, scif_register() returns the offset at which the ++ * mapping was placed (po); otherwise: in user mode SCIF_REGISTER_FAILED (that ++ * is (off_t *)-1) is returned and errno is set to indicate the error; in ++ * kernel mode the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EADDRINUSE ++ * - SCIF_MAP_FIXED is set in map_flags, and pages in the range [offset, ++ * offset+len-1] are already registered ++ *- EAGAIN ++ * - The mapping could not be performed due to lack of resources ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EFAULT ++ * - Addresses in the range [addr , addr + len - 1] are invalid ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - map_flags is invalid, or ++ * - prot_flags is invalid, or ++ * - SCIF_MAP_FIXED is set in flags, and offset is not a multiple of ++ * the page size, or ++ * - addr is not a multiple of the page size, or ++ * - len is not a multiple of the page size, or is 0, or ++ * - offset is negative ++ *- ENODEV ++ * - The remote node is lost. 
++ *- ENOMEM ++ * - Not enough space ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ */ ++off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, ++int prot_flags, int map_flags); ++ ++/** ++ * scif_unregister - Mark a memory region for remote access. ++ * \param epd endpoint descriptor ++ * \param offset start of range to unregister ++ * \param len length of range to unregister ++ * ++ * The scif_unregister() function closes those previously registered windows ++ * which are entirely within the range [offset,offset+len-1]. It is an error to ++ * specify a range which intersects only a subrange of a window. ++ * ++ * On a successful return, pages within the window may no longer be specified ++ * in calls to scif_mmap(), scif_readfrom(), scif_writeto(), scif_vreadfrom(), ++ * scif_vwriteto(), scif_get_pages, and scif_fence_signal(). The window, however, ++ * continues to exist until all previous references against it are removed. A ++ * window is referenced if there is a mapping to it created by scif_mmap(), or if ++ * scif_get_pages() was called against the window (and the pages have not been ++ * returned via scif_put_pages()). A window is also referenced while an RMA, in ++ * which some range of the window is a source or destination, is in progress. ++ * Finally a window is referenced while some offset in that window was specified ++ * to scif_fence_signal(), and the RMAs marked by that call to ++ * scif_fence_signal() have not completed. While a window is in this state, its ++ * registered address space pages are not available for use in a new registered ++ * window. ++ * ++ * When all such references to the window have been removed, its references to ++ * all the physical pages which it represents are removed. Similarly, the ++ * registered address space pages of the window become available for ++ * registration in a new window. ++ * ++ *\return ++ * Upon successful completion, scif_unregister() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. In the event of an ++ * error, no windows are unregistered. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - The range [offset,offset+len-1] intersects a subrange of a window, or ++ * - offset is negative ++ *- ENODEV ++ * -The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - Addresses in the range [offset,offset+len-1] are invalid for the ++ * registered address space of epd. ++ */ ++int scif_unregister(scif_epd_t epd, off_t offset, size_t len); ++ ++ ++/** ++ * scif_readfrom - Copy from a remote address space ++ * \param epd endpoint descriptor ++ * \param loffset offset in local registered address space to ++ * which to copy ++ * \param len length of range to copy ++ * \param roffset offset in remote registered address space ++ * from which to copy ++ * \param rma_flags transfer mode flags ++ * ++ * scif_readfrom() copies len bytes from the remote registered address space of ++ * the peer of endpoint epd, starting at the offset roffset to the local ++ * registered address space of epd, starting at the offset loffset. 
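/*
 * Editorial sketch: registering a local window and pulling data from the
 * peer's registered address space, per the scif_register()/scif_readfrom()
 * descriptions above.  User-mode allocation and a 4 KiB page size are
 * assumed, and len must already be a multiple of the page size.
 */
#include <stdlib.h>

static int example_rma_read(scif_epd_t epd, size_t len, off_t roffset)
{
	void *buf;
	off_t loffset;
	int err = -1;

	if (posix_memalign(&buf, 0x1000, len))
		return -1;

	/* offset 0 without SCIF_MAP_FIXED lets SCIF choose the window offset */
	loffset = scif_register(epd, buf, len, 0,
				SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
	if (loffset == SCIF_REGISTER_FAILED)
		goto out_free;

	/* copy len bytes from the peer's window at roffset into our window */
	err = scif_readfrom(epd, loffset, len, roffset, SCIF_RMA_SYNC);

	scif_unregister(epd, loffset, len);
out_free:
	free(buf);
	return err;
}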
++ * ++ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+ ++ * len-1] must be within some registered window or windows of the local and ++ * remote nodes respectively. A range may intersect multiple registered ++ * windows, but only if those windows are contiguous in the registered address ++ * space. ++ * ++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using ++ * programmed read/writes. Otherwise the data is copied using DMA. If rma_- ++ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the ++ * transfer is complete. Otherwise, the transfer may be performed asynchron- ++ * ously. The order in which any two aynchronous RMA operations complete ++ * is non-deterministic. The synchronization functions, scif_fence_mark()/ ++ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to ++ * the completion of asynchronous RMA operations. ++ * ++ * The DMA transfer of individual bytes is not guaranteed to complete in ++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last ++ * cacheline or partial cacheline of the source range will become visible on ++ * the destination node after all other transferred data in the source ++ * range has become visible on the destination node. ++ * ++ * The optimal DMA performance will likely be realized if both ++ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower ++ * performance will likely be realized if loffset and roffset are not ++ * cacheline aligned but are separated by some multiple of 64. The lowest level ++ * of performance is likely if loffset and roffset are not separated by a ++ * multiple of 64. ++ * ++ * The rma_flags argument is formed by ORing together zero or more of the ++ * following values: ++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA ++ * engine. ++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the ++ * transfer has completed. Passing this flag might result in ++ * the API busy waiting and consuming CPU cycles while the DMA ++ * transfer is in progress. ++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of ++ * the source range becomes visible on the destination node ++ * after all other transferred data in the source range has ++ * become visible on the destination ++ * ++ *\return ++ * Upon successful completion, scif_readfrom() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors ++ *- EACCESS ++ * - Attempt to write to a read-only range or read from a write-only range ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - rma_flags is invalid ++ *- ENODEV ++ * -The remote node is lost. 
++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - The range [loffset,loffset+len-1] is invalid for the registered address ++ * space of epd, or, ++ * - The range [roffset,roffset+len-1] is invalid for the registered address ++ * space of the peer of epd, or ++ * - loffset or roffset is negative ++*/ ++int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, off_t ++roffset, int rma_flags); ++ ++/** ++ * scif_writeto - Copy to a remote address space ++ * \param epd endpoint descriptor ++ * \param loffset offset in local registered address space ++ * from which to copy ++ * \param len length of range to copy ++ * \param roffset offset in remote registered address space to ++ * which to copy ++ * \param rma_flags transfer mode flags ++ * ++ * scif_writeto() copies len bytes from the local registered address space of ++ * epd, starting at the offset loffset to the remote registered address space ++ * of the peer of endpoint epd, starting at the offset roffset. ++ * ++ * Each of the specified ranges [loffset,loffset+len-1] and [roffset,roffset+ ++ * len-1] must be within some registered window or windows of the local and ++ * remote nodes respectively. A range may intersect multiple registered ++ * windows, but only if those windows are contiguous in the registered address ++ * space. ++ * ++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using ++ * programmed read/writes. Otherwise the data is copied using DMA. If rma_- ++ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the ++ * transfer is complete. Otherwise, the transfer may be performed asynchron- ++ * ously. The order in which any two aynchronous RMA operations complete ++ * is non-deterministic. The synchronization functions, scif_fence_mark()/ ++ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to ++ * the completion of asynchronous RMA operations. ++ * ++ * The DMA transfer of individual bytes is not guaranteed to complete in ++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last ++ * cacheline or partial cacheline of the source range will become visible on ++ * the destination node after all other transferred data in the source ++ * range has become visible on the destination node. ++ * ++ * The optimal DMA performance will likely be realized if both ++ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower ++ * performance will likely be realized if loffset and roffset are not cacheline ++ * aligned but are separated by some multiple of 64. The lowest level of ++ * performance is likely if loffset and roffset are not separated by a multiple ++ * of 64. ++ * ++ * The rma_flags argument is formed by ORing together zero or more of the ++ * following values: ++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA ++ * engine. ++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the ++ * transfer has completed. Passing this flag might result in ++ * the API busy waiting and consuming CPU cycles while the DMA ++ * transfer is in progress. 
++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of ++ * the source range becomes visible on the destination node ++ * after all other transferred data in the source range has ++ * become visible on the destination ++ * ++ *\return ++ * Upon successful completion, scif_readfrom() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EACCESS ++ * - Attempt to write to a read-only range or read from a write-only range ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - rma_flags is invalid ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - The range [loffset,loffset+len-1] is invalid for the registered address ++ * space of epd, or, ++ * - The range [roffset , roffset + len -1] is invalid for the registered ++ * address space of the peer of epd, or ++ * - loffset or roffset is negative ++ */ ++int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, off_t ++roffset, int rma_flags); ++ ++/** ++ * scif_vreadfrom - Copy from a remote address space ++ * \param epd endpoint descriptor ++ * \param addr address to which to copy ++ * \param len length of range to copy ++ * \param roffset offset in remote registered address space ++ * from which to copy ++ * \param rma_flags transfer mode flags ++ * ++ * scif_vreadfrom() copies len bytes from the remote registered address ++ * space of the peer of endpoint epd, starting at the offset roffset, to local ++ * memory, starting at addr. addr is interpreted as a user space address. ++ * ++ * The specified range [roffset,roffset+len-1] must be within some registered ++ * window or windows of the remote nodes respectively. The range may intersect ++ * multiple registered windows, but only if those windows are contiguous in the ++ * registered address space. ++ * ++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using ++ * programmed read/writes. Otherwise the data is copied using DMA. If rma_- ++ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the ++ * transfer is complete. Otherwise, the transfer may be performed asynchron- ++ * ously. The order in which any two aynchronous RMA operations complete ++ * is non-deterministic. The synchronization functions, scif_fence_mark()/ ++ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to ++ * the completion of asynchronous RMA operations. ++ * ++ * The DMA transfer of individual bytes is not guaranteed to complete in ++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last ++ * cacheline or partial cacheline of the source range will become visible on ++ * the destination node after all other transferred data in the source ++ * range has become visible on the destination node. ++ * ++ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back ++ * the specified local memory range may be remain in a pinned state even after ++ * the specified transfer completes. This may reduce overhead if some or all of ++ * the same virtual address range is referenced in a subsequent call of ++ * scif_vreadfrom() or scif_vwriteto(). 
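Returning to the scif_readfrom()/scif_writeto() calls documented above, a small sketch (not part of the patch) of a synchronous push followed by an ordered pull; loff and roff are assumed to be offsets previously returned by scif_register() locally and on the peer.

#include <scif.h>      /* assumed user-space header */

static int mirror_window(scif_epd_t epd, off_t loff, off_t roff, size_t len)
{
	/* Push the local window to the peer and wait for the DMA to finish. */
	if (scif_writeto(epd, loff, len, roff, SCIF_RMA_SYNC) < 0)
		return -1;

	/* Pull it back; SCIF_RMA_ORDERED makes the last cacheline visible only
	 * after the rest of the range, which suits a trailing "done" flag. */
	return scif_readfrom(epd, loff, len, roff,
			     SCIF_RMA_SYNC | SCIF_RMA_ORDERED);
}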
++ * ++ * The optimal DMA performance will likely be realized if both ++ * loffset and roffset are cacheline aligned (are a multiple of 64). Lower ++ * performance will likely be realized if loffset and roffset are not ++ * cacheline aligned but are separated by some multiple of 64. The lowest level ++ * of performance is likely if loffset and roffset are not separated by a ++ * multiple of 64. ++ * ++ * The rma_flags argument is formed by ORing together zero or more of the ++ * following values: ++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA ++ * engine. ++ *- SCIF_RMA_USECACHE: enable registration caching ++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the ++ * transfer has completed. Passing this flag might result in ++ * the API busy waiting and consuming CPU cycles while the DMA ++ * transfer is in progress. ++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of ++ * the source range becomes visible on the destination node ++ * after all other transferred data in the source range has ++ * become visible on the destination ++ * ++ *\return ++ * Upon successful completion, scif_vreadfrom() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EACCESS ++ * - Attempt to write to a read-only range or read from a write-only range ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EFAULT ++ * - Addresses in the range [addr,addr+len-1] are invalid ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - rma_flags is invalid ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - Addresses in the range [roffset,roffset+len-1] are invalid for the ++ * registered address space of epd. ++ */ ++int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, off_t offset, ++int rma_flags); ++ ++/** ++ * scif_vwriteto - Copy to a remote address space ++ * \param epd endpoint descriptor ++ * \param addr address from which to copy ++ * \param len length of range to copy ++ * \param roffset offset in remote registered address space to ++ * which to copy ++ * \param rma_flags transfer mode flags ++ * ++ * scif_vwriteto() copies len bytes from the local memory, starting at addr, to ++ * the remote registered address space of the peer of endpoint epd, starting at ++ * the offset roffset. addr is interpreted as a user space address. ++ * ++ * The specified range [roffset,roffset+len-1] must be within some registered ++ * window or windows of the remote nodes respectively. The range may intersect ++ * multiple registered windows, but only if those windows are contiguous in the ++ * registered address space. ++ * ++ * If rma_flags includes SCIF_RMA_USECPU, then the data is copied using ++ * programmed read/writes. Otherwise the data is copied using DMA. If rma_- ++ * flags includes SCIF_RMA_SYNC, then scif_readfrom() will return after the ++ * transfer is complete. Otherwise, the transfer may be performed asynchron- ++ * ously. The order in which any two aynchronous RMA operations complete ++ * is non-deterministic. 
The synchronization functions, scif_fence_mark()/ ++ * scif_fence_wait() and scif_fence_signal(), can be used to synchronize to ++ * the completion of asynchronous RMA operations. ++ * ++ * The DMA transfer of individual bytes is not guaranteed to complete in ++ * address order. If rma_flags includes SCIF_RMA_ORDERED, then the last ++ * cacheline or partial cacheline of the source range will become visible on ++ * the destination node after all other transferred data in the source ++ * range has become visible on the destination node. ++ * ++ * If rma_flags includes SCIF_RMA_USECACHE, then the physical pages which back ++ * the specified local memory range may be remain in a pinned state even after ++ * the specified transfer completes. This may reduce overhead if some or all of ++ * the same virtual address range is referenced in a subsequent call of ++ * scif_vreadfrom() or scif_vwriteto(). ++ * ++ * The optimal DMA performance will likely be realized if both ++ * addr and offset are cacheline aligned (are a multiple of 64). Lower ++ * performance will likely be realized if addr and offset are not cacheline ++ * aligned but are separated by some multiple of 64. The lowest level of ++ * performance is likely if addr and offset are not separated by a multiple of ++ * 64. ++ * ++ * The rma_flags argument is formed by ORing together zero or more of the ++ * following values: ++ *- SCIF_RMA_USECPU: perform the transfer using the CPU, otherwise use the DMA ++ * engine. ++ *- SCIF_RMA_USECACHE: allow registration caching ++ *- SCIF_RMA_SYNC: perform the transfer synchronously, returning after the ++ * transfer has completed. Passing this flag might result in ++ * the API busy waiting and consuming CPU cycles while the DMA ++ * transfer is in progress. ++ *- SCIF_RMA_ORDERED: ensure that the last cacheline or partial cacheline of ++ * the source range becomes visible on the destination node ++ * after all other transferred data in the source range has ++ * become visible on the destination ++ * ++ *\return ++ * Upon successful completion, scif_vwriteto () returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EACCESS ++ * - Attempt to write to a read-only range or read from a write-only range ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EFAULT ++ * - Addresses in the range [addr,addr+len-1] are invalid ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - rma_flags is invalid ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - Addresses in the range [roffset,roffset+len-1] are invalid for the ++ * registered address space of epd. ++ */ ++int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, off_t offset, ++int rma_flags); ++ ++/** ++ * scif_fence_mark - Mark previously issued RMAs ++ * \param epd endpoint descriptor ++ * \param flags control flags ++ * \param mark marked handle returned as output. ++ * ++ * scif_fence_mark() returns after marking the current set of all uncompleted ++ * RMAs initiated through the endpoint epd or the current set of all ++ * uncompleted RMAs initiated through the peer of endpoint epd. The RMAs are ++ * marked with a value returned at mark. 
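For the scif_vreadfrom()/scif_vwriteto() variants documented above, which take a local virtual address instead of a registered local offset, a brief sketch (not part of the patch); buf is assumed to be an ordinary user buffer and roff a registered offset in the peer's address space.

#include <scif.h>      /* assumed user-space header */

static int stage_to_peer(scif_epd_t epd, void *buf, size_t len, off_t roff)
{
	/* SCIF_RMA_USECACHE keeps the pages behind buf pinned so that a later
	 * transfer over the same range may skip the pinning cost. */
	if (scif_vwriteto(epd, buf, len, roff,
			  SCIF_RMA_SYNC | SCIF_RMA_USECACHE) < 0)
		return -1;

	/* Read the peer's copy back into the same buffer. */
	return scif_vreadfrom(epd, buf, len, roff,
			      SCIF_RMA_SYNC | SCIF_RMA_USECACHE);
}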
The application may subsequently call ++ * scif_fence_wait(), passing the value returned at mark, to await completion ++ * of all RMAs so marked. ++ * ++ * The flags argument has exactly one of the following values: ++ *- SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint ++ * epd are marked ++ *- SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer ++ * of endpoint epd are marked ++ * ++ * \return ++ * Upon successful completion, scif_fence_mark() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - flags is invalid, or ++ * - epd is not a valid endpoint descriptor, or ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOMEM ++ * - Insufficient kernel memory was available. ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ */ ++int scif_fence_mark(scif_epd_t epd, int flags, int *mark); ++ ++/** ++ * scif_fence_wait - Wait for completion of marked RMAs ++ * ++ * \param epd endpoint descriptor ++ * \param mark mark request ++ * ++ * scif_fence_wait() returns after all RMAs marked with mark have completed. ++ * The value passed in mark must have been obtained in a previous call to ++ * scif_fence_mark(). ++ * ++ *\return ++ * Upon successful completion, scif_fence_wait() returns 0; otherwise: in user ++ * mode -1 is returned and errno is set to indicate the error; in kernel mode ++ * the negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOMEM ++ * - Insufficient kernel memory was available. ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ */ ++int scif_fence_wait(scif_epd_t epd, int mark); ++ ++/** ++ * scif_fence_signal - Request a signal on completion of RMAs ++ * \param loff local offset ++ * \param lval local value to write to loffset ++ * \param roff remote offset ++ * \param rval remote value to write to roffset ++ * \param flags flags ++ * ++ * scif_fence_signal() returns after marking the current set of all uncompleted ++ * RMAs initiated through the endpoint epd or marking the current set of all ++ * uncompleted RMAs initiated through the peer of endpoint epd. ++ * ++ * If flags includes SCIF_SIGNAL_LOCAL, then on completion of the RMAs in the ++ * marked set, lval is written to memory at the address corresponding to offset ++ * loff in the local registered address space of epd. loff must be within a ++ * registered window. If flags includes SCIF_SIGNAL_REMOTE, then on completion ++ * of the RMAs in the marked set, rval is written to memory at the * address ++ * corresponding to offset roff in the remote registered address space of epd. ++ * roff must be within a remote registered window of the peer of epd. Note ++ * that any specified offset must be DWORD (4 byte / 32 bit) aligned. 
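To tie the asynchronous RMA calls and the fence functions above together, a short sketch (not part of the patch) that issues an asynchronous write and then blocks until every RMA issued so far through epd has completed.

#include <scif.h>      /* assumed user-space header */

static int write_then_fence(scif_epd_t epd, off_t loff, off_t roff, size_t len)
{
	int mark;

	/* No SCIF_RMA_SYNC: the DMA may still be in flight when this returns. */
	if (scif_writeto(epd, loff, len, roff, 0) < 0)
		return -1;

	/* Mark the RMAs initiated through this endpoint, then wait for them. */
	if (scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &mark) < 0)
		return -1;
	return scif_fence_wait(epd, mark);
}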
++ * ++ * The flags argument is formed by OR'ing together the following: ++ *- Exactly one of the following values: ++ * - SCIF_FENCE_INIT_SELF: RMA operations initiated through endpoint ++ * epd are marked ++ * - SCIF_FENCE_INIT_PEER: RMA operations initiated through the peer ++ * of endpoint epd are marked ++ *- One or more of the following values: ++ * - SCIF_SIGNAL_LOCAL: On completion of the marked set of RMAs, write lval to ++ * memory at the address corresponding to offset loff in the local registered ++ * address space of epd. ++ * - SCIF_SIGNAL_REMOTE: On completion of the marked set of RMAs, write lval to ++ * memory at the address corresponding to offset roff in the remote registered ++ * address space of epd. ++ * ++ *\return ++ * Upon successful completion, scif_fence_signal() returns 0; otherwise: in ++ * user mode -1 is returned and errno is set to indicate the error; in kernel ++ * mode the negative of one of the following errors is returned. ++ *\par Errors: ++ *- EBADF ++ * - epd is not a valid endpoint descriptor ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - flags is invalid, or ++ * - loff or roff are not DWORD aligned ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENOTTY ++ * - epd is not a valid endpoint descriptor ++ *- ENXIO ++ * - loff is invalid for the registered address of epd, or ++ * - roff is invalid for the registered address space, of the peer of epd ++ */ ++int scif_fence_signal(scif_epd_t epd, off_t loff, uint64_t lval, off_t roff, ++uint64_t rval, int flags); ++ ++/** ++ * scif_get_nodeIDs - Return information about online nodes ++ * \param nodes array in which to return online node IDs ++ * \param len number of entries in the nodes array ++ * \param self address to place the node ID of the local node ++ * ++ * scif_get_nodeIDs() fills in the nodes array with up to len node IDs of the ++ * nodes in the SCIF network. If there is not enough space in nodes, as ++ * indicated by the len parameter, only len node IDs are returned in nodes. The ++ * return value of scif_get_nodeID() is the total number of nodes currently in ++ * the SCIF network. By checking the return value against the len parameter, the user may ++ * determine if enough space for nodes was allocated. ++ * ++ * The node ID of the local node is returned at self. ++ * ++ *\return ++ * Upon successful completion, scif_get_nodeIDs() returns the actual number of ++ * online nodes in the SCIF network including 'self'; otherwise: in user mode ++ * -1 is returned and errno is set to indicate the error; in kernel mode no ++ * errors are returned. ++ * ++ *\par Errors: ++ *- EFAULT ++ * - Bad address ++ */ ++int scif_get_nodeIDs(uint16_t *nodes, int len, uint16_t *self); ++ ++ ++/** ++ * scif_pin_pages - Pin a set of pages ++ * \param addr Virtual address of range to pin ++ * \param len Length of range to pin ++ * \param prot_flags Page protection flags ++ * \param map_flags Page classification flags ++ * \param pinned_pages Opaque handle of pinned pages ++ * ++ * scif_pin_pages() pins (locks in physical memory) the physical pages which ++ * back the range of virtual address pages starting at addr and continuing for ++ * len bytes. addr and len are constrained to be multiples of the page size. A ++ * successful scif_register() call returns an opaque pointer value at ++ * pinned_pages which may be used in subsequent calls to ++ * scif_register_pinned_pages(). 
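A small sketch (not part of the patch) of the scif_get_nodeIDs() enumeration documented above, assuming interest in at most 16 nodes:

#include <scif.h>      /* assumed user-space header */
#include <stdio.h>
#include <stdint.h>

static void list_nodes(void)
{
	uint16_t nodes[16], self;
	int total, shown, i;

	total = scif_get_nodeIDs(nodes, 16, &self);
	if (total < 0) {
		perror("scif_get_nodeIDs");
		return;
	}

	shown = total < 16 ? total : 16;   /* only len entries are filled in */
	printf("local node %u, %d node(s) online\n", self, total);
	for (i = 0; i < shown; i++)
		printf("  node %u\n", nodes[i]);
}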
++ * ++ * The pages will remain pinned as long as there is a reference against the ++ * scif_pinned_pages_t value returned by scif_pin_pages() and until ++ * scif_unpin_pages() is called, passing the scif_pinned_pages_t value. A ++ * reference is added to a scif_pinned_pages_t value each time a window is ++ * created by calling scif_register_pinned_pages() and passing the ++ * scif_pinned_pages_t value. A reference is removed from a scif_pinned_pages_t value ++ * each time such a window is deleted. ++ * ++ * Subsequent operations which change the memory pages to which virtual ++ * addresses are mapped (such as mmap(), munmap(), scif_mmap() and ++ * scif_munmap()) have no effect on the scif_pinned_pages_t value or windows ++ * created against it. ++ * ++ * On Linux, if the process will fork(), it is recommended that the registered ++ * virtual address range be marked with MADV_DONTFORK. Doing so will prevent ++ * problems due to copy-on-write semantics. ++ * ++ * The prot_flags argument is formed by OR'ing together one or more of the ++ * following values: ++ *- SCIF_PROT_READ: allow read operations against the pages ++ *- SCIF_PROT_WRITE: allow write operations against the pages ++ * The map_flags argument is formed by OR'ing together zero or more of the ++ * following values: ++ *- SCIF_MAP_KERNEL: interpret addr as a kernel space address. By default, addr ++ * is interpreted as a user space address. ++ * ++ *\return ++ * Upon successful completion, scif_register() returns 0; otherwise the ++ * negative of one of the following errors is returned. ++ *\par Errors: ++ *- EFAULT ++ * - Addresses in the range [addr,addr+len-1] are invalid ++ *- EINVAL ++ * - prot_flags is invalid, ++ * - map_flags is invalid, or ++ * - offset is negative ++ *- ENOMEM ++ * - Not enough space ++ */ ++int ++scif_pin_pages( ++ void *addr, ++ size_t len, ++ int prot_flags, ++ int map_flags, ++ scif_pinned_pages_t *pinned_pages); ++ ++/** ++ * scif_unpin_pages - Unpin a set of pages ++ * \param pinned_pages Opaque handle of pages to be unpinned ++ * ++ * scif_unpin_pages() prevents scif_register_pinned_pages()from registering new ++ * windows against pinned_pages. The physical pages represented by pinned_pages ++ * will remain pinned until all windows previously registered against ++ * pinned_pages are deleted (the window is scif_unregister()'d and all ++ * references to the window are removed (see scif_unregister()). ++ * ++ * pinned_pages must have been obtain from a previous call to scif_pin_pages(). ++ * After calling scif_unpin_pages(), it is an error to pass pinned_pages to ++ * scif_register_pinned_pages(). ++ * ++ *\return: ++ * Upon successful completion, scif_unpin_pages() returns 0; otherwise the ++ * negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EINVAL ++ * - pinned_pages is not valid ++ */ ++int ++scif_unpin_pages( ++ scif_pinned_pages_t pinned_pages); ++ ++/** ++ * scif_register_pinned_pages - Mark a memory region for remote access. ++ * \param epd Endpoint descriptor ++ * \param pinned_pages Opaque handle of pinned pages ++ * \param offset Registered address space offset ++ * \param map_flags Flags which control where pages are mapped ++ * ++ * The scif_register_pinned_pages() function opens a window, a range of whole ++ * pages of the registered address space of the endpoint epd, starting at ++ * offset po. The value of po, further described below, is a function of the ++ * parameters offset and pinned_pages, and the value of map_flags. 
Each page of ++ * the window represents a corresponding physical memory page of the range ++ * represented by pinned_pages; the length of the window is the same as the ++ * length of range represented by pinned_pages. A successful scif_register() ++ * call returns po as the return value. ++ * ++ * When SCIF_MAP_FIXED is set in the map_flags argument, po will be offset ++ * exactly, and offset is constrained to be a multiple of the page size. The ++ * mapping established by scif_register() will not replace any existing ++ * registration; an error is returned if any page of the new window would ++ * intersect an existing window. ++ * ++ * When SCIF_MAP_FIXED is not set, the implementation uses offset in an ++ * implementation-defined manner to arrive at po. The po so chosen will be an ++ * area of the registered address space that the implementation deems suitable ++ * for a mapping of the required size. An offset value of 0 is interpreted as ++ * granting the implementation complete freedom in selecting po, subject to ++ * constraints described below. A non-zero value of offset is taken to be a ++ * suggestion of an offset near which the mapping should be placed. When the ++ * implementation selects a value for po, it does not replace any extant ++ * window. In all cases, po will be a multiple of the page size. ++ * ++ * The physical pages which are so represented by a window are available for ++ * access in calls to scif_get_pages(), scif_readfrom(), scif_writeto(), ++ * scif_vreadfrom(), and scif_vwriteto(). While a window is registered, the ++ * physical pages represented by the window will not be reused by the memory ++ * subsytem for any other purpose. Note that the same physical page may be ++ * represented by multiple windows. ++ * ++ * Windows created by scif_register_pinned_pages() are unregistered by ++ * scif_unregister(). ++ * ++ * The map_flags argument is formed by OR'ing together zero or more of the ++ * following values: ++ *- SCIF_MAP_FIXED: interpret offset exactly ++ * ++ *\return ++ * Upon successful completion, scif_register_pinned_pages() returns the offset ++ * at which the mapping was placed (po); otherwise the negative of one of the ++ * following errors is returned. ++ *\par Errors: ++ *- EADDRINUSE ++ * - SCIF_MAP_FIXED is set in map_flags and pages in the new ++ * window would intersect an existing window ++ *- EAGAIN ++ * - The mapping could not be performed due to lack of resources ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - map_flags is invalid, or ++ * - SCIF_MAP_FIXED is set in map_flags, and offset is not a ++ * multiple of the page size, or ++ * - offset is negative ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOMEM ++ * - Not enough space ++ *- ENOTCONN ++ * - The endpoint is not connected ++ */ ++off_t ++scif_register_pinned_pages( ++ scif_epd_t epd, ++ scif_pinned_pages_t pinned_pages, ++ off_t offset, ++ int map_flags); ++ ++/** ++ * scif_get_pages - Add references to remote registered pages ++ * \param epd endpoint descriptor ++ * \param offset registered address space offset ++ * \param len length of range of pages ++ * \param pages returned scif_range structure ++ * ++ * scif_get_pages() returns the addresses of the physical pages represented by ++ * those pages of the registered address space of the peer of epd, starting at ++ * offset and continuing for len bytes. offset and len are constrained to be ++ * multiples of the page size. 
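For the kernel-mode pinning path above, a minimal sketch (not part of the patch) that pins a page-aligned kernel buffer and publishes it as a window on epd; kbuf and len are assumed to be multiples of the page size, and the include path of the kernel SCIF header is an assumption.

#include "scif.h"      /* the header added by this patch; path assumed */

static off_t publish_kernel_buffer(scif_epd_t epd, void *kbuf, size_t len)
{
	scif_pinned_pages_t pp;
	off_t ro;
	int err;

	err = scif_pin_pages(kbuf, len, SCIF_PROT_READ | SCIF_PROT_WRITE,
			     SCIF_MAP_KERNEL, &pp);
	if (err)
		return err;			/* negative errno */

	/* offset 0 without SCIF_MAP_FIXED: the implementation chooses po */
	ro = scif_register_pinned_pages(epd, pp, 0, 0);
	if (ro < 0)
		scif_unpin_pages(pp);		/* no window was created */

	return ro;				/* window offset, or negative errno */
}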
++ * ++ * All of the pages in the specified range [offset,offset+len-1] must be within ++ * a single window of the registered address space of the peer of epd. ++ * ++ * The addresses are returned as a virtually contiguous array pointed to by the ++ * phys_addr component of the scif_range structure whose address is returned in ++ * pages. The nr_pages component of scif_range is the length of the array. The ++ * prot_flags component of scif_range holds the protection flag value passed ++ * when the pages were registered. ++ * ++ * Each physical page whose address is returned by scif_get_pages() remains ++ * available and will not be released for reuse until the scif_range structure ++ * is returned in a call to scif_put_pages(). The scif_range structure returned ++ * by scif_get_pages() must be unmodified. ++ * ++ * It is an error to call scif_close() on an endpoint on which a scif_range ++ * structure of that endpoint has not been returned to scif_put_pages(). ++ * ++ *\return ++ * Upon successful completion, scif_get_pages() returns 0; otherwise the ++ * negative of one of the following errors is returned. ++ *\par Errors: ++ *- ECONNRESET ++ * - A connection was forcibly closed by a peer. ++ *- EINVAL ++ * - epd is not a valid endpoint descriptor, or ++ * - offset is not a multiple of the page size, or ++ * - offset is negative, or ++ * - len is not a multiple of the page size ++ *- ENODEV ++ * -The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected ++ *- ENXIO ++ * - Addresses in the range [offset,offset+len-1] are invalid ++ * for the registered address space of the peer epd. ++ */ ++int scif_get_pages( ++ scif_epd_t epd, ++ off_t offset, ++ size_t len, ++ struct scif_range **pages); ++ ++/** ++ * scif_put_pages - Remove references from remote registered pages ++ * \param pages pages to be returned ++ * ++ * scif_put_pages() releases a scif_range structure previously obtained by ++ * calling scif_get_pages(). The physical pages represented by pages may ++ * be reused when the window which represented those pages is unregistered. ++ * Therefore, those pages must not be accessed after calling scif_put_pages(). ++ * ++ *\return ++ * Upon successful completion, scif_put_pages() returns 0; otherwise the ++ * negative of one of the following errors is returned. ++ *\par Errors: ++ *- EINVAL ++ * - pages does not point to a valid scif_range structure, or ++ * - the scif_range structure pointed to by pages was already returned. ++ *- ENODEV ++ * - The remote node is lost. ++ *- ENOTCONN ++ * - The endpoint is not connected. ++ */ ++int scif_put_pages( ++ struct scif_range *pages); ++ ++/** ++ * scif_poll - Wait for some event on an endpoint ++ * \param epds Array of endpoint descriptors ++ * \param nepds Length of epds ++ * \param timeout Upper limit on time for which scif_poll() will ++ * block ++ * ++ * scif_poll() waits for one of a set of endpoints to become ready to perform ++ * an I/O operation. scif_poll() exposes a subset of the functionality of the ++ * POSIX standard poll() function. ++ * ++ * The epds argument specifies the endpoint descriptors to be examined and the ++ * events of interest for each endpoint descriptor. epds is a pointer to an ++ * array with one member for each open endpoint descriptor of interest. ++ * ++ * The number of items in the epds array is specified in nepds. The epd field ++ * of scif_pollepd is an endpoint descriptor of an open endpoint. 
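As an illustration of the scif_get_pages()/scif_put_pages() pair documented above (not part of the patch), a kernel-mode sketch that looks up the physical pages behind one page of a peer window at offset roff and then drops the references again. The scif_range field names follow the description above; their exact types are assumptions.

#include <linux/kernel.h>
#include "scif.h"      /* the header added by this patch; path assumed */

static int peek_peer_page(scif_epd_t epd, off_t roff, size_t page_size)
{
	struct scif_range *range;
	int err;

	err = scif_get_pages(epd, roff, page_size, &range);
	if (err)
		return err;			/* negative errno */

	pr_info("peer window: %d page(s), prot 0x%x, first phys %#llx\n",
		range->nr_pages, range->prot_flags,
		(unsigned long long)range->phys_addr[0]);

	/* Release the references so the peer window can eventually go away. */
	return scif_put_pages(range);
}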
The field ++ * events is a bitmask specifying the events which the application is ++ * interested in. The field revents is an output parameter, filled by the ++ * kernel with the events that actually occurred. The bits returned in revents ++ * can include any of those specified in events, or one of the values ++ * SCIF_POLLERR, SCIF_POLLHUP, or SCIF_POLLNVAL. (These three bits are ++ * meaningless in the events field, and will be set in the revents field ++ * whenever the corresponding condition is true.) ++ * ++ * If none of the events requested (and no error) has occurred for any of the ++ * endpoint descriptors, then scif_poll() blocks until one of the events occurs. ++ * ++ * The timeout argument specifies an upper limit on the time for which ++ * scif_poll() will block, in milliseconds. Specifying a negative value in ++ * timeout means an infinite timeout. ++ * ++ * The following bits may be set in events and returned in revents: ++ *- SCIF_POLLIN: Data may be received without blocking. For a connected ++ * endpoint, this means that scif_recv() may be called without blocking. For a ++ * listening endpoint, this means that scif_accept() may be called without ++ * blocking. ++ *- SCIF_POLLOUT: Data may be sent without blocking. For a connected endpoint, ++ * this means that scif_send() may be called without blocking. This bit value ++ * has no meaning for a listening endpoint and is ignored if specified. ++ * ++ * The following bits are only returned in revents, and are ignored if set in ++ * events: ++ *- SCIF_POLLERR: An error occurred on the endpoint ++ *- SCIF_POLLHUP: The connection to the peer endpoint was disconnected ++ *- SCIF_POLLNVAL: The specified endpoint descriptor is invalid. ++ * ++ *\return ++ * Upon successful completion, scif_poll()returns a non-negative value. A ++ * positive value indicates the total number of endpoint descriptors that have ++ * been selected (that is, endpoint descriptors for which the revents member is ++ * non-zero. A value of 0 indicates that the call timed out and no endpoint ++ * descriptors have been selected. Otherwise: in user mode -1 is returned and ++ * errno is set to indicate the error; in kernel mode the negative of one of ++ * the following errors is returned. ++ * ++ *\par Errors: ++ *- EFAULT ++ * - The array given as argument was not contained in the calling program's ++ * address space. ++ *- EINTR ++ * - A signal occurred before any requested event. ++ *- EINVAL ++ * - The nepds argument is greater than {OPEN_MAX} ++ *- ENOMEM ++ * - There was no space to allocate file descriptor tables. ++*/ ++int ++scif_poll( ++ struct scif_pollepd *epds, ++ unsigned int nepds, ++ long timeout); ++ ++/** ++ * scif_event_register - Register an event handler ++ * \param handler Event handler to be registered ++ * ++ * scif_event_register() registers a routine, handler, to be called when some ++ * event occurs. The event parameter to handler indicates the type of event ++ * which has occurred, and the corresponding component of the data parameter to ++ * handler provides additional data about the event. ++ * ++ * The following events are defined: ++ *- SCIF_NODE_ADDED: A node has been added to the SCIF network. The ++ * scif_node_added component of the data parameter to handler identifies the ++ * node. This event is informational. There are no requirements on the event ++ * handler. ++ *- SCIF_NODE_REMOVED: A node is being removed from the SCIF network. 
The ++ * scif_node_removed component of the data parameter to handler identifies the ++ * node. Upon being called, and before returning, the event handler must ++ * return, using scif_put_pages(), all structures obtained using ++ * scif_get_pages() against an endpoint connected to the lost node. It is ++ * recommended and expected that the handler will also scif_close() all ++ * endpoints connected to the lost node. ++ * ++ *\return ++ * Upon successful completion scif_event_register() returns 0. ++ * ++ *\par Errors: ++ *- ENOMEM ++ * - There was no space to allocate file descriptor tables. ++*/ ++ ++int ++scif_event_register( ++ scif_callback_t handler); ++ ++/** ++ * scif_event_unregister - Unregister event handler ++ * \param handler Event handler to be unregistered ++ * ++ * scif_event_unregister() unregisters the handler which was registered ++ * previously by using scif_event_register(). ++ * ++ * WARNING: scif_event_unregister must be called before the module ++ * (that registered handles) exits for every handler that is registered. ++ * Failure to do so will result in crash of the scif module. ++ * ++ *\return ++ * Upon successful completion scif_event_unregister() returns 0. ++ *\par Errors: ++ *- EINVAL ++ * -If the event handler was not found/registered. ++*/ ++int ++scif_event_unregister( ++ scif_callback_t handler); ++ ++/* ++ * Note: The callee can use pci_resource_start(dev, index) and ++ * pci_resource_len(dev, index) to obtain the PCI resource starting ++ * physical address and length for valid non null indexes of the va ++ * array. MMIO bars will not have IORESOURCE_PREFETCH set in the ++ * flags obtained from pci_resource_flags(dev, index). va[index] ++ * will be set to NULL for invalid resources. ++ */ ++struct scif_pci_info { ++ /* pci_dev pointer associated with a node */ ++ struct pci_dev *pdev; ++ /* Ioremapped virtual address base for every valid PCIe resource */ ++ void __iomem *va[PCI_NUM_RESOURCES]; ++}; ++ ++/** ++ * scif_pci_info - Populate the scif_pci_info structure for a node. ++ * \param node The node to query ++ * \param dev The scif_pci_info structure to populate. ++ * ++ * scif_pci_info() populates the provided scif_pci_info structure ++ * associated with a node. The requested node ID cannot be the same as ++ * the current node. This routine will only return success when called from ++ * the host. ++ * ++ *\return ++ * Upon successful completion, scif_pci_info() returns 0; otherwise the ++ * negative of one of the following errors is returned. ++ * ++ *\par Errors: ++ *- EINVAL ++ * - The requested node is not valid. ++ * - Called on MIC instead of the host. ++ *- ENODEV ++ * - No pci_dev association exists for the node. 
++ */ ++int ++scif_pci_info( ++ uint16_t node, ++ struct scif_pci_info *dev); ++ ++ ++#ifdef __cplusplus ++} /* extern "C" */ ++#endif ++ ++#endif /* __SCIF_H__ */ diff --git a/tech-preview/xeon-phi/0007-Add-CCL-Direct-ibp-drivers-to-Infiniband.patch b/tech-preview/xeon-phi/0007-Add-CCL-Direct-ibp-drivers-to-Infiniband.patch new file mode 100644 index 0000000..86e2eef --- /dev/null +++ b/tech-preview/xeon-phi/0007-Add-CCL-Direct-ibp-drivers-to-Infiniband.patch @@ -0,0 +1,9643 @@ +From a6d3fc7a6f6d3b3b621dfbd71babbff5ae58d1dd Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Wed, 28 May 2014 15:50:26 -0700 +Subject: [PATCH 07/13] Add CCL-Direct (ibp) drivers to Infiniband + +This includes the base ibp server module as well as +the server modules for sa and cm + +Signed-off-by: Phil Cayton +--- +diff -urN a6/drivers/infiniband/ibp/cm/cm_ibp_abi.h a7/drivers/infiniband/ibp/cm/cm_ibp_abi.h +--- a6/drivers/infiniband/ibp/cm/cm_ibp_abi.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/cm_ibp_abi.h 2015-02-23 10:01:30.289769309 -0800 +@@ -0,0 +1,399 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef CM_IBP_ABI_H ++#define CM_IBP_ABI_H ++ ++#include ++#include ++#include ++ ++/* Increment this value if any changes break compatibility. */ ++#define IBP_CM_ABI_VERSION 1 ++ ++/* ++ * Make sure that all structs defined in this file are laid out to pack ++ * the same way on different architectures to avoid incompatibility. ++ * ++ * Specifically: ++ * - Do not use pointer types -- pass pointers in a u64 instead. ++ * - Make sure that any structure larger than 4 bytes is padded ++ * to a multiple of 8 bytes; otherwise the structure size may ++ * be different between architectures. 
++ */ ++ ++struct ibp_event_msg { ++ struct ibp_msg_header header; ++ u64 length; ++ u8 event[0]; ++}; ++ ++ ++struct ibp_sa_path_rec { ++ __be64 service_id; ++ u64 dgid_prefix; ++ u64 dgid_id; ++ u64 sgid_prefix; ++ u64 sgid_id; ++ __be16 dlid; ++ __be16 slid; ++ u32 raw_traffic; ++ __be32 flow_label; ++ u8 hop_limit; ++ u8 traffic_class; ++ u32 reversible; ++ u8 numb_path; ++ __be16 pkey; ++ __be16 qos_class; ++ u8 sl; ++ u8 mtu_selector; ++ u8 mtu; ++ u8 rate_selector; ++ u8 rate; ++ u8 packet_life_time_selector; ++ u8 packet_life_time; ++ u8 preference; ++}; ++ ++struct ibp_create_cm_id_cmd { ++ struct ibp_msg_header header; ++ u64 device; ++}; ++ ++struct ibp_create_cm_id_resp { ++ u64 ibp_cm_id; ++ __be64 service_id; ++ __be64 service_mask; ++ __be32 local_id; ++ __be32 remote_id; ++ u32 remote_cm_qpn; ++ u32 filler; ++}; ++ ++struct ibp_destroy_cm_id_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++}; ++ ++struct ibp_cm_listen_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ __be64 service_id; ++ __be64 service_mask; ++ u64 null_comp_data; ++ struct ib_cm_compare_data compare_data; ++}; ++ ++struct ibp_send_cm_req_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ struct ibp_sa_path_rec primary_path; ++ struct ibp_sa_path_rec alternate_path; ++ __be64 service_id; ++ u32 qp_num; ++ enum ib_qp_type qp_type; ++ u32 starting_psn; ++ u8 peer_to_peer; ++ u8 responder_resources; ++ u8 initiator_depth; ++ u8 remote_cm_response_timeout; ++ u8 flow_control; ++ u8 local_cm_response_timeout; ++ u8 retry_count; ++ u8 rnr_retry_count; ++ u8 max_cm_retries; ++ u8 srq; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_rep_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u32 qp_num; ++ u32 starting_psn; ++ u8 responder_resources; ++ u8 initiator_depth; ++ u8 failover_accepted; ++ u8 flow_control; ++ u8 rnr_retry_count; ++ u8 srq; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_rtu_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_dreq_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_drep_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_rej_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u64 reason; ++ u8 private_data_len; ++ u8 ari_length; ++ char data[0]; ++}; ++ ++struct ibp_send_cm_mra_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u8 service_timeout; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_lap_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ struct ibp_sa_path_rec alternate_path; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_apr_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u64 status; ++ u8 private_data_len; ++ u8 info_length; ++ char data[0]; ++}; ++ ++struct ibp_send_cm_sidr_req_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ struct ibp_sa_path_rec path; ++ __be64 service_id; ++ int timeout_ms; ++ u8 max_cm_retries; ++ u8 private_data_len; ++ char private_data[0]; ++}; ++ ++struct ibp_send_cm_sidr_rep_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u32 qp_num; ++ u32 qkey; ++ u64 status; ++ u8 info_length; ++ u8 private_data_len; ++ char data[0]; ++}; ++ ++struct ibp_cm_notify_cmd { ++ struct 
ibp_msg_header header; ++ u64 ibp_cm_id; ++ u64 event; ++}; ++ ++struct ibp_cm_init_qp_attr_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_cm_id; ++ u64 qp_attr_state; ++}; ++ ++struct ibp_cm_init_qp_attr_resp { ++ u64 qp_attr_mask; ++ u64 qp_access_flags; ++ u64 qp_state; ++ u64 cur_qp_state; ++ u64 path_mtu; ++ u64 path_mig_state; ++ u32 qkey; ++ u32 rq_psn; ++ u32 sq_psn; ++ u64 dest_qp_num; ++ ++ u32 cap_max_send_wr; ++ u32 cap_max_recv_wr; ++ u32 cap_max_send_sge; ++ u32 cap_max_recv_sge; ++ u32 cap_max_inline_data; ++ ++ u64 ah_attr_grh_dgid_subnet_prefix; ++ u64 ah_attr_grh_dgid_interface_id; ++ u32 ah_attr_grh_flow_label; ++ u8 ah_attr_grh_sgid_index; ++ u8 ah_attr_grh_hop_limit; ++ u8 ah_attr_grh_traffic_class; ++ u16 ah_attr_dlid; ++ u8 ah_attr_sl; ++ u8 ah_attr_src_path_bits; ++ u8 ah_attr_static_rate; ++ u8 ah_attr_ah_flags; ++ u8 ah_attr_port_num; ++ ++ u64 alt_attr_grh_dgid_subnet_prefix; ++ u64 alt_attr_grh_dgid_interface_id; ++ u32 alt_attr_grh_flow_label; ++ u8 alt_attr_grh_sgid_index; ++ u8 alt_attr_grh_hop_limit; ++ u8 alt_attr_grh_traffic_class; ++ u16 alt_attr_dlid; ++ u8 alt_attr_sl; ++ u8 alt_attr_src_path_bits; ++ u8 alt_attr_static_rate; ++ u8 alt_attr_ah_flags; ++ u8 alt_attr_port_num; ++ ++ u16 pkey_index; ++ u16 alt_pkey_index; ++ u8 en_sqd_async_notify; ++ u8 sq_draining; ++ u8 max_rd_atomic; ++ u8 max_dest_rd_atomic; ++ u8 min_rnr_timer; ++ u8 port_num; ++ u8 timeout; ++ u8 retry_cnt; ++ u8 rnr_retry; ++ u8 alt_port_num; ++ u8 alt_timeout; ++ ++}; ++ ++struct ibp_cm_req_event_resp { ++ struct ibp_sa_path_rec primary_path; ++ struct ibp_sa_path_rec alternate_path; ++ u64 listen_id; ++ __be64 remote_ca_guid; ++ __u32 remote_qkey; ++ __u32 remote_qpn; ++ __u32 qp_type; ++ __u32 starting_psn; ++ __u8 responder_resources; ++ __u8 initiator_depth; ++ __u8 local_cm_response_timeout; ++ __u8 flow_control; ++ __u8 remote_cm_response_timeout; ++ __u8 retry_count; ++ __u8 rnr_retry_count; ++ __u8 srq; ++ __u8 port; ++ __u8 reserved[7]; ++}; ++ ++struct ibp_cm_rep_event_resp { ++ __be64 remote_ca_guid; ++ __u32 remote_qkey; ++ __u32 remote_qpn; ++ __u32 starting_psn; ++ __u8 responder_resources; ++ __u8 initiator_depth; ++ __u8 target_ack_delay; ++ __u8 failover_accepted; ++ __u8 flow_control; ++ __u8 rnr_retry_count; ++ __u8 srq; ++ __u8 reserved[5]; ++}; ++ ++struct ibp_cm_rej_event_resp { ++ __u32 reason; ++}; ++ ++struct ibp_cm_mra_event_resp { ++ __u8 timeout; ++ __u8 reserved[3]; ++}; ++ ++struct ibp_cm_lap_event_resp { ++ struct ibp_sa_path_rec path; ++}; ++ ++struct ibp_cm_rtu_event_resp { ++ __u32 status; ++ __be32 local_id; ++ __be32 remote_id; ++}; ++ ++struct ibp_cm_apr_event_resp { ++ __u32 status; ++}; ++ ++struct ibp_cm_sidr_req_event_resp { ++ u64 listen_id; ++ __u16 pkey; ++ __u8 port; ++ __u8 reserved; ++}; ++ ++struct ibp_cm_sidr_rep_event_resp { ++ __u32 status; ++ __u32 qkey; ++ __u32 qpn; ++}; ++ ++struct ibp_cm_event { ++ enum ib_event_type event_type; ++ union { ++ struct ibp_cm_req_event_resp req_resp; ++ struct ibp_cm_rep_event_resp rep_resp; ++ struct ibp_cm_rej_event_resp rej_resp; ++ struct ibp_cm_rtu_event_resp rtu_resp; ++ struct ibp_cm_mra_event_resp mra_resp; ++ struct ibp_cm_lap_event_resp lap_resp; ++ struct ibp_cm_apr_event_resp apr_resp; ++ struct ibp_cm_sidr_req_event_resp sidr_req_resp; ++ struct ibp_cm_sidr_rep_event_resp sidr_rep_resp; ++ ++ __u32 send_status; ++ } u; ++ ++ u64 event_cm_id; ++ u64 ibp_cm_id; ++ u64 data_length; ++ u64 info_length; ++ ++ u8 data[0]; ++}; ++ ++#endif /* CM_IBP_ABI_H */ +diff -urN 
a6/drivers/infiniband/ibp/cm/cm_server_msg.c a7/drivers/infiniband/ibp/cm/cm_server_msg.c +--- a6/drivers/infiniband/ibp/cm/cm_server_msg.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/cm_server_msg.c 2015-02-23 10:18:09.042820508 -0800 +@@ -0,0 +1,1058 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include "server.h" ++ ++LIST_HEAD(cm_entry_list); ++ ++void ibp_copy_sa_path_rec(struct ibp_sa_path_rec *a, struct ib_sa_path_rec *b) ++{ ++ /*Copy ibp_sa_path_rec to ib_sa_path_rec*/ ++ b->service_id = a->service_id; ++ b->dgid.global.subnet_prefix = a->dgid_prefix; ++ b->dgid.global.interface_id = a->dgid_id; ++ b->sgid.global.subnet_prefix = a->sgid_prefix; ++ b->sgid.global.interface_id = a->sgid_id; ++ b->dlid = a->dlid; ++ b->slid = a->slid; ++ b->raw_traffic = a->raw_traffic; ++ b->flow_label = a->flow_label; ++ b->hop_limit = a->hop_limit; ++ b->traffic_class = a->traffic_class; ++ b->reversible = a->reversible; ++ b->numb_path = a->numb_path; ++ b->pkey = a->pkey; ++ b->qos_class = a->qos_class; ++ b->sl = a->sl; ++ b->mtu_selector = a->mtu_selector; ++ b->mtu = a->mtu; ++ b->rate_selector = a->rate_selector; ++ b->rate = a->rate; ++ b->packet_life_time_selector = a->packet_life_time_selector; ++ b->packet_life_time = a->packet_life_time; ++ b->preference = a->preference; ++} ++ ++void ib_copy_sa_path_rec(struct ibp_sa_path_rec *a, struct ib_sa_path_rec *b) ++{ ++ /*Copy ib_sa_path_rec to ibp_sa_path_rec*/ ++ a->service_id = b->service_id; ++ a->dgid_prefix = b->dgid.global.subnet_prefix; ++ a->dgid_id = b->dgid.global.interface_id; ++ a->sgid_prefix = b->sgid.global.subnet_prefix; ++ a->sgid_id = b->sgid.global.interface_id; ++ a->dlid = b->dlid; ++ a->slid = b->slid; ++ a->raw_traffic = b->raw_traffic; ++ a->flow_label = b->flow_label; ++ a->hop_limit = b->hop_limit; ++ a->traffic_class = b->traffic_class; ++ a->reversible = b->reversible; ++ a->numb_path = b->numb_path; ++ a->pkey = b->pkey; ++ a->qos_class = b->qos_class; ++ a->sl = b->sl; ++ a->mtu_selector = b->mtu_selector; ++ a->mtu = b->mtu; ++ a->rate_selector = b->rate_selector; ++ a->rate = b->rate; ++ 
a->packet_life_time_selector = b->packet_life_time_selector; ++ a->packet_life_time = b->packet_life_time; ++ a->preference = b->preference; ++} ++ ++void cleanup_cm_entry_list(void) ++{ ++ struct cm_entry *entry; ++ struct cm_entry *next; ++ ++ down_write(&list_rwsem); ++ ++ list_for_each_entry_safe(entry, next, &cm_entry_list, list) ++ kfree(entry); ++ ++ up_write(&list_rwsem); ++} ++ ++static struct cm_entry *find_cm_entry(struct ib_cm_id *cm_id) ++{ ++ struct cm_entry *entry; ++ ++ down_read(&list_rwsem); ++ ++ list_for_each_entry(entry, &cm_entry_list, list) ++ if (entry->cm_id == cm_id) ++ goto out; ++ ++ print_err("Could not find cm id %p\n", cm_id); ++ entry = NULL; ++ ++out: ++ up_read(&list_rwsem); ++ ++ return entry; ++} ++ ++/* find the entry id for the listen cm id so we can add the new cm id ++ * that is being accepted to the list so it can be found on future events ++ */ ++static struct cm_entry *find_cm_entry_and_add(struct ib_cm_id *listen_id, ++ struct ib_cm_id *cm_id) ++{ ++ struct cm_entry *entry; ++ struct cm_entry *listen_entry; ++ ++ listen_entry = find_cm_entry(listen_id); ++ if (!listen_entry) { ++ print_err("Could not find listen id %p\n", listen_id); ++ return NULL; ++ } ++ ++ entry = kzalloc(sizeof(struct cm_entry), GFP_KERNEL); ++ if (!entry) { ++ print_err("kzalloc failed\n"); ++ return NULL; ++ } ++ ++ entry->client = listen_entry->client; ++ entry->cm_id = cm_id; ++ ++ down_write(&list_rwsem); ++ list_add(&entry->list, &cm_entry_list); ++ up_write(&list_rwsem); ++ ++ return listen_entry; ++} ++ ++static void ibp_event_req_get(struct ibp_cm_req_event_resp *proxy_req, ++ struct ib_cm_req_event_param *req) ++ ++{ ++ proxy_req->listen_id = (u64) req->listen_id; ++ proxy_req->remote_ca_guid = req->remote_ca_guid; ++ proxy_req->remote_qkey = req->remote_qkey; ++ proxy_req->remote_qpn = req->remote_qpn; ++ proxy_req->qp_type = req->qp_type; ++ proxy_req->starting_psn = req->starting_psn; ++ proxy_req->responder_resources = req->responder_resources; ++ proxy_req->initiator_depth = req->initiator_depth; ++ proxy_req->local_cm_response_timeout = req->local_cm_response_timeout; ++ proxy_req->flow_control = req->flow_control; ++ proxy_req->remote_cm_response_timeout = req->remote_cm_response_timeout; ++ proxy_req->retry_count = req->retry_count; ++ proxy_req->rnr_retry_count = req->rnr_retry_count; ++ proxy_req->srq = req->srq; ++ proxy_req->port = req->port; ++ ib_copy_sa_path_rec(&proxy_req->primary_path, req->primary_path); ++ if (req->alternate_path) ++ ib_copy_sa_path_rec(&proxy_req->alternate_path, ++ req->alternate_path); ++} ++ ++static void ibp_event_rep_get(struct ibp_cm_rep_event_resp *proxy_rep, ++ struct ib_cm_rep_event_param *rep) ++{ ++ proxy_rep->remote_ca_guid = rep->remote_ca_guid; ++ proxy_rep->remote_qkey = rep->remote_qkey; ++ proxy_rep->remote_qpn = rep->remote_qpn; ++ proxy_rep->starting_psn = rep->starting_psn; ++ proxy_rep->responder_resources = rep->responder_resources; ++ proxy_rep->initiator_depth = rep->initiator_depth; ++ proxy_rep->target_ack_delay = rep->target_ack_delay; ++ proxy_rep->failover_accepted = rep->failover_accepted; ++ proxy_rep->flow_control = rep->flow_control; ++ proxy_rep->rnr_retry_count = rep->rnr_retry_count; ++ proxy_rep->srq = rep->srq; ++} ++ ++static ++void ibp_event_sidr_rep_get(struct ibp_cm_sidr_rep_event_resp *proxy_resp, ++ struct ib_cm_sidr_rep_event_param *rep) ++{ ++ proxy_resp->status = rep->status; ++ proxy_resp->qkey = rep->qkey; ++ proxy_resp->qpn = rep->qpn; ++} ++ ++static void ibp_event(struct 
work_struct *work) ++{ ++ struct ibp_event *event_work; ++ struct ibp_event_msg *msg; ++ int msg_len; ++ int event_len; ++ ++ print_trace("in\n"); ++ ++ event_work = (struct ibp_event *) work; ++ ++ event_len = event_work->event.data_length + ++ event_work->event.info_length + ++ sizeof(struct ibp_cm_event); ++ ++ msg_len = sizeof(struct ibp_event_msg) + event_len; ++ ++ msg = kzalloc(msg_len, GFP_KERNEL); ++ if (!msg) { ++ print_err("kzmalloc failed\n"); ++ goto err; ++ } ++ ++ memcpy(msg->event, &(event_work->event), event_len); ++ msg->length = event_len; ++ ++ IBP_INIT_MSG(NULL, msg, msg_len, IBP_EVENT); ++ ++ ibp_send(event_work->client->ep, msg, msg_len); ++err: ++ kfree(event_work); ++} ++ ++static int ibp_event_handler(struct ib_cm_id *cm_id, ++ struct ib_cm_event *ib_cm_event) ++{ ++ struct ibp_event *event_work; ++ struct ibp_client *client; ++ struct cm_entry *entry; ++ void *info = NULL; ++ int info_length = 0; ++ int data_length = 0; ++ ++ print_trace("in\n"); ++ ++ switch (ib_cm_event->event) { ++ case IB_CM_REQ_RECEIVED: ++ data_length = IB_CM_REQ_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_REP_RECEIVED: ++ data_length = IB_CM_REP_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_RTU_RECEIVED: ++ data_length = IB_CM_RTU_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_DREQ_RECEIVED: ++ data_length = IB_CM_DREQ_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_DREP_RECEIVED: ++ data_length = IB_CM_DREP_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_MRA_RECEIVED: ++ data_length = IB_CM_MRA_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_REJ_RECEIVED: ++ data_length = IB_CM_REJ_PRIVATE_DATA_SIZE; ++ info_length = ib_cm_event->param.rej_rcvd.ari_length; ++ break; ++ case IB_CM_LAP_RECEIVED: ++ data_length = IB_CM_LAP_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_APR_RECEIVED: ++ data_length = IB_CM_APR_PRIVATE_DATA_SIZE; ++ info_length = ib_cm_event->param.apr_rcvd.info_len; ++ break; ++ case IB_CM_SIDR_REQ_RECEIVED: ++ data_length = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; ++ break; ++ case IB_CM_SIDR_REP_RECEIVED: ++ data_length = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; ++ info_length = ib_cm_event->param.sidr_rep_rcvd.info_len; ++ break; ++ default: ++ break; ++ } ++ event_work = kzalloc((sizeof(struct ibp_event)) + ++ data_length + info_length, GFP_KERNEL); ++ if (!event_work) { ++ print_err("kzalloc failed\n"); ++ return -ENOMEM; ++ } ++ ++ if (ib_cm_event->event == IB_CM_REQ_RECEIVED) { ++ struct ib_cm_req_event_param *param; ++ param = &ib_cm_event->param.req_rcvd; ++ entry = find_cm_entry_and_add(param->listen_id, cm_id); ++ } else if (ib_cm_event->event == IB_CM_SIDR_REQ_RECEIVED) { ++ struct ib_cm_sidr_req_event_param *param; ++ param = &ib_cm_event->param.sidr_req_rcvd; ++ entry = find_cm_entry_and_add(param->listen_id, cm_id); ++ } else ++ entry = find_cm_entry(cm_id); ++ ++ if (!entry) { ++ kfree(event_work); ++ return -EINVAL; ++ } ++ ++ client = entry->client; ++ ++ event_work->client = client; ++ event_work->event.ibp_cm_id = (u64) entry->cm_id; ++ event_work->event.event_cm_id = (u64) cm_id; ++ event_work->event.event_type = ib_cm_event->event; ++ event_work->event.data_length = data_length; ++ event_work->event.info_length = info_length; ++ ++ /* parse and copy the proper event */ ++ switch (ib_cm_event->event) { ++ case IB_CM_REQ_RECEIVED: ++ print_dbg("IB_CM_REQ_RECEIVED (%d)\n", ib_cm_event->event); ++ ibp_event_req_get(&event_work->event.u.req_resp, ++ &ib_cm_event->param.req_rcvd); ++ break; ++ case IB_CM_REP_RECEIVED: ++ print_dbg("IB_CM_REP_RECEIVED (%d)\n", ib_cm_event->event); ++ 
ibp_event_rep_get(&event_work->event.u.rep_resp, ++ &ib_cm_event->param.rep_rcvd); ++ break; ++ case IB_CM_MRA_RECEIVED: ++ print_dbg("IB_CM_MRA_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.mra_resp.timeout = ++ ib_cm_event->param.mra_rcvd.service_timeout; ++ break; ++ case IB_CM_REJ_RECEIVED: ++ print_dbg("IB_CM_REJ_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.rej_resp.reason = ++ ib_cm_event->param.rej_rcvd.reason; ++ info = ib_cm_event->param.rej_rcvd.ari; ++ break; ++ case IB_CM_RTU_RECEIVED: ++ print_dbg("IB_CM_RTU_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.rtu_resp.status = ++ ib_cm_event->param.send_status; ++ event_work->event.u.rtu_resp.local_id = cm_id->local_id; ++ event_work->event.u.rtu_resp.remote_id = cm_id->remote_id; ++ break; ++ case IB_CM_LAP_RECEIVED: ++ print_dbg("IB_CM_LAP_RECEIVED (%d)\n", ib_cm_event->event); ++ ib_copy_sa_path_rec(&event_work->event.u.lap_resp.path, ++ ib_cm_event->param.lap_rcvd.alternate_path); ++ break; ++ case IB_CM_APR_RECEIVED: ++ print_dbg("IB_CM_APR_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.apr_resp.status = ++ ib_cm_event->param.apr_rcvd.ap_status; ++ info = ib_cm_event->param.apr_rcvd.apr_info; ++ break; ++ case IB_CM_SIDR_REQ_RECEIVED: ++ print_dbg("IB_CM_SIDR_REQ_RECEIVED (%d)\n", ++ ib_cm_event->event); ++ event_work->event.u.sidr_req_resp.listen_id = ++ (u64) ib_cm_event->param.sidr_req_rcvd.listen_id; ++ event_work->event.u.sidr_req_resp.pkey = ++ ib_cm_event->param.sidr_req_rcvd.pkey; ++ event_work->event.u.sidr_req_resp.port = ++ ib_cm_event->param.sidr_req_rcvd.port; ++ break; ++ case IB_CM_SIDR_REP_RECEIVED: ++ print_dbg("IB_CM_SIDR_REP_RECEIVED (%d)\n", ++ ib_cm_event->event); ++ ibp_event_sidr_rep_get(&event_work->event.u.sidr_rep_resp, ++ &ib_cm_event->param.sidr_rep_rcvd); ++ info = ib_cm_event->param.sidr_rep_rcvd.info; ++ break; ++ case IB_CM_TIMEWAIT_EXIT: ++ case IB_CM_REQ_ERROR: ++ case IB_CM_REP_ERROR: ++ case IB_CM_DREQ_ERROR: ++ case IB_CM_LAP_ERROR: ++ case IB_CM_SIDR_REQ_ERROR: ++ print_dbg("IB_CM_..._ERROR (%d)\n", ib_cm_event->event); ++ event_work->event.u.send_status = ++ ib_cm_event->param.send_status; ++ break; ++ ++ case IB_CM_USER_ESTABLISHED: ++ print_dbg("IB_CM_USER_ESTABLISHED (%d)\n", ++ ib_cm_event->event); ++ event_work->event.u.send_status = ++ ib_cm_event->param.send_status; ++ break; ++ case IB_CM_DREQ_RECEIVED: ++ print_dbg("IB_CM_DREQ_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.send_status = ++ ib_cm_event->param.send_status; ++ break; ++ case IB_CM_DREP_RECEIVED: ++ print_dbg("IB_CM_DREP_RECEIVED (%d)\n", ib_cm_event->event); ++ event_work->event.u.send_status = ++ ib_cm_event->param.send_status; ++ break; ++ default: ++ print_dbg("event not handled %d\n", ib_cm_event->event); ++ break; ++ } ++ ++ if (data_length) ++ memcpy(event_work->event.data, ib_cm_event->private_data, ++ data_length); ++ ++ if (info_length) ++ memcpy(event_work->event.data + data_length, info, info_length); ++ ++ INIT_WORK(&event_work->work, ibp_event); ++ queue_work(client->workqueue, &event_work->work); ++ ++ return 0; ++} ++ ++int ibp_cmd_create_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_create_cm_id_cmd *cmd; ++ struct ibp_create_cm_id_resp *resp; ++ struct ib_device *ib_device; ++ struct ib_cm_id *cm_id = NULL; ++ struct cm_entry *entry; ++ size_t len; ++ int status = 0; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_create_cm_id_cmd *) hdr; ++ 
ib_device = (struct ib_device *) cmd->device; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = kzalloc(sizeof(struct cm_entry), GFP_KERNEL); ++ if (!entry) { ++ print_err("kzalloc failed\n"); ++ status = -ENOMEM; ++ goto send_resp; ++ } ++ ++ cm_id = ib_create_cm_id(ib_device, ++ (ib_cm_handler) ibp_event_handler, ++ NULL); ++ if (IS_ERR(cm_id)) { ++ status = PTR_ERR(cm_id); ++ print_err("ib_create_cm_id returned %d\n", status); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ ++ resp = (struct ibp_create_cm_id_resp *) msg->data; ++ ++ resp->ibp_cm_id = (u64) cm_id; ++ resp->service_id = cm_id->service_id; ++ resp->service_mask = cm_id->service_mask; ++ resp->local_id = cm_id->local_id; ++ resp->remote_id = cm_id->remote_id; ++ resp->remote_cm_qpn = cm_id->remote_cm_qpn; ++ ++send_resp: ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, status); ++ ++ ret = ibp_send(client->ep, msg, len); ++ if (ret) { ++ kfree(entry); ++ print_err("ibp_send returned %d\n", ret); ++ return ret; ++ } ++ if (status) { ++ kfree(entry); ++ return status; ++ } ++ ++ entry->client = client; ++ entry->cm_id = cm_id; ++ ++ down_write(&list_rwsem); ++ list_add(&entry->list, &cm_entry_list); ++ up_write(&list_rwsem); ++ ++ return 0; ++} ++ ++int ibp_cmd_destroy_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_destroy_cm_id_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct cm_entry *entry; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_destroy_cm_id_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = find_cm_entry(cm_id); ++ if (!entry) ++ goto send_resp; ++ ++ down_write(&list_rwsem); ++ list_del(&entry->list); ++ up_write(&list_rwsem); ++ ++ kfree(entry); ++ ++ ib_destroy_cm_id(cm_id); ++ ++send_resp: ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_cm_listen(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_cm_listen_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct ib_cm_compare_data *data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_cm_listen_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (!cmd->null_comp_data) ++ data = &(cmd->compare_data); ++ ++ ret = ib_cm_listen(cm_id, cmd->service_id, cmd->service_mask, data); ++ if (ret) ++ print_err("ib_cm_listen returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_req(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_req_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct ib_cm_req_param param = {0}; ++ struct ib_sa_path_rec primary_path; ++ struct ib_sa_path_rec alternate_path; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_req_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->alternate_path.pkey) { ++ param.alternate_path = &alternate_path; ++ ibp_copy_sa_path_rec(&cmd->alternate_path, &alternate_path); ++ } ++ ++ param.primary_path = &primary_path; 
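++ /* rebuild the kernel ib_sa_path_rec from the flattened proxy path fields */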
++ ibp_copy_sa_path_rec(&cmd->primary_path, &primary_path); ++ ++ param.service_id = cmd->service_id; ++ param.qp_num = cmd->qp_num; ++ param.qp_type = cmd->qp_type; ++ param.starting_psn = cmd->starting_psn; ++ param.peer_to_peer = cmd->peer_to_peer; ++ param.responder_resources = cmd->responder_resources; ++ param.initiator_depth = cmd->initiator_depth; ++ param.remote_cm_response_timeout = cmd->remote_cm_response_timeout; ++ param.flow_control = cmd->flow_control; ++ param.local_cm_response_timeout = cmd->local_cm_response_timeout; ++ param.retry_count = cmd->retry_count; ++ param.rnr_retry_count = cmd->rnr_retry_count; ++ param.max_cm_retries = cmd->max_cm_retries; ++ param.srq = cmd->srq; ++ param.private_data_len = cmd->private_data_len; ++ ++ if (cmd->private_data_len) ++ param.private_data = cmd->private_data; ++ ++ ret = ib_send_cm_req(cm_id, ¶m); ++ ++ if (ret) ++ print_err("send_cm_req returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_rep(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_rep_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct ib_cm_rep_param param = {0}; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_rep_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ param.qp_num = cmd->qp_num; ++ param.starting_psn = cmd->starting_psn; ++ param.responder_resources = cmd->responder_resources; ++ param.initiator_depth = cmd->initiator_depth; ++ param.failover_accepted = cmd->failover_accepted; ++ param.rnr_retry_count = cmd->rnr_retry_count; ++ param.srq = cmd->srq; ++ param.private_data_len = cmd->private_data_len; ++ ++ if (cmd->private_data_len) ++ param.private_data = cmd->private_data; ++ ++ ret = ib_send_cm_rep(cm_id, ¶m); ++ if (ret) ++ print_err("send_cm_rep returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_rtu(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_send_cm_rtu_cmd *cmd; ++ struct ibp_response_msg *msg; ++ struct ib_cm_id *cm_id; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_rtu_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->private_data; ++ ++ ret = ib_send_cm_rtu(cm_id, private_data, cmd->private_data_len); ++ if (ret) ++ print_err("send_cm_rtu returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_dreq(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_dreq_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_dreq_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->private_data; ++ ++ ret = ib_send_cm_dreq(cm_id, private_data, cmd->private_data_len); ++ if (ret) ++ print_dbg("send_cm_dreq returned 
%d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_drep(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_drep_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_drep_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->private_data; ++ ++ ret = ib_send_cm_drep(cm_id, private_data, cmd->private_data_len); ++ if (ret) ++ print_dbg("send_cm_drep returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_rej(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_rej_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ void *ari; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_rej_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->data; ++ ++ ari = &(cmd->data[cmd->private_data_len]); ++ ++ ret = ib_send_cm_rej(cm_id, cmd->reason, ari, cmd->ari_length, ++ private_data, cmd->private_data_len); ++ if (ret) ++ print_err("send_cm_rej returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_mra(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_mra_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_mra_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->private_data; ++ ++ ret = ib_send_cm_mra(cm_id, cmd->service_timeout, ++ private_data, cmd->private_data_len); ++ if (ret) ++ print_err("send_cm_mra returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_lap(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_lap_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct ib_sa_path_rec alt_path; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_lap_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->private_data; ++ ++ ibp_copy_sa_path_rec(&cmd->alternate_path, &alt_path); ++ ++ ret = ib_send_cm_lap(cm_id, &alt_path, ++ private_data, cmd->private_data_len); ++ if (ret) ++ print_err("send_cm_lap returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_send_cm_apr(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct 
ibp_send_cm_apr_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ void *info = NULL; ++ void *private_data = NULL; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_apr_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ if (cmd->private_data_len) ++ private_data = cmd->data; ++ if (cmd->info_length) ++ info = &(cmd->data[cmd->private_data_len]); ++ ++ ret = ib_send_cm_apr(cm_id, cmd->status, info, cmd->info_length, ++ private_data, cmd->private_data_len); ++ if (ret) ++ print_err("send_cm_apr returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ++ibp_cmd_send_cm_sidr_req(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_sidr_req_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ struct ib_cm_sidr_req_param param = {0}; ++ struct ib_sa_path_rec path; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_sidr_req_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ param.path = &path; ++ ibp_copy_sa_path_rec(&cmd->path, &path); ++ ++ param.service_id = cmd->service_id; ++ param.timeout_ms = cmd->timeout_ms; ++ param.max_cm_retries = cmd->max_cm_retries; ++ param.private_data_len = cmd->private_data_len; ++ ++ if (cmd->private_data_len) ++ param.private_data = cmd->private_data; ++ ++ ret = ib_send_cm_sidr_req(cm_id, ¶m); ++ if (ret) ++ print_err("send_cm_sidr_req returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ++ibp_cmd_send_cm_sidr_rep(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_send_cm_sidr_rep_cmd *cmd; ++ struct ib_cm_sidr_rep_param param = {0}; ++ struct ib_cm_id *cm_id; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_send_cm_sidr_rep_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ ++ param.qp_num = cmd->qp_num; ++ param.qkey = cmd->qkey; ++ param.status = cmd->status; ++ param.info_length = cmd->info_length; ++ param.private_data_len = cmd->private_data_len; ++ ++ if (cmd->private_data_len) ++ param.private_data = cmd->data; ++ if (cmd->info_length) ++ param.info = &(cmd->data[cmd->private_data_len]); ++ ++ ret = ib_send_cm_sidr_rep(cm_id, ¶m); ++ if (ret) ++ print_err("send_cm_sidr_rep returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ibp_cmd_cm_notify(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_cm_notify_cmd *cmd; ++ struct ib_cm_id *cm_id; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_cm_notify_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_cm_notify(cm_id, cmd->event); ++ if (ret) ++ print_err("cm_notify returned %d\n", ret); ++ ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} ++ ++int ++ibp_cmd_cm_init_qp_attr(struct ibp_client *client, struct ibp_msg_header 
*hdr) ++{ ++ struct ibp_response_msg *msg; ++ struct ibp_cm_init_qp_attr_cmd *cmd; ++ struct ibp_cm_init_qp_attr_resp *resp; ++ struct ib_cm_id *cm_id; ++ struct ib_qp_attr qp_attr; ++ int qp_attr_mask; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_cm_init_qp_attr_cmd *) hdr; ++ cm_id = (struct ib_cm_id *) cmd->ibp_cm_id; ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ qp_attr.qp_state = cmd->qp_attr_state; ++ ++ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask); ++ if (ret) { ++ print_err("init_qp_attr returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ /* Workaround to avoid modify_qp error from Xeon Phi IPoIB connected mode */ ++ qp_attr_mask &= ~IB_QP_SMAC; ++ ++ len += sizeof(*resp); ++ ++ resp = (struct ibp_cm_init_qp_attr_resp *) msg->data; ++ ++ resp->qp_attr_mask = qp_attr_mask; ++ resp->qp_access_flags = qp_attr.qp_access_flags; ++ resp->qp_state = qp_attr.qp_state; ++ resp->cur_qp_state = qp_attr.cur_qp_state; ++ resp->path_mtu = qp_attr.path_mtu; ++ resp->path_mig_state = qp_attr.path_mig_state; ++ resp->qkey = qp_attr.qkey; ++ resp->rq_psn = qp_attr.rq_psn; ++ resp->sq_psn = qp_attr.sq_psn; ++ resp->dest_qp_num = qp_attr.dest_qp_num; ++ ++ resp->cap_max_send_wr = qp_attr.cap.max_send_wr; ++ resp->cap_max_recv_wr = qp_attr.cap.max_recv_wr; ++ resp->cap_max_send_sge = qp_attr.cap.max_send_sge; ++ resp->cap_max_recv_sge = qp_attr.cap.max_recv_sge; ++ resp->cap_max_inline_data = qp_attr.cap.max_inline_data; ++ ++ resp->ah_attr_grh_dgid_subnet_prefix = ++ qp_attr.ah_attr.grh.dgid.global.subnet_prefix; ++ resp->ah_attr_grh_dgid_interface_id = ++ qp_attr.ah_attr.grh.dgid.global.interface_id; ++ resp->ah_attr_grh_flow_label = qp_attr.ah_attr.grh.flow_label; ++ resp->ah_attr_grh_sgid_index = qp_attr.ah_attr.grh.sgid_index; ++ resp->ah_attr_grh_hop_limit = qp_attr.ah_attr.grh.hop_limit; ++ resp->ah_attr_grh_traffic_class = qp_attr.ah_attr.grh.traffic_class; ++ resp->ah_attr_dlid = qp_attr.ah_attr.dlid; ++ resp->ah_attr_sl = qp_attr.ah_attr.sl; ++ resp->ah_attr_src_path_bits = qp_attr.ah_attr.src_path_bits; ++ resp->ah_attr_static_rate = qp_attr.ah_attr.static_rate; ++ resp->ah_attr_ah_flags = qp_attr.ah_attr.ah_flags; ++ resp->ah_attr_port_num = qp_attr.ah_attr.port_num; ++ ++ resp->alt_attr_grh_dgid_subnet_prefix = ++ qp_attr.alt_ah_attr.grh.dgid.global.subnet_prefix; ++ resp->alt_attr_grh_dgid_interface_id = ++ qp_attr.alt_ah_attr.grh.dgid.global.interface_id; ++ resp->alt_attr_grh_flow_label = qp_attr.alt_ah_attr.grh.flow_label; ++ resp->alt_attr_grh_sgid_index = qp_attr.alt_ah_attr.grh.sgid_index; ++ resp->alt_attr_grh_hop_limit = qp_attr.alt_ah_attr.grh.hop_limit; ++ resp->alt_attr_grh_traffic_class ++ = qp_attr.alt_ah_attr.grh.traffic_class; ++ resp->alt_attr_dlid = qp_attr.alt_ah_attr.dlid; ++ resp->alt_attr_sl = qp_attr.alt_ah_attr.sl; ++ resp->alt_attr_src_path_bits = qp_attr.alt_ah_attr.src_path_bits; ++ resp->alt_attr_static_rate = qp_attr.alt_ah_attr.static_rate; ++ resp->alt_attr_ah_flags = qp_attr.alt_ah_attr.ah_flags; ++ resp->alt_attr_port_num = qp_attr.alt_ah_attr.port_num; ++ ++ resp->pkey_index = qp_attr.pkey_index; ++ resp->alt_pkey_index = qp_attr.alt_pkey_index; ++ resp->en_sqd_async_notify = qp_attr.en_sqd_async_notify; ++ resp->sq_draining = qp_attr.sq_draining; ++ resp->max_rd_atomic = qp_attr.max_rd_atomic; ++ resp->max_dest_rd_atomic = qp_attr.max_dest_rd_atomic; ++ resp->min_rnr_timer = qp_attr.min_rnr_timer; ++ resp->port_num = qp_attr.port_num; ++ resp->timeout = qp_attr.timeout; ++ 
resp->retry_cnt = qp_attr.retry_cnt; ++ resp->rnr_retry = qp_attr.rnr_retry; ++ resp->alt_port_num = qp_attr.alt_port_num; ++ resp->alt_timeout = qp_attr.alt_timeout; ++ ++send_resp: ++ IBP_INIT_RESP(cm_id, msg, len, IBP_RESPONSE, hdr->request, ret); ++ ++ return ibp_send(client->ep, msg, len); ++} +diff -urN a6/drivers/infiniband/ibp/cm/common.h a7/drivers/infiniband/ibp/cm/common.h +--- a6/drivers/infiniband/ibp/cm/common.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/common.h 2015-02-23 10:01:30.289769309 -0800 +@@ -0,0 +1,106 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef COMMON_H ++#define COMMON_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRV_DESC "CCL Direct CM " DRV_ROLE ++#define DRV_VERSION "1.0" ++#define DRV_BASE "ibp_cm" ++#define PFX DRV_BASE "_" ++#define DRV_PFX DRV_NAME ": " ++ ++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation" ++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n" ++ ++#define MODULE_PARAM(name, var, type, value, desc) \ ++ type var = value; \ ++ module_param_named(name, var, type, 0644); \ ++ MODULE_PARM_DESC(name, desc) ++ ++#ifdef IBP_DEBUG ++extern int debug_level; ++#endif ++ ++enum { ++ IBP_DEBUG_NONE, ++ IBP_DEBUG_TARGETED, ++ IBP_DEBUG_VERBOSE, ++}; ++ ++#define _PRINTK(l, f, arg...) \ ++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg) ++ ++#ifdef IBP_DEBUG ++#define PRINTK(dbg, l, f, arg...) \ ++ do { \ ++ if (debug_level >= dbg) \ ++ printk(l DRV_PFX "%s(%d) " f, \ ++ __func__, __LINE__, ##arg); \ ++ } while (0) ++#else ++#define PRINTK(dbg, l, f, arg...) do { } while (0) ++#endif ++ ++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg) ++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#define print_info(f, arg...) pr_info(f, ##arg) ++ ++#if 0 ++#define FORCED_FUNCTION_TRACING ++#endif ++ ++#ifdef FORCED_FUNCTION_TRACING ++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#else ++#define print_trace(f, arg...) 
PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg) ++#endif ++ ++#ifndef IBP_CM_PORT /* unique scif port for this service */ ++#define IBP_CM_PORT SCIF_OFED_PORT_3 ++#endif ++ ++int ibp_send(scif_epd_t ep, void *buf, size_t len); ++int ibp_recv(scif_epd_t ep, void *buf, size_t len); ++ ++#endif /* COMMON_H */ +diff -urN a6/drivers/infiniband/ibp/cm/ibp-abi.h a7/drivers/infiniband/ibp/cm/ibp-abi.h +--- a6/drivers/infiniband/ibp/cm/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/ibp-abi.h 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef IBP_ABI_H ++#define IBP_ABI_H ++ ++#include ++#include ++#include ++ ++/* Increment this value if any changes break compatibility. */ ++#define IBP_CM_ABI_VERSION 1 ++ ++/* Client to server message enums. */ ++enum { ++ IBP_CREATE_CM_ID, ++ IBP_DESTROY_CM_ID, ++ IBP_CM_LISTEN, ++ IBP_CM_NOTIFY, ++ IBP_SEND_CM_REQ, ++ IBP_SEND_CM_REP, ++ IBP_SEND_CM_RTU, ++ IBP_SEND_CM_DREQ, ++ IBP_SEND_CM_DREP, ++ IBP_SEND_CM_REJ, ++ IBP_SEND_CM_MRA, ++ IBP_SEND_CM_LAP, ++ IBP_SEND_CM_APR, ++ IBP_SEND_CM_SIDR_REQ, ++ IBP_SEND_CM_SIDR_REP, ++ IBP_CM_INIT_QP_ATTR, ++}; ++ ++/* Server to client message enums. */ ++enum { ++ IBP_IBP_EVENT, ++ IBP_IBP_RESPONSE, ++}; ++ ++/* ++ * Make sure that all structs defined in this file are laid out to pack ++ * the same way on different architectures to avoid incompatibility. ++ * ++ * Specifically: ++ * - Do not use pointer types -- pass pointers in a u64 instead. ++ * - Make sure that any structure larger than 4 bytes is padded ++ * to a multiple of 8 bytes; otherwise the structure size may ++ * be different between architectures. 
++ */ ++ ++struct ibp_msg_header { /* present in all messages */ ++ u32 opcode; ++ u32 length; ++ u32 status; ++ u32 reserved; ++ u64 request; ++ u64 data[0]; ++}; ++ ++struct ibp_response_msg { ++ struct ibp_msg_header header; ++ u64 data[0]; ++}; ++ ++#endif /* IBP_ABI_H */ +diff -urN a6/drivers/infiniband/ibp/cm/ibp_exports.h a7/drivers/infiniband/ibp/cm/ibp_exports.h +--- a6/drivers/infiniband/ibp/cm/ibp_exports.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/ibp_exports.h 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef IBP_EXPORTS_H ++#define IBP_EXPORTS_H ++ ++#include ++ ++/* ++ ibp_resolve_ib_device - Return the host ib_device handle ++ @ibdev:Card IB device ++ ++ Upper level drivers may require the host ib_device handle associated ++ with the card ib_device. This routine resolves the card ib_device to ++ the cooresponding host ib_device handle. A value of 0 is returned if ++ no match was found. ++*/ ++u64 ibp_resolve_ib_device(struct ib_device *ibdev); ++ ++ ++#endif /* IBP_EXPORTS_H */ +diff -urN a6/drivers/infiniband/ibp/cm/Makefile a7/drivers/infiniband/ibp/cm/Makefile +--- a6/drivers/infiniband/ibp/cm/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/Makefile 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,21 @@ ++KDIR ?= /lib/modules/`uname -r`/build ++ ++obj-$(CONFIG_IBP_SERVER) += ibp_cm_server.o ++ ++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG ++ ++ibp_cm_server-y := server.o \ ++ server_msg.o \ ++ cm_server_msg.o ++ ++default: ++ $(MAKE) -C $(KDIR) M=`pwd` ++ ++modules_install: ++ $(MAKE) -C $(KDIR) M=`pwd` modules_install ++ ++clean: ++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions ++ ++unix: ++ dos2unix *.[ch] Kconfig Makefile +diff -urN a6/drivers/infiniband/ibp/cm/server.c a7/drivers/infiniband/ibp/cm/server.c +--- a6/drivers/infiniband/ibp/cm/server.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/server.c 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,221 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. 
All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include "server.h" ++ ++MODULE_AUTHOR("Jerrie Coffman"); ++MODULE_AUTHOR("Phil Cayton"); ++MODULE_AUTHOR("Jay Sternberg"); ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++ ++MODULE_PARAM(port, port, int, IBP_CM_PORT, "Connection port"); ++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog"); ++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds"); ++ ++#ifdef IBP_DEBUG ++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all"); ++#endif ++ ++struct rw_semaphore list_rwsem; ++ ++LIST_HEAD(client_list); ++ ++static struct task_struct *listen_thread; ++ ++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node) ++{ ++ struct ibp_client *client; ++ int ret = -ENOMEM; ++ ++ client = kzalloc(sizeof(*client), GFP_KERNEL); ++ if (!client) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(ret); ++ } ++ ++ client->ep = ep; ++ ++ client->rx_buf = (void *)__get_free_page(GFP_KERNEL); ++ if (!client->rx_buf) { ++ print_err("__get_free_page rx_buf failed\n"); ++ goto err0; ++ } ++ ++ client->tx_buf = (void *)__get_free_page(GFP_KERNEL); ++ if (!client->tx_buf) { ++ print_err("__get_free_page tx_buf failed\n"); ++ goto err1; ++ } ++ ++ client->workqueue = create_singlethread_workqueue(DRV_NAME); ++ if (!client->workqueue) { ++ print_err("create_singlethread_workqueue failed\n"); ++ goto err2; ++ } ++ ++ down_write(&list_rwsem); ++ list_add(&client->list, &client_list); ++ up_write(&list_rwsem); ++ ++ client->ibp_cm_client_thread = kthread_run(ibp_process_recvs, ++ client, DRV_NAME); ++ if (!client->ibp_cm_client_thread) { ++ print_err("create cleint thread failed\n"); ++ goto err3; ++ } ++ ++ return client; ++err3: ++ down_write(&list_rwsem); ++ list_del(&client->list); ++ up_write(&list_rwsem); ++ ++ destroy_workqueue(client->workqueue); ++err2: ++ free_page((uintptr_t)client->tx_buf); ++err1: ++ free_page((uintptr_t)client->rx_buf); ++err0: ++ kfree(client); ++ return ERR_PTR(ret); ++} ++ ++static int ibp_cm_listen(void *data) ++{ ++ struct ibp_client *client; ++ struct scif_pollepd listen; ++ struct scif_portID 
peer; ++ scif_epd_t ep; ++ int ret; ++ ++ listen.epd = scif_open(); ++ if (!listen.epd) { ++ print_err("scif_open failed\n"); ++ ret = -EIO; ++ goto err0; ++ } ++ listen.events = POLLIN; ++ ++ ret = scif_bind(listen.epd, port); ++ if (ret < 0) { ++ print_err("scif_bind returned %d\n", ret); ++ goto err1; ++ } ++ ++ ret = scif_listen(listen.epd, backlog); ++ if (ret) { ++ print_err("scif_listen returned %d\n", ret); ++ goto err1; ++ } ++ ++ while (!kthread_should_stop()) { ++ ++ schedule(); ++ ++ ret = scif_poll(&listen, 1, timeout); ++ if (ret == 0) /* timeout */ ++ continue; ++ if (ret < 0) { ++ print_err("scif_poll revents 0x%x\n", listen.revents); ++ continue; ++ } ++ ++ ret = scif_accept(listen.epd, &peer, &ep, 0); ++ if (ret) { ++ print_err("scif_accept returned %d\n", ret); ++ continue; ++ } ++ ++ print_dbg("accepted node %d port %d\n", peer.node, peer.port); ++ ++ client = ibp_create_client(ep, peer.node); ++ if (IS_ERR(client)) { ++ ret = PTR_ERR(client); ++ print_err("ibp_create_client returned %d\n", ret); ++ scif_close(ep); ++ } ++ } ++err1: ++ scif_close(listen.epd); ++err0: ++ return ret; ++} ++ ++static int __init ibp_cm_server_init(void) ++{ ++ int ret = 0; ++ ++ print_info(DRV_SIGNON); ++ ++ init_rwsem(&list_rwsem); ++ ++ /* Start a thread for inbound connections. */ ++ listen_thread = kthread_run(ibp_cm_listen, NULL, DRV_NAME); ++ if (IS_ERR(listen_thread)) { ++ ret = PTR_ERR(listen_thread); ++ print_err("kthread_run returned %d\n", ret); ++ } ++ ++ return ret; ++} ++ ++static void __exit ibp_cm_server_exit(void) ++{ ++ struct ibp_client *client, *next; ++ struct completion done; ++ ++ kthread_stop(listen_thread); ++ ++ down_write(&list_rwsem); ++ list_for_each_entry_safe(client, next, &client_list, list) { ++ init_completion(&done); ++ client->done = &done; ++ ++ /* Close scif ep to unblock the client thread scif_recv */ ++ scif_close(client->ep); ++ ++ up_write(&list_rwsem); ++ ++ /* Wait for client thread to finish */ ++ wait_for_completion(&done); ++ ++ down_write(&list_rwsem); ++ } ++ up_write(&list_rwsem); ++ ++ print_info(DRV_DESC " unloaded\n"); ++} ++ ++module_init(ibp_cm_server_init); ++module_exit(ibp_cm_server_exit); +diff -urN a6/drivers/infiniband/ibp/cm/server.h a7/drivers/infiniband/ibp/cm/server.h +--- a6/drivers/infiniband/ibp/cm/server.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/server.h 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,128 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef SERVER_H ++#define SERVER_H ++ ++#include ++#include ++#include ++#include ++#include "ibp-abi.h" ++#include "cm_ibp_abi.h" ++#include "common.h" ++ ++#define DRV_ROLE "Server" ++#define DRV_NAME "ibp_cm_server" ++ ++#define MAX_MSG_SIZE PAGE_SIZE ++ ++extern int timeout; ++extern struct rw_semaphore list_rwsem; ++extern struct list_head client_list; ++extern struct list_head cm_entry_list; ++ ++struct ibp_client { ++ struct list_head list; ++ scif_epd_t ep; ++ void *rx_buf; ++ void *tx_buf; ++ struct completion *done; ++ struct workqueue_struct *workqueue; ++ struct task_struct *ibp_cm_client_thread; ++}; ++ ++struct cm_entry { ++ struct list_head list; ++ struct ib_cm_id *cm_id; ++ struct ibp_client *client; ++}; ++ ++struct ibp_event_get { ++ __u64 response; ++ __u64 data; ++ __u64 info; ++ __u8 data_len; ++ __u8 info_len; ++ __u8 reserved[6]; ++}; ++ ++struct ibp_event { ++ struct work_struct work; ++ struct ibp_client *client; ++ struct ibp_cm_event event; ++}; ++ ++#define IBP_INIT_MSG(device, msg, size, op) \ ++ do { \ ++ (msg)->header.opcode = IBP_##op; \ ++ (msg)->header.length = (size); \ ++ (msg)->header.status = 0; \ ++ (msg)->header.reserved = 0; \ ++ (msg)->header.request = 0; \ ++ } while (0) ++ ++#define IBP_INIT_RESP(device, resp, size, op, req, stat) \ ++ do { \ ++ (resp)->header.opcode = IBP_##op; \ ++ (resp)->header.length = (size); \ ++ (resp)->header.status = (stat); \ ++ (resp)->header.reserved = 0; \ ++ (resp)->header.request = (req); \ ++ } while (0) ++ ++int ibp_process_recvs(void *p); ++void cleanup_cm_entry_list(void); ++ ++int ibp_cmd_create_cm_id(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_destroy_cm_id(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_cm_listen(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_cm_notify(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_req(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_rep(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_rtu(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_dreq(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_drep(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_rej(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_mra(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_lap(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_apr(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_sidr_req(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_send_cm_sidr_rep(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_cm_event(struct ibp_client *client, struct ibp_msg_header *hdr); ++int ibp_cmd_cm_init_qp_attr(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++ ++#endif /* SERVER_H */ +diff 
-urN a6/drivers/infiniband/ibp/cm/server_msg.c a7/drivers/infiniband/ibp/cm/server_msg.c +--- a6/drivers/infiniband/ibp/cm/server_msg.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/cm/server_msg.c 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,176 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include "server.h" ++#include "cm_ibp_abi.h" ++ ++int ibp_send(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_send(ep, buf, (uint32_t)len, SCIF_SEND_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_send returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++int ibp_recv(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_recv(ep, buf, (uint32_t)len, SCIF_RECV_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_recv returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++static int ++ibp_cmd_bad_request(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_response_msg *msg; ++ size_t len; ++ int status = -EBADRQC; ++ ++ print_dbg("opcode 0x%x\n", hdr->opcode); ++ ++ msg = (struct ibp_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ IBP_INIT_RESP(NULL, msg, len, IBP_RESPONSE, hdr->request, status); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static void ++ibp_cm_destroy_client(struct ibp_client *client) ++{ ++ struct cm_entry *cm, *tmp; ++ ++ down_write(&list_rwsem); ++ list_del(&client->list); ++ list_for_each_entry_safe(cm, tmp, &cm_entry_list, list) ++ if (cm->client == client) { ++ ib_destroy_cm_id(cm->cm_id); ++ list_del(&cm->list); ++ kfree(cm); ++ } ++ up_write(&list_rwsem); ++ ++ destroy_workqueue(client->workqueue); ++ ++ free_page((uintptr_t)client->tx_buf); ++ free_page((uintptr_t)client->rx_buf); ++ ++ if (client->done) ++ complete(client->done); ++ else ++ scif_close(client->ep); ++ ++ kfree(client); ++} ++ ++static int ++(*ibp_msg_table[])(struct ibp_client *c, struct ibp_msg_header *h) = { ++ [IBP_CREATE_CM_ID] = ibp_cmd_create_cm_id, ++ [IBP_DESTROY_CM_ID] = ibp_cmd_destroy_cm_id, ++ 
[IBP_CM_LISTEN] = ibp_cmd_cm_listen, ++ [IBP_CM_NOTIFY] = ibp_cmd_cm_notify, ++ [IBP_SEND_CM_REQ] = ibp_cmd_send_cm_req, ++ [IBP_SEND_CM_REP] = ibp_cmd_send_cm_rep, ++ [IBP_SEND_CM_RTU] = ibp_cmd_send_cm_rtu, ++ [IBP_SEND_CM_DREQ] = ibp_cmd_send_cm_dreq, ++ [IBP_SEND_CM_DREP] = ibp_cmd_send_cm_drep, ++ [IBP_SEND_CM_REJ] = ibp_cmd_send_cm_rej, ++ [IBP_SEND_CM_MRA] = ibp_cmd_send_cm_mra, ++ [IBP_SEND_CM_LAP] = ibp_cmd_send_cm_lap, ++ [IBP_SEND_CM_APR] = ibp_cmd_send_cm_apr, ++ [IBP_SEND_CM_SIDR_REQ] = ibp_cmd_send_cm_sidr_req, ++ [IBP_SEND_CM_SIDR_REP] = ibp_cmd_send_cm_sidr_rep, ++ [IBP_CM_INIT_QP_ATTR] = ibp_cmd_cm_init_qp_attr, ++}; ++ ++int ibp_process_recvs(void *p) ++{ ++ struct ibp_client *client; ++ struct ibp_msg_header *hdr; ++ int ret; ++ ++ client = (struct ibp_client *) p; ++ hdr = (struct ibp_msg_header *) client->rx_buf; ++ ++ for (;;) { ++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr)); ++ if (ret) ++ break; ++ ++ if (hdr->length > MAX_MSG_SIZE) { ++ print_err("message too large, len %u max %lu\n", ++ hdr->length, MAX_MSG_SIZE); ++ ret = -EMSGSIZE; ++ break; ++ } ++ ++ if (hdr->length > sizeof(*hdr)) { ++ ret = ibp_recv(client->ep, hdr->data, ++ hdr->length - sizeof(*hdr)); ++ if (ret) ++ break; ++ } ++ ++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) || ++ !ibp_msg_table[hdr->opcode]) { ++ ibp_cmd_bad_request(client, hdr); ++ continue; ++ } ++ ++ ret = ibp_msg_table[hdr->opcode](client, hdr); ++ if (ret) ++ break; ++ } ++ ++ ibp_cm_destroy_client(client); ++ ++ return ret; ++} +diff -urN a6/drivers/infiniband/ibp/drv/common.h a7/drivers/infiniband/ibp/drv/common.h +--- a6/drivers/infiniband/ibp/drv/common.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/common.h 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,109 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef COMMON_H ++#define COMMON_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRV_DESC "CCL Direct " DRV_ROLE ++#define DRV_VERSION "1.0" ++#define DRV_BASE "ibp" ++#define PFX DRV_BASE "_" ++#define DRV_PFX DRV_NAME ": " ++ ++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation" ++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n" ++ ++#define MODULE_PARAM(name, var, type, value, desc) \ ++ type var = value; \ ++ module_param_named(name, var, type, 0644); \ ++ MODULE_PARM_DESC(name, desc) ++ ++#ifdef IBP_DEBUG ++extern int debug_level; ++#endif ++ ++enum { ++ IBP_DEBUG_NONE, ++ IBP_DEBUG_TARGETED, ++ IBP_DEBUG_VERBOSE, ++}; ++ ++#define _PRINTK(l, f, arg...) \ ++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg) ++ ++#ifdef IBP_DEBUG ++#define PRINTK(dbg, l, f, arg...) \ ++ do { \ ++ if (debug_level >= dbg) \ ++ printk(l DRV_PFX "%s(%d) " f, \ ++ __func__, __LINE__, ##arg); \ ++ } while (0) ++#else ++#define PRINTK(dbg, l, f, arg...) do { } while (0) ++#endif ++ ++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg) ++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#define print_info(f, arg...) pr_info(f, ##arg) ++ ++#if 0 ++#define FORCED_FUNCTION_TRACING ++#endif ++ ++#ifdef FORCED_FUNCTION_TRACING ++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#else ++#define print_trace(f, arg...) PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg) ++#endif ++ ++#ifndef IBP_PORT /* unique scif port for this service */ ++#define IBP_PORT SCIF_OFED_PORT_2 ++#endif ++ ++#define IS_NULL_OR_ERR(p) (!(p) || IS_ERR_VALUE((unsigned long)p)) ++ ++int ibp_init(void); ++ ++void ibp_cleanup(void); ++ ++#endif /* COMMON_H */ +diff -urN a6/drivers/infiniband/ibp/drv/ibp-abi.h a7/drivers/infiniband/ibp/drv/ibp-abi.h +--- a6/drivers/infiniband/ibp/drv/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/ibp-abi.h 2015-02-23 10:01:30.290769309 -0800 +@@ -0,0 +1,649 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef IBP_ABI_H ++#define IBP_ABI_H ++ ++#include ++ ++/* Increment this value if any changes break compatibility. */ ++#define IBP_ABI_VERSION 2 ++ ++/* Client to server message enums. */ ++enum { ++ IBP_VERB_GET_PROTOCOL_STATS, ++ IBP_VERB_QUERY_DEVICE, ++ IBP_VERB_QUERY_PORT, ++ IBP_VERB_GET_LINK_LAYER, ++ IBP_VERB_QUERY_GID, ++ IBP_VERB_QUERY_PKEY, ++ IBP_VERB_MODIFY_DEVICE, ++ IBP_VERB_MODIFY_PORT, ++ IBP_VERB_ALLOC_UCONTEXT, ++ IBP_VERB_DEALLOC_UCONTEXT, ++ IBP_VERB_REG_BUF, ++ IBP_VERB_DEREG_BUF, ++ IBP_VERB_MMAP, ++ IBP_VERB_UNMMAP, ++ IBP_VERB_ALLOC_PD, ++ IBP_VERB_DEALLOC_PD, ++ IBP_VERB_CREATE_AH, ++ IBP_VERB_MODIFY_AH, ++ IBP_VERB_QUERY_AH, ++ IBP_VERB_DESTROY_AH, ++ IBP_VERB_CREATE_SRQ, ++ IBP_VERB_MODIFY_SRQ, ++ IBP_VERB_QUERY_SRQ, ++ IBP_VERB_DESTROY_SRQ, ++ IBP_VERB_POST_SRQ_RECV, ++ IBP_VERB_CREATE_QP, ++ IBP_VERB_MODIFY_QP, ++ IBP_VERB_QUERY_QP, ++ IBP_VERB_DESTROY_QP, ++ IBP_VERB_POST_SEND, ++ IBP_VERB_POST_RECV, ++ IBP_VERB_CREATE_CQ, ++ IBP_VERB_MODIFY_CQ, ++ IBP_VERB_DESTROY_CQ, ++ IBP_VERB_RESIZE_CQ, ++ IBP_VERB_POLL_CQ, ++ IBP_VERB_PEEK_CQ, ++ IBP_VERB_REQ_NOTIFY_CQ, ++ IBP_VERB_REQ_NCOMP_NOTIF, ++ IBP_VERB_GET_DMA_MR, ++ IBP_VERB_REG_PHYS_MR, ++ IBP_VERB_REG_USER_MR, ++ IBP_VERB_QUERY_MR, ++ IBP_VERB_DEREG_MR, ++ IBP_VERB_ALLOC_FAST_REG_MR, ++ IBP_VERB_ALLOC_FAST_REG_PAGE_LIST, ++ IBP_VERB_FREE_FAST_REG_PAGE_LIST, ++ IBP_VERB_REREG_PHYS_MR, ++ IBP_VERB_ALLOC_MW, ++ IBP_VERB_BIND_MW, ++ IBP_VERB_DEALLOC_MW, ++ IBP_VERB_ALLOC_FMR, ++ IBP_VERB_MAP_PHYS_FMR, ++ IBP_VERB_UNMAP_FMR, ++ IBP_VERB_DEALLOC_FMR, ++ IBP_VERB_ATTACH_MCAST, ++ IBP_VERB_DETACH_MCAST, ++ IBP_VERB_PROCESS_MAD, ++ IBP_VERB_ALLOC_XRCD, ++ IBP_VERB_DEALLOC_XRCD, ++}; ++ ++/* Server to client message enums. */ ++enum { ++ IBP_ADD_DEVICE, ++ IBP_REMOVE_DEVICE, ++ IBP_VERB_RESPONSE, ++ IBP_QUEUED_RESPONSE, ++ IBP_ASYNC_EVENT, ++ IBP_CQ_COMP, ++}; ++ ++/* ++ * Make sure that all structs defined in this file are laid out to pack ++ * the same way on different architectures to avoid incompatibility. ++ * ++ * Specifically: ++ * - Do not use pointer types -- pass pointers in a u64 instead. ++ * - Make sure that any structure larger than 4 bytes is padded ++ * to a multiple of 8 bytes; otherwise the structure size may ++ * be different between architectures. 
++ */ ++ ++struct ibp_msg_header { /* present in all messages */ ++ u32 opcode; ++ u32 length; ++ u32 status; ++ u32 reserved; ++ u64 device; ++ u64 request; ++ u64 data[0]; ++}; ++ ++#define IBP_DEVICE_NAME_MAX 64 ++ ++struct ibp_add_device { ++ u8 name[IBP_DEVICE_NAME_MAX]; ++ u32 vendor_id; ++ u32 device_id; ++ u64 ib_device; ++ u64 device; ++ __be64 node_guid; ++ u64 uverbs_cmd_mask; ++ u32 uverbs_abi_ver; ++ u32 ibp_abi_ver; ++ u32 num_comp_vectors; ++ u8 phys_port_cnt; ++ u8 reserved[7]; ++}; ++ ++struct ibp_add_device_msg { ++ struct ibp_msg_header header; ++ struct ibp_add_device data; ++}; ++ ++struct ibp_remove_device_msg { ++ struct ibp_msg_header header; ++}; ++ ++struct ibp_verb_response_msg { ++ struct ibp_msg_header header; ++ u64 data[0]; ++}; ++ ++struct ibp_queued_response_msg { ++ struct ibp_msg_header header; ++ u64 data[0]; ++}; ++ ++struct ibp_async_event { ++ u64 ibdev; ++ u64 context; ++ u32 type; ++ u8 reserved[4]; ++}; ++ ++struct ibp_async_event_msg { ++ struct ibp_msg_header header; ++ struct ibp_async_event data; ++}; ++ ++struct ibp_cq_comp { ++ u64 cq_context; ++}; ++ ++struct ibp_cq_comp_msg { ++ struct ibp_msg_header header; ++ struct ibp_cq_comp data; ++}; ++ ++struct ibp_alloc_ucontext_cmd { ++ struct ibp_msg_header header; ++ u64 ibdev; ++ u64 data[0]; ++}; ++ ++struct ibp_alloc_ucontext_resp { ++ u64 ucontext; ++ u64 data[0]; ++}; ++ ++struct ibp_dealloc_ucontext_cmd { ++ struct ibp_msg_header header; ++ u64 ucontext; ++}; ++ ++struct ibp_mmap_cmd { ++ struct ibp_msg_header header; ++ u64 len; ++ u64 prot; ++ u64 flags; ++ u64 pgoff; ++ u64 ucontext; ++}; ++ ++struct ibp_mmap_resp { ++ u64 mmap; ++ u64 scif_addr; ++}; ++ ++struct ibp_unmmap_cmd { ++ struct ibp_msg_header header; ++ u64 mmap; ++}; ++ ++struct ibp_reg_buf_cmd { ++ struct ibp_msg_header header; ++ u64 ucontext; ++ u64 virt_addr; ++ u64 scif_addr; ++ u64 length; ++ u32 offset; ++ u32 access; ++}; ++ ++struct ibp_reg_buf_resp { ++ u64 reg; ++}; ++ ++struct ibp_dereg_buf_cmd { ++ struct ibp_msg_header header; ++ u64 reg; ++}; ++ ++struct ibp_query_device_cmd { ++ struct ibp_msg_header header; ++}; ++ ++struct ibp_query_device_resp { ++ u64 fw_ver; ++ __be64 sys_image_guid; ++ u64 max_mr_size; ++ u64 page_size_cap; ++ u32 vendor_id; ++ u32 vendor_part_id; ++ u32 hw_ver; ++ u32 max_qp; ++ u32 max_qp_wr; ++ u32 device_cap_flags; ++ u32 max_sge; ++ u32 max_sge_rd; ++ u32 max_cq; ++ u32 max_cqe; ++ u32 max_mr; ++ u32 max_pd; ++ u32 max_qp_rd_atom; ++ u32 max_ee_rd_atom; ++ u32 max_res_rd_atom; ++ u32 max_qp_init_rd_atom; ++ u32 max_ee_init_rd_atom; ++ u32 atomic_cap; ++ u32 masked_atomic_cap; ++ u32 max_ee; ++ u32 max_rdd; ++ u32 max_mw; ++ u32 max_raw_ipv6_qp; ++ u32 max_raw_ethy_qp; ++ u32 max_mcast_grp; ++ u32 max_mcast_qp_attach; ++ u32 max_total_mcast_qp_attach; ++ u32 max_ah; ++ u32 max_fmr; ++ u32 max_map_per_fmr; ++ u32 max_srq; ++ u32 max_srq_wr; ++ u32 max_srq_sge; ++ u32 max_fast_reg_page_list_len; ++ u16 max_pkeys; ++ u8 local_ca_ack_delay; ++ u8 reserved[5]; ++}; ++ ++struct ibp_query_port_cmd { ++ struct ibp_msg_header header; ++ u8 port_num; ++ u8 reserved[7]; ++}; ++ ++struct ibp_query_port_resp { ++ u32 port_cap_flags; ++ u32 max_msg_sz; ++ u32 bad_pkey_cntr; ++ u32 qkey_viol_cntr; ++ u32 gid_tbl_len; ++ u16 pkey_tbl_len; ++ u16 lid; ++ u16 sm_lid; ++ u8 state; ++ u8 max_mtu; ++ u8 active_mtu; ++ u8 lmc; ++ u8 max_vl_num; ++ u8 sm_sl; ++ u8 subnet_timeout; ++ u8 init_type_reply; ++ u8 active_width; ++ u8 active_speed; ++ u8 phys_state; ++ u8 link_layer; ++ u8 reserved[2]; ++}; ++ 
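++/*
++ * Illustrative sketch (not defined by this header): the sending side frames
++ * a query-port request by filling an ibp_query_port_cmd and expects an
++ * ibp_verb_response_msg whose data[] area carries an ibp_query_port_resp.
++ * Here "device" stands for the opaque u64 handle reported in ibp_add_device.
++ *
++ *	struct ibp_query_port_cmd cmd;
++ *
++ *	memset(&cmd, 0, sizeof(cmd));
++ *	cmd.header.opcode = IBP_VERB_QUERY_PORT;
++ *	cmd.header.length = sizeof(cmd);
++ *	cmd.header.device = device;
++ *	cmd.port_num      = 1;
++ *	(send cmd over the SCIF endpoint, then read back the response)
++ */
++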
++struct ibp_query_gid_cmd { ++ struct ibp_msg_header header; ++ u32 index; ++ u8 port_num; ++ u8 reserved[3]; ++}; ++ ++struct ibp_query_gid_resp { ++ __be64 subnet_prefix; ++ __be64 interface_id; ++}; ++ ++struct ibp_query_pkey_cmd { ++ struct ibp_msg_header header; ++ u32 index; ++ u8 port_num; ++ u8 reserved[3]; ++}; ++ ++struct ibp_query_pkey_resp { ++ u16 pkey; ++ u8 reserved[6]; ++}; ++ ++struct ibp_alloc_pd_cmd { ++ struct ibp_msg_header header; ++ u64 ucontext; ++ u64 data[0]; ++}; ++ ++struct ibp_alloc_pd_resp { ++ u64 pd; ++ u64 data[0]; ++}; ++ ++struct ibp_dealloc_pd_cmd { ++ struct ibp_msg_header header; ++ u64 pd; ++}; ++ ++struct ibp_global_route { ++ __be64 dgid_subnet_prefix; ++ __be64 dgid_interface_id; ++ u32 flow_label; ++ u8 sgid_index; ++ u8 hop_limit; ++ u8 traffic_class; ++ u8 reserved[1]; ++}; ++ ++struct ibp_ah_attr { ++ struct ibp_global_route grh; ++ u16 dlid; ++ u8 sl; ++ u8 src_path_bits; ++ u8 static_rate; ++ u8 ah_flags; ++ u8 port_num; ++ u8 reserved[1]; ++}; ++ ++struct ibp_create_ah_cmd { ++ struct ibp_msg_header header; ++ u64 pd; ++ struct ibp_ah_attr ah_attr; ++}; ++ ++struct ibp_create_ah_resp { ++ u64 ah; ++}; ++ ++struct ibp_query_ah_cmd { ++ struct ibp_msg_header header; ++ u64 ah; ++}; ++ ++struct ibp_query_ah_resp { ++ struct ibp_ah_attr attr; ++}; ++ ++struct ibp_destroy_ah_cmd { ++ struct ibp_msg_header header; ++ u64 ah; ++}; ++ ++struct ibp_srq_attr { ++ u32 max_wr; ++ u32 max_sge; ++ u32 srq_limit; ++ u8 reserved[4]; ++}; ++ ++struct ibp_create_srq_cmd { ++ struct ibp_msg_header header; ++ u64 pd; ++ u64 srq_context; ++ struct ibp_srq_attr attr; ++ u64 data[0]; ++}; ++ ++struct ibp_create_srq_resp { ++ u64 srq; ++ struct ibp_srq_attr attr; ++ u64 data[0]; ++}; ++ ++struct ibp_query_srq_cmd { ++ struct ibp_msg_header header; ++ u64 srq; ++}; ++ ++struct ibp_query_srq_resp { ++ struct ibp_srq_attr attr; ++}; ++ ++struct ibp_modify_srq_cmd { ++ struct ibp_msg_header header; ++ u64 srq; ++ struct ibp_srq_attr attr; ++ u32 srq_attr_mask; ++ u8 reserved[4]; ++ u64 data[0]; ++}; ++ ++struct ibp_modify_srq_resp { ++ struct ibp_srq_attr attr; ++ u64 data[0]; ++}; ++ ++struct ibp_destroy_srq_cmd { ++ struct ibp_msg_header header; ++ u64 srq; ++}; ++ ++struct ibp_qp_cap { ++ u32 max_send_wr; ++ u32 max_recv_wr; ++ u32 max_send_sge; ++ u32 max_recv_sge; ++ u32 max_inline_data; ++ u8 reserved[4]; ++}; ++ ++struct ibp_create_qp_cmd { ++ struct ibp_msg_header header; ++ u64 pd; ++ u64 send_cq; ++ u64 recv_cq; ++ u64 srq; ++ u64 xrc_domain; ++ u64 qp_context; ++ struct ibp_qp_cap cap; ++ u8 sq_sig_type; ++ u8 qp_type; ++ u8 create_flags; ++ u8 port_num; ++ u64 data[0]; ++}; ++ ++struct ibp_create_qp_resp { ++ u64 qp; ++ struct ibp_qp_cap cap; ++ u32 qpn; ++ u8 reserved[4]; ++ u64 data[0]; ++}; ++ ++struct ibp_query_qp_cmd { ++ struct ibp_msg_header header; ++ u64 qp; ++ u32 qp_attr_mask; ++ u8 reserved[4]; ++}; ++ ++struct ibp_query_qp_resp { ++ u32 qp_state; ++ u32 cur_qp_state; ++ u32 path_mtu; ++ u32 path_mig_state; ++ u32 qkey; ++ u32 rq_psn; ++ u32 sq_psn; ++ u32 dest_qp_num; ++ u32 qp_access_flags; ++ u32 init_create_flags; ++ struct ibp_qp_cap init_cap; ++ struct ibp_qp_cap cap; ++ struct ibp_ah_attr ah; ++ struct ibp_ah_attr alt_ah; ++ u16 pkey_index; ++ u16 alt_pkey_index; ++ u8 en_sqd_async_notify; ++ u8 sq_draining; ++ u8 max_rd_atomic; ++ u8 max_dest_rd_atomic; ++ u8 min_rnr_timer; ++ u8 port_num; ++ u8 timeout; ++ u8 retry_cnt; ++ u8 rnr_retry; ++ u8 alt_port_num; ++ u8 alt_timeout; ++ u8 init_sq_sig_type; ++}; ++ ++struct ibp_modify_qp_cmd { 
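++	/* qp_attr_mask selects which of the ib_qp_attr-style fields below apply */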
++ struct ibp_msg_header header; ++ u64 qp; ++ u32 qp_attr_mask; ++ u32 qp_state; ++ u32 cur_qp_state; ++ u32 path_mtu; ++ u32 path_mig_state; ++ u32 qkey; ++ u32 rq_psn; ++ u32 sq_psn; ++ u32 dest_qp_num; ++ u32 qp_access_flags; ++ struct ibp_qp_cap cap; ++ struct ibp_ah_attr ah; ++ struct ibp_ah_attr alt_ah; ++ u16 pkey_index; ++ u16 alt_pkey_index; ++ u8 en_sqd_async_notify; ++ u8 sq_draining; ++ u8 max_rd_atomic; ++ u8 max_dest_rd_atomic; ++ u8 min_rnr_timer; ++ u8 port_num; ++ u8 timeout; ++ u8 retry_cnt; ++ u8 rnr_retry; ++ u8 alt_port_num; ++ u8 alt_timeout; ++ u8 reserved[1]; ++ u64 data[0]; ++}; ++ ++struct ibp_modify_qp_resp { ++ struct ibp_qp_cap cap; ++ u64 data[0]; ++}; ++ ++struct ibp_destroy_qp_cmd { ++ struct ibp_msg_header header; ++ u64 qp; ++}; ++ ++struct ibp_create_cq_cmd { ++ struct ibp_msg_header header; ++ u64 ucontext; ++ u64 cq_context; ++ u32 cqe; ++ u32 vector; ++ u64 data[0]; ++}; ++ ++struct ibp_create_cq_resp { ++ u64 cq; ++ u32 cqe; ++ u8 reserved[4]; ++ u64 data[0]; ++}; ++ ++struct ibp_resize_cq_cmd { ++ struct ibp_msg_header header; ++ u64 cq; ++ u32 cqe; ++ u8 reserved[4]; ++ u64 data[0]; ++}; ++ ++struct ibp_resize_cq_resp { ++ u32 cqe; ++ u8 reserved[4]; ++ u64 data[0]; ++}; ++ ++struct ibp_destroy_cq_cmd { ++ struct ibp_msg_header header; ++ u64 cq; ++}; ++ ++struct ibp_reg_user_mr_cmd { ++ struct ibp_msg_header header; ++ u64 pd; ++ u64 hca_va; ++ u64 scif_addr; ++ u64 length; ++ u32 offset; ++ u32 access; ++ u64 data[0]; ++}; ++ ++struct ibp_reg_user_mr_resp { ++ u64 mr; ++ u32 lkey; ++ u32 rkey; ++ u64 data[0]; ++}; ++ ++struct ibp_dereg_mr_cmd { ++ struct ibp_msg_header header; ++ u64 mr; ++}; ++ ++struct ibp_attach_mcast_cmd { ++ struct ibp_msg_header header; ++ u64 qp; ++ __be64 subnet_prefix; ++ __be64 interface_id; ++ u16 lid; ++ u8 data[6]; ++}; ++ ++struct ibp_detach_mcast_cmd { ++ struct ibp_msg_header header; ++ u64 qp; ++ __be64 subnet_prefix; ++ __be64 interface_id; ++ u16 lid; ++ u8 data[6]; ++}; ++ ++#endif /* IBP_ABI_H */ +diff -urN a6/drivers/infiniband/ibp/drv/ibp.h a7/drivers/infiniband/ibp/drv/ibp.h +--- a6/drivers/infiniband/ibp/drv/ibp.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/ibp.h 2015-02-23 10:01:30.291769309 -0800 +@@ -0,0 +1,257 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#ifndef IBP_H
++#define IBP_H
++
++#include
++#include "ibp-abi.h"
++
++struct ibp_device {
++	char name[IBP_DEVICE_NAME_MAX];
++	u32 vendor_id;
++	u32 device_id;
++	u64 ib_device;
++	u64 device;
++	__be64 node_guid;
++	u64 uverbs_cmd_mask;
++	u32 uverbs_abi_ver;
++	u32 ibp_abi_ver;
++	struct device *linux_dev;
++	struct list_head list;
++	u64 driver_data;
++	int abi_version;
++	int num_comp_vectors;
++	u8 phys_port_cnt;
++};
++
++struct ibp_id_table {
++	u32 vendor_id;
++	u32 device_id;
++};
++
++struct ibp_driver {
++	const char *name;
++	const struct ibp_id_table *id_table;
++	int (*add)(struct ibp_device *device);
++	void (*remove)(struct ibp_device *device);
++	u64 (*resolve)(struct ib_device *ibdev);
++
++	struct list_head list;
++};
++
++struct ibp_rb {
++	u64 handle;
++};
++
++struct ibp_iomem {
++	void *cookie;
++	void __iomem *addr;
++};
++
++/**
++ * ibp_resolve_ib_device - Return the host ib_device handle
++ * @ibdev:Card IB device
++ *
++ * Upper level drivers may require the host ib_device handle associated
++ * with the card ib_device. This routine resolves the card ib_device to
++ * the corresponding host ib_device handle. A value of 0 is returned if
++ * no match was found.
++ */
++u64 ibp_resolve_ib_device(struct ib_device *ibdev);
++
++/**
++ * ibp_register_driver - Register this driver
++ * @driver:Driver to register
++ *
++ * Lower level drivers use ibp_register_driver to register for callbacks
++ * on IB device addition and removal. Only one low level driver registration
++ * is allowed for each vendor/device id pair. When an IB device is added,
++ * it is compared with each registered driver vendor and device id. The add
++ * callback routine for the matching driver will be called.
++ */
++int ibp_register_driver(struct ibp_driver *driver);
++
++/**
++ * ibp_unregister_driver - Unregister this driver
++ * @driver:Driver to unregister
++ *
++ * Lower level drivers use ibp_unregister_driver() to remove their
++ * registration. When ibp_unregister_driver() is called, the driver
++ * will receive a remove callback for each IB device with matching vendor
++ * and device ids.
++ */
++void ibp_unregister_driver(struct ibp_driver *driver);
++
++static inline void ibp_set_driver_data(struct ibp_device *device, u64 data)
++{
++	device->driver_data = data;
++}
++
++static inline u64 ibp_get_driver_data(struct ibp_device *device)
++{
++	return device->driver_data;
++}
++
++int ibp_cmd_alloc_ucontext(struct ibp_device *device, struct ib_device *ibdev,
++	u64 *ucontext, struct ibp_alloc_ucontext_cmd *cmd,
++	size_t cmd_size,
++	struct ibp_alloc_ucontext_resp *resp,
++	size_t resp_size);
++
++int ibp_cmd_dealloc_ucontext(struct ibp_device *device, u64 ucontext);
++
++/**
++ * ibp_reg_buf - Register a private buffer with this driver
++ * @device: the device on which to register
++ * @ucontext: peer driver ucontext handle
++ * @vaddr: starting virtual address of the buffer
++ * @length: length of the buffer
++ * @access: IB_ACCESS_xxx flags for buffer
++ *
++ * Lower level drivers use ibp_reg_buf() to register private buffers.
++ * Upon success, a pointer to a registered buffer structure is returned
++ * which contains an addr handle. 
The addr handle can be shared with ++ * a peer driver on the host server for its use with ib_umem_get(). ++ * This routine should not be used to register IB memory regions. ++ */ ++struct ibp_rb *ibp_reg_buf(struct ibp_device *device, u64 ucontext, ++ unsigned long vaddr, size_t length, int access); ++ ++/** ++ * ibp_dereg_buf - Deregister a private buffer through this driver ++ * @device: the device on which to deregister ++ * @rb: pointer to the registered buffer structure; may be ERR or NULL ++ * ++ * Lower level drivers use ibp_dereg_buf() to deregister a private buffer. ++ */ ++int ibp_dereg_buf(struct ibp_device *device, struct ibp_rb *rb); ++ ++int ibp_cmd_mmap(struct ibp_device *device, u64 ucontext, ++ struct vm_area_struct *vma); ++ ++struct ibp_iomem *ibp_cmd_ioremap(struct ibp_device *device, u64 ucontext, ++ phys_addr_t offset, unsigned long size); ++ ++int ibp_cmd_iounmap(struct ibp_iomem *iomem); ++ ++int ibp_cmd_query_device(struct ibp_device *device, ++ struct ib_device_attr *device_attr); ++ ++int ibp_cmd_query_port(struct ibp_device *device, u8 port_num, ++ struct ib_port_attr *port_attr); ++ ++int ibp_cmd_query_gid(struct ibp_device *device, u8 port_num, int index, ++ union ib_gid *gid); ++ ++int ibp_cmd_query_pkey(struct ibp_device *device, u8 port_num, int index, ++ u16 *pkey); ++ ++int ibp_cmd_alloc_pd(struct ibp_device *device, u64 ucontext, u64 *pd, ++ struct ibp_alloc_pd_cmd *cmd, size_t cmd_size, ++ struct ibp_alloc_pd_resp *resp, size_t resp_size); ++ ++int ibp_cmd_dealloc_pd(struct ibp_device *device, u64 pd); ++ ++int ibp_cmd_create_ah(struct ibp_device *device, u64 pd, ++ struct ib_ah_attr *ah_attr, ++ u64 *ah); ++ ++int ibp_cmd_query_ah(struct ibp_device *device, u64 ah, ++ struct ib_ah_attr *ah_attr); ++ ++int ibp_cmd_destroy_ah(struct ibp_device *device, u64 ah); ++ ++int ibp_cmd_create_srq(struct ibp_device *device, u64 pd, ++ struct ib_srq_init_attr *init_attr, ++ u64 *srq, struct ib_srq *ibsrq, ++ struct ibp_create_srq_cmd *cmd, size_t cmd_size, ++ struct ibp_create_srq_resp *resp, size_t resp_size); ++ ++int ibp_cmd_query_srq(struct ibp_device *device, u64 srq, ++ struct ib_srq_attr *attr); ++ ++int ibp_cmd_modify_srq(struct ibp_device *device, u64 srq, ++ struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, ++ struct ibp_modify_srq_cmd *cmd, size_t cmd_size, ++ struct ibp_modify_srq_resp *resp, size_t resp_size); ++ ++int ibp_cmd_destroy_srq(struct ibp_device *device, u64 srq); ++ ++int ibp_cmd_create_qp(struct ibp_device *device, u64 pd, ++ u64 send_cq, u64 recv_cq, u64 srq, ++ struct ib_qp_init_attr *init_attr, ++ u64 *qp, struct ib_qp *ibqp, ++ struct ibp_create_qp_cmd *cmd, size_t cmd_size, ++ struct ibp_create_qp_resp *resp, size_t resp_size); ++ ++int ibp_cmd_query_qp(struct ibp_device *device, u64 qp, ++ struct ib_qp_attr *attr, int qp_attr_mask, ++ struct ib_qp_init_attr *init_attr); ++ ++int ibp_cmd_modify_qp(struct ibp_device *device, u64 qp, ++ struct ib_qp_attr *attr, int qp_attr_mask, ++ struct ibp_modify_qp_cmd *cmd, size_t cmd_size, ++ struct ibp_modify_qp_resp *resp, size_t resp_size); ++ ++int ibp_cmd_destroy_qp(struct ibp_device *device, u64 qp); ++ ++int ibp_cmd_create_cq(struct ibp_device *device, u64 ucontext, ++ int entries, int vector, u64 *cq, struct ib_cq *ibcq, ++ struct ibp_create_cq_cmd *cmd, size_t cmd_size, ++ struct ibp_create_cq_resp *resp, size_t resp_size); ++ ++int ibp_cmd_resize_cq(struct ibp_device *device, u64 cq, ++ int entries, struct ib_cq *ibcq, ++ struct ibp_resize_cq_cmd *cmd, size_t cmd_size, ++ struct 
ibp_resize_cq_resp *resp, size_t resp_size); ++ ++int ibp_cmd_destroy_cq(struct ibp_device *device, u64 cq); ++ ++int ibp_cmd_reg_user_mr(struct ibp_device *device, u64 pd, u64 start, ++ u64 length, u64 virt_addr, int access, u64 *mr, ++ u32 *lkey, u32 *rkey, ++ struct ibp_reg_user_mr_cmd *cmd, size_t cmd_size, ++ struct ibp_reg_user_mr_resp *resp, size_t resp_size); ++ ++int ibp_cmd_dereg_mr(struct ibp_device *device, u64 mr); ++ ++int ibp_cmd_get_dma_mr(struct ibp_device *device, u64 pd, int access, ++ u64 *mr, u32 *lkey, u32 *rkey); ++ ++int ibp_cmd_attach_mcast(struct ibp_device *device, u64 qp, ++ union ib_gid *gid, u16 lid); ++ ++int ibp_cmd_detach_mcast(struct ibp_device *device, u64 qp, ++ union ib_gid *gid, u16 lid); ++ ++#endif /* IBP_H */ +diff -urN a6/drivers/infiniband/ibp/drv/Makefile a7/drivers/infiniband/ibp/drv/Makefile +--- a6/drivers/infiniband/ibp/drv/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/Makefile 2015-02-23 10:01:30.291769309 -0800 +@@ -0,0 +1,21 @@ ++KDIR ?= /lib/modules/`uname -r`/build ++ ++obj-$(CONFIG_IBP_SERVER) += ibp_server.o ++ ++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG ++ ++ibp_server-y := server.o \ ++ stack.o \ ++ server_msg.o ++ ++default: ++ $(MAKE) -C $(KDIR) M=`pwd` ++ ++modules_install: ++ $(MAKE) -C $(KDIR) M=`pwd` modules_install ++ ++clean: ++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions ++ ++unix: ++ dos2unix *.[ch] Kconfig Makefile +diff -urN a6/drivers/infiniband/ibp/drv/server.c a7/drivers/infiniband/ibp/drv/server.c +--- a6/drivers/infiniband/ibp/drv/server.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/server.c 2015-02-23 10:01:30.291769309 -0800 +@@ -0,0 +1,548 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#include "server.h" ++ ++MODULE_AUTHOR("Jerrie Coffman"); ++MODULE_AUTHOR("Phil Cayton"); ++MODULE_AUTHOR("Jay Sternberg"); ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++ ++MODULE_PARAM(port, port, int, IBP_PORT, "Connection port"); ++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog"); ++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds"); ++ ++#ifdef IBP_DEBUG ++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all"); ++#endif ++ ++#ifdef MOFED ++void *ibp_peer_mem_handle; ++invalidate_peer_memory ib_invalidate; ++#endif ++ ++struct rw_semaphore list_rwsem; ++ ++static struct class *ibp_class; ++static struct task_struct *listen_thread; ++ ++static LIST_HEAD(device_list); ++static LIST_HEAD(client_list); ++static LIST_HEAD(cdev_list); ++ ++static void ibp_add_one(struct ib_device *ib_dev); ++static void ibp_remove_one(struct ib_device *ib_dev); ++ ++static struct ib_client ib_client = { ++ .name = DRV_NAME, ++ .add = ibp_add_one, ++ .remove = ibp_remove_one ++}; ++ ++static int ibp_open(struct inode *inode, struct file *filp); ++static ssize_t ibp_write(struct file *filp, const char __user *buf, ++ size_t count, loff_t *pos); ++static int ibp_close(struct inode *inode, struct file *filp); ++ ++static const struct file_operations ibp_fops = { ++ .owner = THIS_MODULE, ++ .open = ibp_open, ++ .write = ibp_write, ++ .release = ibp_close, ++}; ++ ++static int ibp_create_cdev(struct ibp_client *client, uint16_t node) ++{ ++ struct device *device; ++ dev_t devt; ++ int ret; ++ ++ ret = alloc_chrdev_region(&devt, 0, 1, DRV_BASE); ++ if (ret) { ++ print_err("alloc_chrdev_region returned %d\n", ret); ++ return ret; ++ } ++ ++ cdev_init(&client->cdev, &ibp_fops); ++ client->cdev.owner = THIS_MODULE; ++ ++ ret = cdev_add(&client->cdev, devt, 1); ++ if (ret) { ++ print_err("cdev_add returned %d\n", ret); ++ goto err0; ++ } ++ ++ device = device_create(ibp_class, NULL, devt, ++ NULL, DRV_BASE "%u", node); ++ if (IS_ERR(device)) { ++ ret = PTR_ERR(device); ++ goto err1; ++ } ++ ++ /* Start on the cdev_list (until ibp_register_client). */ ++ down_write(&list_rwsem); ++ list_add_tail(&client->list, &cdev_list); ++ up_write(&list_rwsem); ++ ++ return 0; ++err1: ++ cdev_del(&client->cdev); ++err0: ++ unregister_chrdev_region(devt, 1); ++ return ret; ++} ++ ++static void ibp_destroy_cdev(struct ibp_client *client) ++{ ++ device_destroy(ibp_class, client->cdev.dev); ++ cdev_del(&client->cdev); ++ unregister_chrdev_region(client->cdev.dev, 1); ++} ++ ++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node) ++{ ++ struct ibp_client *client; ++ int ret; ++ ++ /* If a reconnect occurs while on the cdev_list just update the ep. 
*/ ++ down_read(&list_rwsem); ++ list_for_each_entry(client, &cdev_list, list) { ++ if (client->node == node) { ++ up_read(&list_rwsem); ++ scif_close(client->ep); ++ client->ep = ep; ++ return client; ++ } ++ } ++ up_read(&list_rwsem); ++ ++ client = kzalloc(sizeof(*client), GFP_KERNEL); ++ if (!client) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ client->ep = ep; ++ client->node = node; ++ atomic_set(&client->busy, 0); ++ atomic_set(&client->rx_in_process, 0); ++ init_waitqueue_head(&client->rx_wait_queue); ++ mutex_init(&client->ucontext_mutex); ++ INIT_LIST_HEAD(&client->ucontext_list); ++ ++ client->workqueue = create_singlethread_workqueue(DRV_NAME); ++ if (!client->workqueue) { ++ print_err("create_singlethread_workqueue failed\n"); ++ goto err0; ++ } ++ ++ ret = ibp_create_cdev(client, node); ++ if (ret) ++ goto err1; ++ ++ return client; ++err1: ++ destroy_workqueue(client->workqueue); ++err0: ++ kfree(client); ++ return ERR_PTR(ret); ++} ++ ++static void ibp_destroy_client(struct ibp_client *client) ++{ ++ ibp_cleanup_ucontext(&client->ucontext_list); ++ scif_close(client->ep); ++ flush_workqueue(client->workqueue); ++ destroy_workqueue(client->workqueue); ++ ibp_destroy_cdev(client); ++ kfree(client); ++} ++ ++static void ibp_register_client(struct ibp_client *client) ++{ ++ struct ibp_device *device; ++ ++ down_write(&list_rwsem); ++ ++ list_move(&client->list, &client_list); ++ ++ list_for_each_entry(device, &device_list, list) ++ ibp_send_add(client, device); ++ ++ up_write(&list_rwsem); ++} ++ ++static void ibp_unregister_client(struct ibp_client *client) ++{ ++ struct ibp_device *device; ++ ++ flush_workqueue(client->workqueue); ++ ++ down_write(&list_rwsem); ++ ++ list_del(&client->list); ++ ++ list_for_each_entry(device, &device_list, list) ++ ibp_send_remove(client, device); ++ ++ up_write(&list_rwsem); ++} ++ ++static int ibp_open(struct inode *inode, struct file *filp) ++{ ++ struct ibp_client *client; ++ ++ client = container_of(inode->i_cdev, struct ibp_client, cdev); ++ ++ filp->private_data = client; ++ ++ if (atomic_add_return(1, &client->busy) == 1) ++ ibp_register_client(client); ++ ++ return 0; ++} ++ ++static ssize_t ibp_write(struct file *filp, const char __user *buf, ++ size_t count, loff_t *pos) ++{ ++ struct ibp_client *client; ++ void *rx_buf; ++ void *tx_buf; ++ int ret = -ENOMEM; ++ ++ client = filp->private_data; ++ ++ rx_buf = (void *) __get_free_page(GFP_KERNEL); ++ if (!rx_buf) { ++ print_err("__get_free_page rx_buf failed\n"); ++ goto err0; ++ } ++ ++ tx_buf = (void *) __get_free_page(GFP_KERNEL); ++ if (!tx_buf) { ++ print_err("__get_free_page tx_buf failed\n"); ++ goto err1; ++ } ++ ++ ret = ibp_process_recvs(client, rx_buf, tx_buf); ++ ++ free_page((uintptr_t) tx_buf); ++err1: ++ free_page((uintptr_t) rx_buf); ++err0: ++ return ret; ++} ++ ++static int ibp_close(struct inode *inode, struct file *filp) ++{ ++ struct ibp_client *client; ++ ++ client = filp->private_data; ++ ++ if (atomic_sub_and_test(1, &client->busy)) { ++ ibp_unregister_client(client); ++ device_destroy(ibp_class, client->cdev.dev); ++ ibp_destroy_client(client); ++ } ++ ++ return 0; ++} ++ ++int ibp_get_device(struct ibp_device *device) ++{ ++ struct ibp_device *entry; ++ ++ down_read(&list_rwsem); ++ ++ list_for_each_entry(entry, &device_list, list) { ++ if (entry == device) { ++ kref_get(&device->ref); ++ break; ++ } ++ } ++ ++ up_read(&list_rwsem); ++ ++ return (entry == device) ? 
0 : -ENODEV; ++} ++ ++static void ibp_complete_device(struct kref *ref) ++{ ++ struct ibp_device *device; ++ ++ device = container_of(ref, struct ibp_device, ref); ++ complete(&device->done); ++} ++ ++void ibp_put_device(struct ibp_device *device) ++{ ++ kref_put(&device->ref, ibp_complete_device); ++} ++ ++static struct ibp_device *ibp_create_device(struct ib_device *ib_dev) ++{ ++ struct ibp_device *device; ++ ++ device = kzalloc(sizeof(*device), GFP_KERNEL); ++ if (!device) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ device->ib_dev = ib_dev; ++ kref_init(&device->ref); ++ init_completion(&device->done); ++ ++ ib_set_client_data(ib_dev, &ib_client, device); ++ ++ return device; ++} ++ ++static void ibp_destroy_device(struct ibp_device *device) ++{ ++ ibp_put_device(device); ++ wait_for_completion(&device->done); ++ ++ ib_set_client_data(device->ib_dev, &ib_client, NULL); ++ kfree(device); ++} ++ ++static void ibp_register_device(struct ibp_device *device) ++{ ++ struct ibp_client *client; ++ ++ down_write(&list_rwsem); ++ ++ list_add_tail(&device->list, &device_list); ++ list_for_each_entry(client, &client_list, list) ++ ibp_send_add(client, device); ++ ++ up_write(&list_rwsem); ++} ++ ++static void ibp_unregister_device(struct ibp_device *device) ++{ ++ struct ibp_client *client; ++ ++ down_write(&list_rwsem); ++ ++ list_for_each_entry(client, &client_list, list) ++ ibp_send_remove(client, device); ++ ++ list_del(&device->list); ++ ++ up_write(&list_rwsem); ++} ++ ++static int ibp_ignore_ib_dev(struct ib_device *ib_dev) ++{ ++ /* ++ * Only allow PCI-based channel adapters and RNICs. ++ * PCI is required in order to read the vendor id. ++ */ ++ return (!ib_dev->dma_device->bus || ++ !ib_dev->dma_device->bus->name || ++ strnicmp(ib_dev->dma_device->bus->name, "pci", 3) || ++ ((ib_dev->node_type != RDMA_NODE_IB_CA) && ++ (ib_dev->node_type != RDMA_NODE_RNIC))) ? 
1 : 0; ++} ++ ++static void ibp_add_one(struct ib_device *ib_dev) ++{ ++ struct ibp_device *device; ++ ++ if (ibp_ignore_ib_dev(ib_dev)) ++ return; ++ ++ device = ibp_create_device(ib_dev); ++ if (IS_ERR(device)) ++ return; ++ ++ ibp_register_device(device); ++} ++ ++static void ibp_remove_one(struct ib_device *ib_dev) ++{ ++ struct ibp_device *device; ++ ++ device = ib_get_client_data(ib_dev, &ib_client); ++ if (!device) ++ return; ++ ++ ibp_unregister_device(device); ++ ibp_destroy_device(device); ++} ++ ++static int ibp_listen(void *data) ++{ ++ struct ibp_client *client; ++ struct scif_pollepd listen; ++ struct scif_portID peer; ++ scif_epd_t ep; ++ int ret; ++ ++ listen.epd = scif_open(); ++ if (!listen.epd) { ++ print_err("scif_open failed\n"); ++ ret = -EIO; ++ goto err0; ++ } ++ listen.events = POLLIN; ++ ++ ret = scif_bind(listen.epd, port); ++ if (ret < 0) { ++ print_err("scif_bind returned %d\n", ret); ++ goto err1; ++ } ++ ++ ret = scif_listen(listen.epd, backlog); ++ if (ret) { ++ print_err("scif_listen returned %d\n", ret); ++ goto err1; ++ } ++ ++ while (!kthread_should_stop()) { ++ ++ schedule(); ++ ++ ret = scif_poll(&listen, 1, timeout); ++ if (ret == 0) /* timeout */ ++ continue; ++ if (ret < 0) { ++ print_err("scif_poll revents 0x%x\n", listen.revents); ++ continue; ++ } ++ ++ ret = scif_accept(listen.epd, &peer, &ep, 0); ++ if (ret) { ++ print_err("scif_accept returned %d\n", ret); ++ continue; ++ } ++ ++ print_dbg("accepted node %d port %d\n", peer.node, peer.port); ++ ++ client = ibp_create_client(ep, peer.node); ++ if (IS_ERR(client)) { ++ ret = PTR_ERR(client); ++ print_err("ibp_create_client returned %d\n", ret); ++ scif_close(ep); ++ } ++ } ++err1: ++ scif_close(listen.epd); ++err0: ++ return ret; ++} ++ ++static int __init ibp_server_init(void) ++{ ++ int ret; ++ ++ print_info(DRV_SIGNON); ++ ++ init_rwsem(&list_rwsem); ++ ++ ret = ibp_init(); ++ if (ret) { ++ print_err("ibp_init_server returned %d\n", ret); ++ return ret; ++ } ++ ++ ibp_class = class_create(THIS_MODULE, "infiniband_proxy"); ++ if (IS_ERR(ibp_class)) { ++ ret = PTR_ERR(ibp_class); ++ print_err("class_create returned %d\n", ret); ++ goto err0; ++ } ++ ++ ret = ib_register_client(&ib_client); ++ if (ret) { ++ print_err("ib_register_client returned %d\n", ret); ++ goto err1; ++ } ++ ++#ifdef MOFED ++ ibp_peer_mem_handle = ib_register_peer_memory_client(&ibp_peer_mem, ++ &ib_invalidate); ++ if (IS_ERR(ibp_peer_mem_handle)) { ++ ret = PTR_ERR(ibp_peer_mem_handle); ++ print_err("ib_register_peer_memory_client returned %d\n", ret); ++ goto err2; ++ } ++#endif ++ ++ /* Start a thread for inbound connections. 
*/ ++ listen_thread = kthread_run(ibp_listen, NULL, DRV_NAME); ++ if (IS_ERR(listen_thread)) { ++ ret = PTR_ERR(listen_thread); ++ print_err("kthread_run returned %d\n", ret); ++ goto err3; ++ } ++ ++ return 0; ++err3: ++#ifdef MOFED ++ ib_unregister_peer_memory_client(ibp_peer_mem_handle); ++err2: ++#endif ++ ib_unregister_client(&ib_client); ++err1: ++ class_destroy(ibp_class); ++err0: ++ ibp_cleanup(); ++ return ret; ++} ++ ++static void __exit ibp_server_exit(void) ++{ ++ struct ibp_client *client; ++ struct ibp_client *next; ++ ++ kthread_stop(listen_thread); ++ ++ list_for_each_entry_safe(client, next, &cdev_list, list) ++ ibp_destroy_client(client); ++ ++#ifdef MOFED ++ ib_unregister_peer_memory_client(ibp_peer_mem_handle); ++#endif ++ ib_unregister_client(&ib_client); ++ class_destroy(ibp_class); ++ ++ ibp_cleanup(); ++ ++ print_info(DRV_DESC " unloaded\n"); ++} ++ ++module_init(ibp_server_init); ++module_exit(ibp_server_exit); +diff -urN a6/drivers/infiniband/ibp/drv/server.h a7/drivers/infiniband/ibp/drv/server.h +--- a6/drivers/infiniband/ibp/drv/server.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/server.h 2015-02-23 10:01:30.291769309 -0800 +@@ -0,0 +1,191 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef SERVER_H ++#define SERVER_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "ibp-abi.h" ++#include "common.h" ++ ++#define DRV_ROLE "Server" ++#define DRV_NAME "ibp_server" ++ ++#define MAX_MSG_SIZE PAGE_SIZE ++ ++extern int timeout; ++extern struct rw_semaphore list_rwsem; ++ ++struct ibp_device { ++ struct list_head list; ++ struct ib_device *ib_dev; ++ struct kref ref; ++ struct completion done; ++}; ++ ++struct ibp_client { ++ struct list_head list; ++ scif_epd_t ep; ++ struct workqueue_struct *workqueue; ++ struct mutex ucontext_mutex; ++ struct list_head ucontext_list; ++ wait_queue_head_t rx_wait_queue; ++ atomic_t rx_in_process; ++ struct cdev cdev; ++ atomic_t busy; ++ uint16_t node; ++}; ++ ++struct ibp_queued_response { ++ struct ibp_client *client; ++ struct work_struct work; ++ u64 msg[0]; ++}; ++ ++struct ibp_event { ++ struct ibp_client *client; ++ struct work_struct work; ++ u64 context; ++ u64 ibdev; ++ enum ib_event_type type; ++}; ++ ++struct ibp_comp { ++ struct ibp_client *client; ++ struct work_struct work; ++ void *cq_context; ++}; ++ ++struct ibp_ucontext { ++ struct ib_ucontext *ibucontext; ++ struct ibp_client *client; ++ struct ibp_device *device; ++ struct file *filp; ++ struct ib_event_handler event_handler; ++ u64 ibdev; ++ struct mutex mutex; ++ struct list_head list; ++ struct list_head mmap_list; ++ struct rb_root reg_tree; ++}; ++ ++struct ibp_qp { ++ struct ib_qp *ibqp; ++ struct list_head mcast; ++}; ++ ++struct ibp_mcast_entry { ++ struct list_head list; ++ union ib_gid gid; ++ u16 lid; ++}; ++ ++struct ibp_mmap { ++ struct list_head list; ++ struct ibp_ucontext *ucontext; ++ u64 len; ++ u64 prot; ++ u64 vaddr; ++ void __iomem *io_addr; ++ off_t scif_addr; ++}; ++ ++struct ibp_reg { ++ struct rb_node node; ++ struct scif_range *range; ++ struct ibp_ucontext *ucontext; ++ struct kref ref; ++ u64 virt_addr; ++ u64 length; ++ off_t offset; ++ u32 access; ++}; ++ ++struct ibp_mr { ++ struct ib_mr *ibmr; ++ struct ibp_reg *reg; ++}; ++ ++#ifdef MOFED ++#include ++extern struct peer_memory_client ibp_peer_mem; ++extern void *ibp_peer_mem_handle; ++extern invalidate_peer_memory ib_invalidate; ++#else ++#define IBP_UMEM_MAX_PAGE_CHUNK \ ++ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ ++ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ ++ (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) ++#endif ++ ++#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ ++ do { \ ++ (udata)->ops = &ibp_copy; \ ++ (udata)->inbuf = (void *)(ibuf); \ ++ (udata)->outbuf = (void *)(obuf); \ ++ (udata)->inlen = (ilen); \ ++ (udata)->outlen = (olen); \ ++ } while (0) ++ ++#define IBP_INIT_MSG(handle, msg, size, op) \ ++ do { \ ++ (msg)->header.opcode = IBP_##op; \ ++ (msg)->header.length = (size); \ ++ (msg)->header.status = 0; \ ++ (msg)->header.reserved = 0; \ ++ (msg)->header.device = (uintptr_t)(handle); \ ++ (msg)->header.request = 0; \ ++ } while (0) ++ ++#define IBP_INIT_RESP(handle, resp, size, op, req, stat) \ ++ do { \ ++ (resp)->header.opcode = IBP_##op; \ ++ (resp)->header.length = (size); \ ++ (resp)->header.status = (stat); \ ++ (resp)->header.reserved = 0; \ ++ (resp)->header.device = (uintptr_t)(handle); \ ++ (resp)->header.request = (req); \ ++ } while (0) ++ ++int ibp_process_recvs(struct ibp_client *client, void *rx_buf, void *tx_buf); ++void ibp_cleanup_ucontext(struct list_head *ucontext_list); ++int ibp_send_add(struct ibp_client *client, struct ibp_device *device); ++int 
ibp_send_remove(struct ibp_client *client, struct ibp_device *device); ++int ibp_get_device(struct ibp_device *device); ++void ibp_put_device(struct ibp_device *device); ++ ++#endif /* SERVER_H */ +diff -urN a6/drivers/infiniband/ibp/drv/server_msg.c a7/drivers/infiniband/ibp/drv/server_msg.c +--- a6/drivers/infiniband/ibp/drv/server_msg.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/server_msg.c 2015-02-23 10:01:30.292769309 -0800 +@@ -0,0 +1,3098 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include ++ ++#include "server.h" ++#include "stack.h" ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ #define MUNMAP(x,y,z) \ ++ do { \ ++ down_write(¤t->mm->mmap_sem); \ ++ do_munmap(x,y,z); \ ++ up_write(¤t->mm->mmap_sem); \ ++ } while (0) ++#else ++ #define MUNMAP(x,y,z) \ ++ vm_munmap((unsigned long)y,z) ++#endif ++ ++static struct ibp_stack *o_stack; ++static struct ibp_stack *a_stack; ++static struct ibp_stack *c_stack; ++ ++/* ++ * umem functions ++ */ ++static int ibp_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) ++{ ++ size_t bytes; ++ ++ bytes = min(len, udata->inlen); ++ ++ memcpy(dest, udata->inbuf, bytes); ++ if (bytes < len) { ++ memset(dest + bytes, 0, len - bytes); ++ return -EFAULT; ++ } ++ return 0; ++} ++ ++static int ibp_copy_to_udata(struct ib_udata *udata, void *src, size_t len) ++{ ++ size_t bytes; ++ ++ bytes = min(len, udata->outlen); ++ ++ memcpy(udata->outbuf, src, bytes); ++ udata->outlen -= bytes; ++ ++ return (bytes < len) ? 
-EFAULT : 0; ++} ++ ++static struct ib_udata_ops ibp_copy = { ++ .copy_from = ibp_copy_from_udata, ++ .copy_to = ibp_copy_to_udata ++}; ++ ++#ifdef MOFED ++ ++static struct ibp_reg *__ibp_find_reg(struct ibp_ucontext *ucontext, ++ unsigned long virt, size_t size) ++{ ++ struct rb_node *node; ++ struct ibp_reg *reg; ++ ++ node = ucontext->reg_tree.rb_node; ++ ++ while (node) { ++ reg = rb_entry(node, struct ibp_reg, node); ++ ++ if ((virt == reg->virt_addr) && ++ (size == reg->length)) ++ return reg; ++ ++ if (virt < reg->virt_addr) ++ node = node->rb_left; ++ else if (virt > reg->virt_addr) ++ node = node->rb_right; ++ else if (size < reg->length) ++ node = node->rb_left; ++ else if (size > reg->length) ++ node = node->rb_right; ++ else ++ node = node->rb_right; ++ } ++ ++ return ERR_PTR(-EFAULT); ++} ++ ++static struct ibp_reg *ibp_find_reg(struct ibp_ucontext *ucontext, ++ unsigned long virt, size_t size) ++{ ++ struct ibp_reg *reg; ++ ++ mutex_lock(&ucontext->mutex); ++ reg = __ibp_find_reg(ucontext, virt, size); ++ mutex_unlock(&ucontext->mutex); ++ ++ return reg; ++} ++ ++/* ibp_peer_acquire return code: 1 mine, 0 not mine */ ++static int ibp_peer_acquire(unsigned long addr, ++ size_t size, void* peer_mem_private_data, ++ char* peer_mem_name, void** client_context) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_reg *reg; ++ ++ /* Verify private data is ours before ibp_ucontext cast. */ ++ if (!peer_mem_name || !peer_mem_private_data || ++ strncmp(peer_mem_name, ibp_peer_mem.name, ++ sizeof(ibp_peer_mem.name))) ++ return 0; ++ ++ ucontext = (struct ibp_ucontext *) peer_mem_private_data; ++ ++ reg = ibp_find_reg(ucontext, addr, size); ++ if (IS_ERR(reg)) { ++ print_err("ibp_find_reg returned %d\n", (int)PTR_ERR(reg)); ++ return 0; ++ } ++ ++ *client_context = (void *) reg; ++ ++ return 1; ++} ++ ++static int ibp_peer_get_pages(unsigned long addr, size_t size, int write, ++ int force, struct sg_table *sg_head, ++ void* client_context, void* core_context) ++{ ++ struct ibp_reg *reg; ++ struct page *page; ++ struct scatterlist *sg; ++ void **va; ++ int npages, off, i, ret; ++ ++ reg = (struct ibp_reg *) client_context; ++ ++ off = (addr - reg->virt_addr) + reg->offset; ++ npages = PAGE_ALIGN(size + (off & ~PAGE_MASK)) >> PAGE_SHIFT; ++ ++ ret = sg_alloc_table(sg_head, npages, GFP_KERNEL); ++ if (ret) ++ return ret; ++ ++ va = reg->range->va; ++ ++ for_each_sg(sg_head->sgl, sg, npages, i) { ++ page = vmalloc_to_page(va[i]); ++ if (!page) { ++ print_err("vmalloc_to_page failed\n"); ++ ret = -EINVAL; ++ goto err; ++ } ++ sg_set_page(sg, page, PAGE_SIZE, 0); ++ } ++ ++ return 0; ++err: ++ sg_free_table(sg_head); ++ return ret; ++} ++ ++static int ibp_peer_dma_map(struct sg_table *sg_head, void *client_context, ++ struct device *dma_device, int dmasync, int *nmap) ++{ ++ DEFINE_DMA_ATTRS(attrs); ++ int ret = 0; ++ ++ if (dmasync) ++ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); ++ ++ *nmap = dma_map_sg_attrs(dma_device, ++ sg_head->sgl, ++ sg_head->orig_nents, ++ DMA_BIDIRECTIONAL, ++ &attrs); ++ ++ if (*nmap > 0) ++ sg_head->nents = *nmap; ++ else ++ ret = -ENOMEM; ++ ++ return ret; ++} ++ ++static int ibp_peer_dma_umap(struct sg_table *sg_head, void *client_context, ++ struct device *dma_device) ++{ ++ dma_unmap_sg(dma_device, ++ sg_head->sgl, ++ sg_head->nents, ++ DMA_BIDIRECTIONAL); ++ return 0; ++} ++ ++static void ibp_peer_put_pages(struct sg_table *sg_head, void *client_context) ++{ ++ sg_free_table(sg_head); ++} ++ ++static unsigned long ibp_peer_get_page_size(void *client_context) 
++{ ++ return PAGE_SIZE; ++} ++ ++struct peer_memory_client ibp_peer_mem = { ++ .name = DRV_NAME, ++ .version = DRV_VERSION, ++ .acquire = &ibp_peer_acquire, ++ .get_pages = &ibp_peer_get_pages, ++ .dma_map = &ibp_peer_dma_map, ++ .dma_unmap = &ibp_peer_dma_umap, ++ .put_pages = &ibp_peer_put_pages, ++ .get_page_size = &ibp_peer_get_page_size, ++}; ++ ++#else /* MOFED */ ++ ++static struct ibp_reg *__ibp_find_reg(struct ibp_ucontext *ucontext, ++ unsigned long virt, size_t size, ++ int access) ++{ ++ struct rb_node *node; ++ struct ibp_reg *reg; ++ ++ node = ucontext->reg_tree.rb_node; ++ ++ while (node) { ++ reg = rb_entry(node, struct ibp_reg, node); ++ ++ if ((virt == reg->virt_addr) && ++ (size == reg->length) && ++ (access == reg->access)) ++ return reg; ++ ++ if (virt < reg->virt_addr) ++ node = node->rb_left; ++ else if (virt > reg->virt_addr) ++ node = node->rb_right; ++ else if (size < reg->length) ++ node = node->rb_left; ++ else if (size > reg->length) ++ node = node->rb_right; ++ else if (access < reg->access) ++ node = node->rb_left; ++ else ++ node = node->rb_right; ++ } ++ ++ return ERR_PTR(-EFAULT); ++} ++ ++static struct ibp_reg *ibp_find_reg(struct ibp_ucontext *ucontext, ++ unsigned long virt, size_t size, ++ int access) ++{ ++ struct ibp_reg *reg; ++ ++ mutex_lock(&ucontext->mutex); ++ reg = __ibp_find_reg(ucontext, virt, size, access); ++ mutex_unlock(&ucontext->mutex); ++ ++ return reg; ++} ++ ++static void __ibp_umem_release(struct ib_device *dev, struct ib_umem *umem, ++ int dirty) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ if (umem->nmap > 0) ++ ib_dma_unmap_sg(dev, umem->sg_head.sgl, ++ umem->nmap, DMA_BIDIRECTIONAL); ++ ++ if (umem->writable && dirty) ++ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) ++ set_page_dirty_lock(sg_page(sg)); ++ ++ sg_free_table(&umem->sg_head); ++} ++ ++static struct ib_umem *ibp_umem_get(struct ib_ucontext *ibucontext, ++ unsigned long addr, size_t size, ++ int access, int dmasync) ++{ ++ struct ibp_reg *reg; ++ struct ib_umem *umem; ++ struct device *dma_device; ++ struct page *page; ++ struct scatterlist *sg; ++ void **va; ++ dma_addr_t *pa; ++ dma_addr_t daddr; ++ unsigned int dsize; ++ int npages; ++ int off; ++ int i; ++ int ret = 0; ++ ++ DEFINE_DMA_ATTRS(attrs); ++ ++ reg = ibp_find_reg(ibucontext->umem_private_data, addr, size, access); ++ if (IS_ERR(reg)) ++ return ERR_CAST(reg); ++ ++ if (dmasync) ++ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); ++ ++ umem = kzalloc(sizeof(*umem), GFP_KERNEL); ++ if (!umem) { ++ print_err("kalloc failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ umem->length = size; ++ umem->offset = addr & ~PAGE_MASK; ++ umem->page_size = PAGE_SIZE; ++ umem->pid = get_task_pid(current, PIDTYPE_PID); ++ umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); ++ ++ dsize = 0; ++ daddr = 0; ++ va = reg->range->va; ++ pa = reg->range->phys_addr; ++ dma_device = ibucontext->device->dma_device; ++ off = (addr - reg->virt_addr) + reg->offset; ++ npages = PAGE_ALIGN(size + (off & ~PAGE_MASK)) >> PAGE_SHIFT; ++ off >>= PAGE_SHIFT; ++ ++ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); ++ if (ret) { ++ print_err("sg_alloc_table failed\n"); ++ goto err1; ++ } ++ ++ /* Assume hugetlb unless proven otherwise. */ ++ umem->hugetlb = 1; ++ for (i = 0; i < npages && umem->hugetlb; i++) { ++ if (!dsize) { ++ dsize = PAGE_SIZE; ++ daddr = pa[i + off]; ++ /* Page must start on a huge page boundary. 
*/ ++ if ((daddr & ~HPAGE_MASK) >= PAGE_SIZE) ++ umem->hugetlb = 0; ++ } else if (daddr + dsize != pa[i + off]) ++ /* Pages must be contiguous. */ ++ umem->hugetlb = 0; ++ else { ++ dsize += PAGE_SIZE; ++ if (dsize == HPAGE_SIZE) ++ dsize = 0; ++ } ++ } ++ /* Page must end on a huge page boundary.*/ ++ if (umem->hugetlb && ((daddr + dsize) & ~HPAGE_MASK)) ++ umem->hugetlb = 0; ++ ++ for_each_sg(umem->sg_head.sgl, sg, npages, i) { ++ page = vmalloc_to_page(va[i]); ++ if (!page) { ++ print_err("vmalloc_to_page failed\n"); ++ ret = -EINVAL; ++ goto err2; ++ } ++ sg_set_page(sg, page, PAGE_SIZE, 0); ++ } ++ ++ umem->npages = npages; ++ ++ umem->nmap = ib_dma_map_sg_attrs(ibucontext->device, ++ umem->sg_head.sgl, ++ umem->npages, ++ DMA_BIDIRECTIONAL, ++ &attrs); ++ if (umem->nmap <= 0) { ++ print_err("map_sg_attrs failed\n"); ++ ret = -ENOMEM; ++ goto err2; ++ } ++ ++ return umem; ++err2: ++ __ibp_umem_release(ibucontext->device, umem, 0); ++err1: ++ put_pid(umem->pid); ++ kfree(umem); ++ return ERR_PTR(ret); ++} ++ ++static void ibp_umem_release(struct ib_umem *umem) ++{ ++ struct ib_ucontext *ibucontext; ++ ++ ibucontext = umem->context; ++ ++ __ibp_umem_release(ibucontext->device, umem, 0); ++ ++ put_pid(umem->pid); ++ kfree(umem); ++} ++ ++static struct ib_umem_ops ibp_umem = { ++ .get = &ibp_umem_get, ++ .release = &ibp_umem_release, ++}; ++ ++#endif /* MOFED */ ++ ++static int ibp_send(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_send(ep, buf, (uint32_t) len, SCIF_SEND_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_send returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++static int ibp_recv(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_recv(ep, buf, (uint32_t) len, SCIF_RECV_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_recv returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++int ibp_send_add(struct ibp_client *client, struct ibp_device *device) ++{ ++ struct pci_dev *pdev; ++ struct ibp_add_device_msg msg; ++ ++ print_trace("in\n"); ++ ++ pdev = to_pci_dev(device->ib_dev->dma_device); ++ ++ IBP_INIT_MSG(device, &msg, sizeof(msg), ADD_DEVICE); ++ ++ strncpy(msg.data.name, device->ib_dev->name, sizeof(msg.data.name)); ++ msg.data.vendor_id = pdev->vendor; ++ msg.data.device_id = pdev->device; ++ ++ msg.data.ib_device = (uintptr_t) device->ib_dev; ++ msg.data.device = (uintptr_t) device; ++ msg.data.node_guid = device->ib_dev->node_guid; ++ msg.data.uverbs_cmd_mask = device->ib_dev->uverbs_cmd_mask; ++ msg.data.uverbs_abi_ver = device->ib_dev->uverbs_abi_ver; ++ msg.data.ibp_abi_ver = IBP_ABI_VERSION; ++ msg.data.num_comp_vectors = device->ib_dev->num_comp_vectors; ++ msg.data.phys_port_cnt = device->ib_dev->phys_port_cnt; ++ ++ return ibp_send(client->ep, &msg, sizeof(msg)); ++} ++ ++int ibp_send_remove(struct ibp_client *client, struct ibp_device *device) ++{ ++ struct ibp_remove_device_msg msg; ++ ++ print_trace("in\n"); ++ ++ IBP_INIT_MSG(device, &msg, sizeof(msg), REMOVE_DEVICE); ++ return ibp_send(client->ep, &msg, sizeof(msg)); ++} ++ ++static void ibp_send_queued_response(struct work_struct *work) ++{ ++ struct ibp_queued_response_msg *msg; ++ struct ibp_queued_response *resp; ++ ++ resp = container_of(work, struct ibp_queued_response, work); ++ msg = (struct ibp_queued_response_msg *) resp->msg; ++ ++ ibp_send(resp->client->ep, msg, msg->header.length); ++ kfree(resp); ++} ++ ++static int 
ibp_queue_response(struct ibp_client *client,
++			struct ibp_queued_response_msg *msg)
++{
++	struct ibp_queued_response *resp;
++	size_t len;
++
++	len = sizeof(*resp) + msg->header.length;
++
++	resp = kmalloc(len, GFP_ATOMIC);
++	if (!resp) {
++		print_err("kmalloc failed\n");
++		return -ENOMEM;
++	}
++
++	resp->client = client;
++	memcpy(&resp->msg, msg, msg->header.length);
++
++	/* Queue to serialize behind any associated events. */
++	INIT_WORK(&resp->work, ibp_send_queued_response);
++	queue_work(client->workqueue, &resp->work);
++
++	return 0;
++}
++
++static int ibp_cmd_error(struct ibp_client *client,
++			 struct ibp_msg_header *hdr, void *tx_buf, int ret)
++{
++	struct ibp_verb_response_msg *msg;
++	size_t len;
++
++	msg = (struct ibp_verb_response_msg *) tx_buf;
++	len = sizeof(*msg);
++
++	IBP_INIT_RESP(hdr->device, msg, len, VERB_RESPONSE, hdr->request, ret);
++	return ibp_send(client->ep, msg, len);
++}
++
++static int ibp_cmd_bad_request(struct ibp_client *client,
++			       struct ibp_msg_header *hdr, void *tx_buf)
++{
++	print_dbg("opcode 0x%x\n", hdr->opcode);
++	return ibp_cmd_error(client, hdr, tx_buf, -EBADRQC);
++}
++
++static int ibp_cmd_not_supported(struct ibp_client *client,
++				 struct ibp_msg_header *hdr, void *tx_buf)
++{
++	print_dbg("opcode 0x%x\n", hdr->opcode);
++	return ibp_cmd_error(client, hdr, tx_buf, -ENOSYS);
++}
++
++static int ibp_cmd_query_device(struct ibp_client *client,
++				struct ibp_msg_header *hdr, void *tx_buf)
++{
++	struct ibp_device *device;
++	struct ibp_verb_response_msg *msg;
++	struct ibp_query_device_resp *resp;
++	struct ib_device_attr attr;
++	size_t len;
++	int ret;
++
++	print_trace("in\n");
++
++	device = (struct ibp_device *) hdr->device;
++	msg = (struct ibp_verb_response_msg *) tx_buf;
++	len = sizeof(*msg);
++
++	ret = ib_query_device(device->ib_dev, &attr);
++	if (ret) {
++		print_err("ib_query_device returned %d\n", ret);
++		goto send_resp;
++	}
++
++	resp = (struct ibp_query_device_resp *) msg->data;
++	len += sizeof(*resp);
++
++	resp->fw_ver = attr.fw_ver;
++	resp->sys_image_guid = attr.sys_image_guid;
++	resp->max_mr_size = attr.max_mr_size;
++	resp->page_size_cap = attr.page_size_cap;
++	resp->vendor_id = attr.vendor_id;
++	resp->vendor_part_id = attr.vendor_part_id;
++	resp->hw_ver = attr.hw_ver;
++	resp->max_qp = attr.max_qp;
++	resp->max_qp_wr = attr.max_qp_wr;
++	resp->device_cap_flags = attr.device_cap_flags;
++	resp->max_sge = attr.max_sge;
++	resp->max_sge_rd = attr.max_sge_rd;
++	resp->max_cq = attr.max_cq;
++	resp->max_cqe = attr.max_cqe;
++	resp->max_mr = attr.max_mr;
++	resp->max_pd = attr.max_pd;
++	resp->max_qp_rd_atom = attr.max_qp_rd_atom;
++	resp->max_ee_rd_atom = attr.max_ee_rd_atom;
++	resp->max_res_rd_atom = attr.max_res_rd_atom;
++	resp->max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
++	resp->max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
++	resp->atomic_cap = attr.atomic_cap;
++	resp->masked_atomic_cap = attr.masked_atomic_cap;
++	resp->max_ee = attr.max_ee;
++	resp->max_rdd = attr.max_rdd;
++	resp->max_mw = attr.max_mw;
++	resp->max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
++	resp->max_raw_ethy_qp = attr.max_raw_ethy_qp;
++	resp->max_mcast_grp = attr.max_mcast_grp;
++	resp->max_mcast_qp_attach = attr.max_mcast_qp_attach;
++	resp->max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
++	resp->max_ah = attr.max_ah;
++	resp->max_fmr = attr.max_fmr;
++	resp->max_map_per_fmr = attr.max_map_per_fmr;
++	resp->max_srq = attr.max_srq;
++	resp->max_srq_wr = attr.max_srq_wr;
++	resp->max_srq_sge = attr.max_srq_sge;
++	
resp->max_fast_reg_page_list_len = attr.max_fast_reg_page_list_len; ++ resp->max_pkeys = attr.max_pkeys; ++ resp->local_ca_ack_delay = attr.local_ca_ack_delay; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_port(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_query_port_cmd *cmd; ++ struct ibp_query_port_resp *resp; ++ struct ib_port_attr attr; ++ size_t len; ++ int ret; ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_port_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_port(device->ib_dev, cmd->port_num, &attr); ++ if (ret) { ++ print_err("ib_query_port returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_query_port_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->state = attr.state; ++ resp->max_mtu = attr.max_mtu; ++ resp->active_mtu = attr.active_mtu; ++ resp->gid_tbl_len = attr.gid_tbl_len; ++ resp->port_cap_flags = attr.port_cap_flags; ++ resp->max_msg_sz = attr.max_msg_sz; ++ resp->bad_pkey_cntr = attr.bad_pkey_cntr; ++ resp->qkey_viol_cntr = attr.qkey_viol_cntr; ++ resp->pkey_tbl_len = attr.pkey_tbl_len; ++ resp->lid = attr.lid; ++ resp->sm_lid = attr.sm_lid; ++ resp->lmc = attr.lmc; ++ resp->max_vl_num = attr.max_vl_num; ++ resp->sm_sl = attr.sm_sl; ++ resp->subnet_timeout = attr.subnet_timeout; ++ resp->init_type_reply = attr.init_type_reply; ++ resp->active_width = attr.active_width; ++ resp->active_speed = attr.active_speed; ++ resp->phys_state = attr.phys_state; ++ resp->link_layer = rdma_port_get_link_layer(device->ib_dev, ++ cmd->port_num); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_gid(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_query_gid_cmd *cmd; ++ struct ibp_query_gid_resp *resp; ++ size_t len; ++ union ib_gid gid; ++ int ret; ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_gid_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_gid(device->ib_dev, cmd->port_num, cmd->index, &gid); ++ if (ret) { ++ print_err("ib_query_gid returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_query_gid_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->subnet_prefix = gid.global.subnet_prefix; ++ resp->interface_id = gid.global.interface_id; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_pkey(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_query_pkey_cmd *cmd; ++ struct ibp_query_pkey_resp *resp; ++ size_t len; ++ u16 pkey; ++ int ret; ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_pkey_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_pkey(device->ib_dev, cmd->port_num, cmd->index, &pkey); ++ if (ret) { ++ print_err("ib_query_pkey returned %d\n", ret); ++ goto send_resp; ++ } ++ resp = (struct ibp_query_pkey_resp *) 
msg->data; ++ len += sizeof(*resp); ++ ++ resp->pkey = pkey; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static void ibp_async_event(struct work_struct *work) ++{ ++ struct ibp_event *event; ++ struct ibp_async_event_msg msg; ++ ++ event = container_of(work, struct ibp_event, work); ++ ++ IBP_INIT_MSG(NULL, &msg, sizeof(msg), ASYNC_EVENT); ++ ++ msg.data.context = (uintptr_t) event->context; ++ msg.data.type = event->type; ++ ++ ibp_send(event->client->ep, &msg, sizeof(msg)); ++ ++ ibp_add_to_stack(a_stack, (void *) event); ++} ++ ++static void ibp_event_handler(struct ib_event_handler *handler, ++ struct ib_event *ibevent) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_client *client; ++ struct ibp_event *event; ++ ++ ucontext = container_of(handler, struct ibp_ucontext, event_handler); ++ ++ if (ucontext->ibucontext->closing) { ++ print_dbg("ignoring event, connection closing\n"); ++ return; ++ } ++ ++ event = (struct ibp_event *) ++ ibp_pull_from_stack(a_stack, sizeof(*event), GFP_ATOMIC); ++ if (!event) { ++ print_err("kalloc failed\n"); ++ return; ++ } ++ ++ client = ucontext->client; ++ ++ event->client = client; ++ event->context = ibevent->element.port_num; ++ event->type = ibevent->event; ++ event->ibdev = ucontext->ibdev; ++ ++ INIT_WORK(&event->work, ibp_async_event); ++ queue_work(client->workqueue, &event->work); ++} ++ ++static int ibp_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ib_ucontext *ibucontext; ++ ++ ucontext = filp->private_data; ++ ibucontext = ucontext->ibucontext; ++ ++ return (ibucontext->device->mmap) ? ++ ibucontext->device->mmap(ibucontext, vma) : -ENOSYS; ++} ++ ++static const struct file_operations ibp_fops = { ++ .mmap = ibp_mmap, ++}; ++ ++static int ibp_cmd_alloc_ucontext(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_alloc_ucontext_cmd *cmd; ++ struct ibp_alloc_ucontext_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_ucontext *ibucontext; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_alloc_ucontext_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_alloc_ucontext_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ /* Workaround for len check in mlx5 driver (no impact to others) */ ++ len += sizeof(struct ib_uverbs_cmd_hdr); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ ret = ibp_get_device(device); ++ if (ret) { ++ print_err("ibp_get_device returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); ++ if (!ucontext) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto err1; ++ } ++ ucontext->device = device; ++ ++ ibucontext = device->ib_dev->alloc_ucontext(device->ib_dev, &udata); ++ if (IS_ERR(ibucontext)) { ++ ret = PTR_ERR(ibucontext); ++ print_err("Invalid ibucontext %p\n", ibucontext); ++ goto err2; ++ } ++ ++#ifdef MOFED ++ ibucontext->peer_mem_name = ibp_peer_mem.name; ++ ibucontext->peer_mem_private_data = ucontext; ++#else ++ ibucontext->umem_ops = &ibp_umem; ++ ibucontext->umem_private_data = ucontext; ++#endif ++ ++ ibucontext->device = device->ib_dev; 
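++ /* Initialize the core ib_ucontext fields and per-resource lists by hand; ib_uverbs is not involved on this path. */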
++ ibucontext->closing = 0; ++ ++ INIT_LIST_HEAD(&ibucontext->pd_list); ++ INIT_LIST_HEAD(&ibucontext->mr_list); ++ INIT_LIST_HEAD(&ibucontext->mw_list); ++ INIT_LIST_HEAD(&ibucontext->cq_list); ++ INIT_LIST_HEAD(&ibucontext->qp_list); ++ INIT_LIST_HEAD(&ibucontext->srq_list); ++ INIT_LIST_HEAD(&ibucontext->ah_list); ++ INIT_LIST_HEAD(&ibucontext->xrcd_list); ++ ++ ucontext->filp = anon_inode_getfile("["DRV_NAME"]", &ibp_fops, ++ ucontext, O_RDWR); ++ if (IS_ERR(ucontext->filp)) { ++ ret = PTR_ERR(ucontext->filp); ++ print_err("anon_inode_getfile returned %d\n", ret); ++ goto err3; ++ } ++ ++ if (cmd->ibdev) { ++ ucontext->ibdev = cmd->ibdev; ++ INIT_IB_EVENT_HANDLER(&ucontext->event_handler, device->ib_dev, ++ ibp_event_handler); ++ ret = ib_register_event_handler(&ucontext->event_handler); ++ if (ret) { ++ print_err("event_handler returned %d\n", ret); ++ goto err4; ++ } ++ } ++ ++ ucontext->client = client; ++ ucontext->ibucontext = ibucontext; ++ mutex_init(&ucontext->mutex); ++ INIT_LIST_HEAD(&ucontext->mmap_list); ++ ucontext->reg_tree = RB_ROOT; ++ ++ mutex_lock(&client->ucontext_mutex); ++ list_add_tail(&ucontext->list, &client->ucontext_list); ++ mutex_unlock(&client->ucontext_mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->ucontext = (uintptr_t)ucontext; ++ ++ goto send_resp; ++ ++err4: ++ fput(ucontext->filp); ++err3: ++ device->ib_dev->dealloc_ucontext(ibucontext); ++err2: ++ kfree(ucontext); ++err1: ++ ibp_put_device(device); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_dealloc_ucontext(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_dealloc_ucontext_cmd *cmd; ++ struct ibp_queued_response_msg *msg; ++ struct ibp_ucontext *ucontext; ++ struct ib_ucontext *ibucontext; ++ size_t len; ++ int ret = -EINVAL; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_dealloc_ucontext_cmd *) hdr; ++ ucontext = (struct ibp_ucontext *) cmd->ucontext; ++ msg = (struct ibp_queued_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ if (IS_NULL_OR_ERR(ucontext)) { ++ print_err("Invalid ucontext %p\n", ucontext); ++ goto send_resp; ++ } ++ ++ ibucontext = ucontext->ibucontext; ++ ++ if (ucontext->ibdev) ++ ib_unregister_event_handler(&ucontext->event_handler); ++ ++ fput(ucontext->filp); ++ ++ if (device && device->ib_dev) { ++ ret = device->ib_dev->dealloc_ucontext(ibucontext); ++ if (ret) { ++ print_err("ib_dealloc_ucontext returned %d\n", ret); ++ goto send_resp; ++ } ++ } ++ ++ mutex_lock(&client->ucontext_mutex); ++ list_del(&ucontext->list); ++ mutex_unlock(&client->ucontext_mutex); ++ ++ ibp_put_device(device); ++ kfree(ucontext); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret); ++ return ibp_queue_response(client, msg); ++} ++ ++static void ibp_dereg_buf(struct kref *ref) ++{ ++ struct ibp_reg *reg; ++ struct ibp_ucontext *ucontext; ++ ++ reg = container_of(ref, struct ibp_reg, ref); ++ ucontext = reg->ucontext; ++ ++ if (!RB_EMPTY_NODE(®->node)) { ++ mutex_lock(&ucontext->mutex); ++ rb_erase(®->node, &ucontext->reg_tree); ++ mutex_unlock(&ucontext->mutex); ++ } ++ ++ if (reg->range) ++ scif_put_pages(reg->range); ++ ++ kfree(reg); ++} ++ ++static struct ibp_reg *__ibp_insert_reg_buf(struct ibp_ucontext *ucontext, ++ struct ibp_reg *reg) ++{ ++ struct rb_node 
**link; ++ struct rb_node *parent; ++ struct ibp_reg *cur_reg; ++ ++ link = &ucontext->reg_tree.rb_node; ++ parent = NULL; ++ ++ while (*link) { ++ parent = *link; ++ cur_reg = rb_entry(parent, struct ibp_reg, node); ++ ++#ifdef MOFED ++ if ((reg->virt_addr == cur_reg->virt_addr) && ++ (reg->length == cur_reg->length)) ++ return cur_reg; ++#else ++ if ((reg->virt_addr == cur_reg->virt_addr) && ++ (reg->length == cur_reg->length) && ++ (reg->access == cur_reg->access)) ++ return cur_reg; ++#endif ++ ++ if (reg->virt_addr < cur_reg->virt_addr) ++ link = &(*link)->rb_left; ++ else if (reg->virt_addr > cur_reg->virt_addr) ++ link = &(*link)->rb_right; ++ else if (reg->length < cur_reg->length) ++ link = &(*link)->rb_left; ++ else if (reg->length > cur_reg->length) ++ link = &(*link)->rb_right; ++#ifndef MOFED ++ else if (reg->access < cur_reg->access) ++ link = &(*link)->rb_left; ++#endif ++ else ++ link = &(*link)->rb_right; ++ } ++ ++ rb_link_node(®->node, parent, link); ++ rb_insert_color(®->node, &ucontext->reg_tree); ++ ++ return NULL; ++} ++ ++static struct ibp_reg *ibp_reg_buf(struct ibp_ucontext *ucontext, ++ u64 virt_addr, u64 scif_addr, u64 length, ++ u64 offset, u32 access) ++{ ++ struct ibp_reg *reg; ++ struct ibp_reg *cur_reg; ++ int ret; ++ ++ reg = kzalloc(sizeof(*reg), GFP_KERNEL); ++ if (!reg) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ kref_init(®->ref); ++ RB_CLEAR_NODE(®->node); ++ reg->ucontext = ucontext; ++ reg->virt_addr = virt_addr; ++ reg->length = length; ++ reg->offset = offset; ++ reg->access = access; ++ ++ ret = scif_get_pages(ucontext->client->ep, scif_addr, ++ PAGE_ALIGN(reg->length + ++ (reg->virt_addr & ~PAGE_MASK)), ++ ®->range); ++ if (ret) { ++ print_err("scif_get_pages returned %d\n", ret); ++ kref_put(®->ref, ibp_dereg_buf); ++ return ERR_PTR(ret); ++ } ++ ++ mutex_lock(&ucontext->mutex); ++ ++ cur_reg = __ibp_insert_reg_buf(ucontext, reg); ++ if (cur_reg) { ++ print_dbg("__ibp_insert_reg_buf duplicate entry\n"); ++ kref_get(&cur_reg->ref); ++ } ++ ++ mutex_unlock(&ucontext->mutex); ++ ++ if (cur_reg) { ++ kref_put(®->ref, ibp_dereg_buf); ++ reg = cur_reg; ++ } ++ ++ return reg; ++} ++ ++static int ibp_cmd_reg_buf(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_reg_buf_cmd *cmd; ++ struct ibp_reg_buf_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_reg *reg; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_reg_buf_cmd *) hdr; ++ ucontext = (struct ibp_ucontext *) cmd->ucontext; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ reg = ibp_reg_buf(ucontext, cmd->virt_addr, cmd->scif_addr, ++ cmd->length, cmd->offset, cmd->access); ++ if (IS_ERR(reg)) { ++ ret = PTR_ERR(reg); ++ print_err("ibp_reg_buf returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_reg_buf_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->reg = (uintptr_t)reg; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_dereg_buf(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_dereg_buf_cmd *cmd; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_reg *reg; ++ size_t len; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) 
hdr->device; ++ cmd = (struct ibp_dereg_buf_cmd *) hdr; ++ reg = (struct ibp_reg *) cmd->reg; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ kref_put(®->ref, ibp_dereg_buf); ++ ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, 0); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_convert_prot_flags(unsigned long prot) ++{ ++ int prot_flags; ++ ++ prot_flags = 0; ++ ++ if (prot & PROT_READ) ++ prot_flags |= SCIF_PROT_READ; ++ ++ if (prot & PROT_WRITE) ++ prot_flags |= SCIF_PROT_WRITE; ++ ++ return prot_flags; ++} ++ ++static int ibp_convert_map_flags(unsigned long flags) ++{ ++ int map_flags; ++ ++ map_flags = SCIF_MAP_KERNEL; ++ ++ if (flags & MAP_FIXED) ++ map_flags |= SCIF_MAP_FIXED; ++ ++ return map_flags; ++} ++ ++static int ibp_scif_register(struct ibp_client *client, struct ibp_mmap *mmap, ++ unsigned long flags) ++{ ++ struct vm_area_struct *vma; ++ unsigned long npages; ++ unsigned long pfn; ++ int offset; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ offset = mmap->vaddr & ~PAGE_MASK; ++ npages = PAGE_ALIGN(mmap->len + offset) >> PAGE_SHIFT; ++ if (npages != 1) { ++ print_err("request %lu but only one page supported\n", npages); ++ return -EINVAL; ++ } ++ ++ down_write(¤t->mm->mmap_sem); ++ vma = find_vma(current->mm, mmap->vaddr); ++ if (!vma) { ++ up_write(¤t->mm->mmap_sem); ++ print_err("find_vma failed\n"); ++ return -EFAULT; ++ } ++ ++ ret = follow_pfn(vma, mmap->vaddr, &pfn); ++ ++ up_write(¤t->mm->mmap_sem); ++ if (ret) { ++ print_err("follow_pfn returned %d\n", ret); ++ return ret; ++ } ++ ++ mmap->io_addr = ioremap(page_to_phys(pfn_to_page(pfn)), mmap->len); ++ if (!mmap->io_addr) { ++ print_err("ioremap failed\n"); ++ return -ENOMEM; ++ } ++ ++ mmap->scif_addr = scif_register(client->ep, (void *) mmap->io_addr, ++ mmap->len, (off_t) mmap->io_addr, ++ ibp_convert_prot_flags(mmap->prot), ++ ibp_convert_map_flags(flags)); ++ if (IS_ERR_VALUE(mmap->scif_addr)) { ++ ret = mmap->scif_addr; ++ print_err("scif_register returned %d\n", ret); ++ goto err0; ++ ++ } ++ ++ return 0; ++err0: ++ iounmap(mmap->io_addr); ++ return ret; ++} ++ ++static ++void ibp_scif_unregister(struct ibp_client *client, struct ibp_mmap *mmap) ++{ ++ int ret; ++ ++ print_trace("in\n"); ++ ++ ret = scif_unregister(client->ep, mmap->scif_addr, mmap->len); ++ if (ret) { ++ if (ret == -ECONNRESET) ++ print_dbg("scif connection reset\n"); ++ else ++ print_err("scif_unregister returned %d\n", ret); ++ } ++ ++ iounmap(mmap->io_addr); ++} ++ ++static int ibp_cmd_mmap(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_mmap_cmd *cmd; ++ struct ibp_mmap_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_mmap *mmap; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_mmap_cmd *) hdr; ++ ucontext = (struct ibp_ucontext *) cmd->ucontext; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ mmap = kzalloc(sizeof(*mmap), GFP_KERNEL); ++ if (!mmap) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ mmap->ucontext = ucontext; ++ mmap->len = cmd->len; ++ mmap->prot = cmd->prot; ++ ++ /* The mmap syscall ignores these bits; do the same here. 
*/ ++ cmd->flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ down_write(¤t->mm->mmap_sem); ++ mmap->vaddr = do_mmap_pgoff(ucontext->filp, 0, cmd->len, ++ cmd->prot, cmd->flags, cmd->pgoff); ++ up_write(¤t->mm->mmap_sem); ++#else ++ mmap->vaddr = vm_mmap(ucontext->filp, 0, cmd->len, cmd->prot, ++ cmd->flags, cmd->pgoff << PAGE_SHIFT); ++#endif ++ ++ if (mmap->vaddr & ~PAGE_MASK) { ++ ret = mmap->vaddr; ++ print_err("mmap returned %d\n", ret); ++ goto err1; ++ } ++ ++ ret = ibp_scif_register(client, mmap, cmd->flags); ++ if (ret) { ++ print_err("ibp_scif_register returned %d\n", ret); ++ goto err2; ++ } ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&mmap->list, &ucontext->mmap_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ resp = (struct ibp_mmap_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->scif_addr = mmap->scif_addr; ++ resp->mmap = (uintptr_t)mmap; ++ ++ goto send_resp; ++err2: ++ MUNMAP(current->mm, mmap->vaddr, cmd->len); ++err1: ++ kfree(mmap); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_unmmap(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_unmmap_cmd *cmd; ++ struct ibp_mmap *mmap; ++ struct ibp_verb_response_msg *msg; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_unmmap_cmd *) hdr; ++ mmap = (struct ibp_mmap *) cmd->mmap; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ if (IS_NULL_OR_ERR(mmap)) { ++ print_err("Invalid mmap %p\n", mmap); ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ ibp_scif_unregister(client, mmap); ++ ++ if (IS_NULL_OR_ERR(current) || IS_NULL_OR_ERR(current->mm)) { ++ print_err("Invalid current mm pointer\n"); ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ MUNMAP(current->mm, mmap->vaddr, mmap->len); ++ ++ if (mmap->ucontext) { ++ mutex_lock(&mmap->ucontext->mutex); ++ list_del(&mmap->list); ++ mutex_unlock(&mmap->ucontext->mutex); ++ } ++ ++ kfree(mmap); ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static struct ib_uobject *ibp_create_uobj(struct ibp_ucontext *ucontext) ++{ ++ static struct lock_class_key __key; ++ struct ib_uobject *uobj; ++ ++ if (IS_NULL_OR_ERR(ucontext)) ++ return ERR_PTR(-EINVAL); ++ ++ uobj = (struct ib_uobject *) ++ ibp_pull_from_stack(o_stack, sizeof(*uobj), GFP_ATOMIC); ++ if (!uobj) ++ return ERR_PTR(-ENOMEM); ++ ++ /* ++ * the uobj struct is updated since this is kernel-to-kernel, ++ * so this structure is not fully setup as in ib_uverbs. 
++ */ ++ uobj->context = ucontext->ibucontext; ++ uobj->user_handle = (uintptr_t)ucontext; ++ kref_init(&uobj->ref); ++ init_rwsem(&uobj->mutex); ++ lockdep_set_class(&uobj->mutex, &__key); ++ uobj->live = 1; ++ ++ return uobj; ++} ++ ++static void ibp_destroy_uobj(struct ib_uobject *uobj) ++{ ++ struct ibp_ucontext *ucontext; ++ ++ if (!IS_NULL_OR_ERR(uobj)) { ++ ucontext = (struct ibp_ucontext *) uobj->user_handle; ++ if (ucontext) { ++ mutex_lock(&ucontext->mutex); ++ list_del(&uobj->list); ++ mutex_unlock(&ucontext->mutex); ++ } ++ ++ ibp_add_to_stack(o_stack, (void *) uobj); ++ } ++} ++ ++static int ibp_cmd_alloc_pd(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_alloc_pd_cmd *cmd; ++ struct ibp_alloc_pd_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_udata udata; ++ struct ib_pd *pd; ++ size_t len; ++ size_t outlen; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_alloc_pd_cmd *) hdr; ++ ucontext = (struct ibp_ucontext *) cmd->ucontext; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_alloc_pd_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ pd = device->ib_dev->alloc_pd(device->ib_dev, ucontext->ibucontext, ++ &udata); ++ if (IS_ERR(pd)) { ++ ret = PTR_ERR(pd); ++ print_err("ib_alloc_pd returned %d\n", ret); ++ /* ++ * Clear uobj's user_handle as destroy_uobj tries to list_del ++ * uobj from the list and uobj has NOT been added yet ++ */ ++ uobj->user_handle = 0; ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ } ++ ++ pd->device = device->ib_dev; ++ atomic_set(&pd->usecnt, 0); ++ ++ pd->uobject = uobj; ++ uobj->object = pd; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->pd_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->pd = (uintptr_t)pd; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_dealloc_pd(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_dealloc_pd_cmd *cmd; ++ struct ibp_verb_response_msg *msg; ++ struct ib_uobject *uobj; ++ struct ib_pd *pd; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_dealloc_pd_cmd *) hdr; ++ pd = (struct ib_pd *) cmd->pd; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ if (IS_NULL_OR_ERR(pd)) { ++ print_err("Invalid pd %p\n", pd); ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ uobj = pd->uobject; ++ ++ ret = ib_dealloc_pd(pd); ++ if (unlikely(ret == -EBUSY)) { ++ msleep(100); ++ ret = ib_dealloc_pd(pd); ++ } ++ if (ret) { ++ print_err("ib_dealloc_pd returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int 
ibp_cmd_create_ah(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_create_ah_cmd *cmd; ++ struct ibp_create_ah_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_pd *pd; ++ struct ib_ah *ah; ++ struct ib_ah_attr attr; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_create_ah_cmd *) hdr; ++ pd = (struct ib_pd *) cmd->pd; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle; ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ memset(&attr, 0, sizeof(attr)); ++ ++ attr.dlid = cmd->ah_attr.dlid; ++ attr.sl = cmd->ah_attr.sl; ++ attr.src_path_bits = cmd->ah_attr.src_path_bits; ++ attr.static_rate = cmd->ah_attr.static_rate; ++ attr.ah_flags = cmd->ah_attr.ah_flags; ++ attr.port_num = cmd->ah_attr.port_num; ++ attr.grh.dgid.global.subnet_prefix = ++ cmd->ah_attr.grh.dgid_subnet_prefix; ++ attr.grh.dgid.global.interface_id = cmd->ah_attr.grh.dgid_interface_id; ++ attr.grh.flow_label = cmd->ah_attr.grh.flow_label; ++ attr.grh.sgid_index = cmd->ah_attr.grh.sgid_index; ++ attr.grh.hop_limit = cmd->ah_attr.grh.hop_limit; ++ attr.grh.traffic_class = cmd->ah_attr.grh.traffic_class; ++ ++ ah = ib_create_ah(pd, &attr); ++ if (IS_ERR(ah)) { ++ ret = PTR_ERR(ah); ++ print_err("ib_create_ah returned %d\n", ret); ++ /* ++ * Clear uobj's user_handle as destroy_uobj tries to list_del ++ * uobj from the list and uobj has NOT been added yet ++ */ ++ uobj->user_handle = 0; ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ } ++ ++ ah->uobject = uobj; ++ uobj->object = ah; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->ah_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ resp = (struct ibp_create_ah_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->ah = (uintptr_t) ah; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_ah(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_query_ah_cmd *cmd; ++ struct ibp_query_ah_resp *resp; ++ struct ibp_verb_response_msg *msg; ++ struct ib_ah *ah; ++ struct ib_ah_attr attr; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_ah_cmd *) hdr; ++ ah = (struct ib_ah *) cmd->ah; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_ah(ah, &attr); ++ if (ret) { ++ print_err("ib_query_ah returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_query_ah_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->attr.dlid = attr.dlid; ++ resp->attr.sl = attr.sl; ++ resp->attr.src_path_bits = attr.src_path_bits; ++ resp->attr.static_rate = attr.static_rate; ++ resp->attr.ah_flags = attr.ah_flags; ++ resp->attr.port_num = attr.port_num; ++ resp->attr.grh.dgid_subnet_prefix = attr.grh.dgid.global.subnet_prefix; ++ resp->attr.grh.dgid_interface_id = attr.grh.dgid.global.interface_id; ++ resp->attr.grh.flow_label = attr.grh.flow_label; ++ resp->attr.grh.sgid_index = attr.grh.sgid_index; ++ resp->attr.grh.hop_limit = 
attr.grh.hop_limit; ++ resp->attr.grh.traffic_class = attr.grh.traffic_class; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_destroy_ah(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_destroy_ah_cmd *cmd; ++ struct ib_uobject *uobj; ++ struct ib_ah *ah; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_destroy_ah_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ ah = (struct ib_ah *) cmd->ah; ++ len = sizeof(*msg); ++ ++ uobj = ah->uobject; ++ ++ ret = ib_destroy_ah(ah); ++ if (ret) { ++ print_err("ib_destroy_ah returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static void ibp_ibsrq_event(struct ib_event *ibevent, void *srq_context) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_client *client; ++ struct ibp_event *event; ++ struct ib_uobject *uobj; ++ ++ print_trace("in\n"); ++ ++ event = kmalloc(sizeof(*event), GFP_ATOMIC); ++ if (!event) { ++ print_err("kalloc failed\n"); ++ return; ++ } ++ ++ uobj = ibevent->element.srq->uobject; ++ ucontext = (struct ibp_ucontext *) uobj->user_handle; ++ client = ucontext->client; ++ ++ event->client = client; ++ event->context = (uintptr_t) srq_context; ++ event->type = ibevent->event; ++ event->ibdev = ucontext->ibdev; ++ ++ INIT_WORK(&event->work, ibp_async_event); ++ queue_work(client->workqueue, &event->work); ++} ++ ++static int ibp_cmd_create_srq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_create_srq_cmd *cmd; ++ struct ibp_create_srq_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_pd *pd; ++ struct ib_srq *srq; ++ struct ib_srq_init_attr init_attr; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_create_srq_cmd *) hdr; ++ pd = (struct ib_pd *) cmd->pd; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_create_srq_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle; ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ memset(&init_attr, 0, sizeof(init_attr)); ++ ++ init_attr.event_handler = ibp_ibsrq_event; ++ init_attr.srq_context = (void *) cmd->srq_context; ++ init_attr.attr.max_wr = cmd->attr.max_wr; ++ init_attr.attr.max_sge = cmd->attr.max_sge; ++ init_attr.attr.srq_limit = cmd->attr.srq_limit; ++ ++ srq = device->ib_dev->create_srq(pd, &init_attr, &udata); ++ if (IS_ERR(srq)) { ++ ret = PTR_ERR(srq); ++ print_err("ib_create_srq returned %d\n", ret); ++ /* ++ * Clear uobj's user_handle as destroy_uobj tries to list_del ++ * uobj from the list and uobj has NOT been added yet ++ */ ++ uobj->user_handle = 0; ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ 
} ++ ++ srq->device = device->ib_dev; ++ srq->pd = pd; ++ srq->event_handler = init_attr.event_handler; ++ srq->srq_context = init_attr.srq_context; ++ srq->srq_type = 0; ++ srq->ext.xrc.cq = NULL; ++ srq->ext.xrc.xrcd = NULL; ++ ++ atomic_inc(&pd->usecnt); ++ atomic_set(&srq->usecnt, 0); ++ ++ srq->uobject = uobj; ++ uobj->object = srq; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->srq_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->srq = (uintptr_t)srq; ++ resp->attr.max_wr = init_attr.attr.max_wr; ++ resp->attr.max_sge = init_attr.attr.max_sge; ++ resp->attr.srq_limit = init_attr.attr.srq_limit; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_modify_srq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_modify_srq_cmd *cmd; ++ struct ibp_modify_srq_resp *resp; ++ struct ib_srq *srq; ++ struct ib_srq_attr attr; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_modify_srq_cmd *) hdr; ++ srq = (struct ib_srq *) cmd->srq; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_modify_srq_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ memset(&attr, 0, sizeof(attr)); ++ ++ attr.max_wr = cmd->attr.max_wr; ++ attr.max_sge = cmd->attr.max_sge; ++ attr.srq_limit = cmd->attr.srq_limit; ++ ++ ret = device->ib_dev->modify_srq(srq, &attr, cmd->srq_attr_mask, ++ &udata); ++ if (ret) { ++ print_err("ib_modify_srq returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->attr.max_wr = attr.max_wr; ++ resp->attr.max_sge = attr.max_sge; ++ resp->attr.srq_limit = attr.srq_limit; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_srq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_query_srq_cmd *cmd; ++ struct ibp_query_srq_resp *resp; ++ struct ib_srq *srq; ++ struct ib_srq_attr attr; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_srq_cmd *) hdr; ++ srq = (struct ib_srq *) cmd->srq; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_srq(srq, &attr); ++ if (ret) { ++ print_err("ib_query_srq returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_query_srq_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->attr.max_wr = attr.max_wr; ++ resp->attr.max_sge = attr.max_sge; ++ resp->attr.srq_limit = attr.srq_limit; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_destroy_srq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_queued_response_msg *msg; 
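++ /* Reply via the queued-response path so it stays serialized behind any async SRQ events already on the client workqueue. */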
++ struct ibp_destroy_srq_cmd *cmd; ++ struct ib_uobject *uobj; ++ struct ib_srq *srq; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_destroy_srq_cmd *) hdr; ++ srq = (struct ib_srq *) cmd->srq; ++ msg = (struct ibp_queued_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ uobj = srq->uobject; ++ ++ ret = ib_destroy_srq(srq); ++ if (unlikely(ret == -EBUSY)) { ++ msleep(100); ++ ret = ib_destroy_srq(srq); ++ } ++ if (ret) { ++ print_err("ib_destroy_srq returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret); ++ return ibp_queue_response(client, msg); ++} ++ ++static void ibp_ibqp_event(struct ib_event *ibevent, void *qp_context) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_client *client; ++ struct ibp_event *event; ++ struct ib_uobject *uobj; ++ ++ event = kmalloc(sizeof(*event), GFP_ATOMIC); ++ if (!event) { ++ print_err("kalloc failed\n"); ++ return; ++ } ++ ++ uobj = ibevent->element.qp->uobject; ++ ucontext = (struct ibp_ucontext *) uobj->user_handle; ++ client = ucontext->client; ++ ++ event->client = client; ++ event->context = (uintptr_t) qp_context; ++ event->type = ibevent->event; ++ event->ibdev = ucontext->ibdev; ++ ++ INIT_WORK(&event->work, ibp_async_event); ++ queue_work(client->workqueue, &event->work); ++} ++ ++static int ibp_cmd_create_qp(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_create_qp_cmd *cmd; ++ struct ibp_create_qp_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_pd *pd; ++ struct ibp_qp *qp; ++ struct ib_qp_init_attr init_attr; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_create_qp_cmd *) hdr; ++ pd = (struct ib_pd *) cmd->pd; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_create_qp_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ qp = kzalloc(sizeof *qp, GFP_KERNEL); ++ if (!qp) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ INIT_LIST_HEAD(&qp->mcast); ++ ++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle; ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ memset(&init_attr, 0, sizeof(init_attr)); ++ ++ init_attr.send_cq = (struct ib_cq *) cmd->send_cq; ++ init_attr.recv_cq = (struct ib_cq *) cmd->recv_cq; ++ init_attr.srq = (struct ib_srq *) cmd->srq; ++ init_attr.xrcd = (struct ib_xrcd *) cmd->xrc_domain; ++ init_attr.cap.max_send_wr = cmd->cap.max_send_wr; ++ init_attr.cap.max_recv_wr = cmd->cap.max_recv_wr; ++ init_attr.cap.max_send_sge = cmd->cap.max_send_sge; ++ init_attr.cap.max_recv_sge = cmd->cap.max_recv_sge; ++ init_attr.cap.max_inline_data = cmd->cap.max_inline_data; ++ init_attr.sq_sig_type = cmd->sq_sig_type; ++ init_attr.qp_type = cmd->qp_type; ++ init_attr.create_flags = cmd->create_flags; ++ init_attr.port_num = cmd->port_num; ++ ++ qp->ibqp = device->ib_dev->create_qp(pd, &init_attr, &udata); ++ if (IS_ERR(qp->ibqp)) { ++ ret = 
PTR_ERR(qp->ibqp); ++ print_err("ib_create_qp returned %d\n", ret); ++ /* ++ * Clear uobj's user_handle as destroy_uobj tries to list_del ++ * uobj from the list and uobj has NOT been added yet ++ */ ++ uobj->user_handle = 0; ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ } ++ ++ qp->ibqp->device = device->ib_dev; ++ qp->ibqp->pd = pd; ++ qp->ibqp->send_cq = init_attr.send_cq; ++ qp->ibqp->recv_cq = init_attr.recv_cq; ++ qp->ibqp->srq = init_attr.srq; ++ qp->ibqp->event_handler = ibp_ibqp_event; ++ qp->ibqp->qp_context = (void *) cmd->qp_context; ++ qp->ibqp->qp_type = init_attr.qp_type; ++ ++ if (qp->ibqp->qp_type == IB_QPT_XRC_TGT) { ++ qp->ibqp->xrcd = init_attr.xrcd; ++ atomic_inc(&qp->ibqp->xrcd->usecnt); ++ } else { ++ qp->ibqp->xrcd = NULL; ++ qp->ibqp->real_qp = qp->ibqp; ++ } ++ atomic_set(&qp->ibqp->usecnt, 0); ++ ++ atomic_inc(&pd->usecnt); ++ atomic_inc(&init_attr.send_cq->usecnt); ++ atomic_inc(&init_attr.recv_cq->usecnt); ++ ++ if (init_attr.srq) ++ atomic_inc(&init_attr.srq->usecnt); ++ ++ qp->ibqp->uobject = uobj; ++ uobj->object = qp; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->qp_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->qp = (uintptr_t) qp; ++ resp->qpn = qp->ibqp->qp_num; ++ resp->cap.max_send_wr = init_attr.cap.max_send_wr; ++ resp->cap.max_recv_wr = init_attr.cap.max_recv_wr; ++ resp->cap.max_send_sge = init_attr.cap.max_send_sge; ++ resp->cap.max_recv_sge = init_attr.cap.max_recv_sge; ++ resp->cap.max_inline_data = init_attr.cap.max_inline_data; ++ ++send_resp: ++ if (ret) ++ kfree(qp); ++ ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_modify_qp(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_modify_qp_cmd *cmd; ++ struct ibp_modify_qp_resp *resp; ++ struct ibp_qp *qp; ++ struct ib_qp_attr attr; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_modify_qp_cmd *) hdr; ++ qp = (struct ibp_qp *) cmd->qp; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_modify_qp_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ memset(&attr, 0, sizeof(attr)); ++ ++ attr.qp_state = cmd->qp_state; ++ attr.cur_qp_state = cmd->cur_qp_state; ++ attr.path_mtu = cmd->path_mtu; ++ attr.path_mig_state = cmd->path_mig_state; ++ attr.qkey = cmd->qkey; ++ attr.rq_psn = cmd->rq_psn; ++ attr.sq_psn = cmd->sq_psn; ++ attr.dest_qp_num = cmd->dest_qp_num; ++ attr.qp_access_flags = cmd->qp_access_flags; ++ attr.cap.max_send_wr = cmd->cap.max_send_wr; ++ attr.cap.max_recv_wr = cmd->cap.max_recv_wr; ++ attr.cap.max_send_sge = cmd->cap.max_send_sge; ++ attr.cap.max_recv_sge = cmd->cap.max_recv_sge; ++ attr.cap.max_inline_data = cmd->cap.max_inline_data; ++ attr.ah_attr.grh.dgid.global.subnet_prefix = ++ cmd->ah.grh.dgid_subnet_prefix; ++ attr.ah_attr.grh.dgid.global.interface_id = ++ cmd->ah.grh.dgid_interface_id; ++ attr.ah_attr.grh.flow_label = cmd->ah.grh.flow_label; ++ attr.ah_attr.grh.sgid_index = cmd->ah.grh.sgid_index; ++ attr.ah_attr.grh.hop_limit = 
cmd->ah.grh.hop_limit; ++ attr.ah_attr.grh.traffic_class = cmd->ah.grh.traffic_class; ++ attr.ah_attr.dlid = cmd->ah.dlid; ++ attr.ah_attr.sl = cmd->ah.sl; ++ attr.ah_attr.src_path_bits = cmd->ah.src_path_bits; ++ attr.ah_attr.static_rate = cmd->ah.static_rate; ++ attr.ah_attr.ah_flags = cmd->ah.ah_flags; ++ attr.ah_attr.port_num = cmd->ah.port_num; ++ attr.alt_ah_attr.grh.dgid.global.subnet_prefix = ++ cmd->alt_ah.grh.dgid_subnet_prefix; ++ attr.alt_ah_attr.grh.dgid.global.interface_id = ++ cmd->alt_ah.grh.dgid_interface_id; ++ attr.alt_ah_attr.grh.flow_label = cmd->alt_ah.grh.flow_label; ++ attr.alt_ah_attr.grh.sgid_index = cmd->alt_ah.grh.sgid_index; ++ attr.alt_ah_attr.grh.hop_limit = cmd->alt_ah.grh.hop_limit; ++ attr.alt_ah_attr.grh.traffic_class = cmd->alt_ah.grh.traffic_class; ++ attr.alt_ah_attr.dlid = cmd->alt_ah.dlid; ++ attr.alt_ah_attr.sl = cmd->alt_ah.sl; ++ attr.alt_ah_attr.src_path_bits = cmd->alt_ah.src_path_bits; ++ attr.alt_ah_attr.static_rate = cmd->alt_ah.static_rate; ++ attr.alt_ah_attr.ah_flags = cmd->alt_ah.ah_flags; ++ attr.alt_ah_attr.port_num = cmd->alt_ah.port_num; ++ attr.pkey_index = cmd->pkey_index; ++ attr.alt_pkey_index = cmd->alt_pkey_index; ++ attr.en_sqd_async_notify = cmd->en_sqd_async_notify; ++ attr.sq_draining = cmd->sq_draining; ++ attr.max_rd_atomic = cmd->max_rd_atomic; ++ attr.max_dest_rd_atomic = cmd->max_dest_rd_atomic; ++ attr.min_rnr_timer = cmd->min_rnr_timer; ++ attr.port_num = cmd->port_num; ++ attr.timeout = cmd->timeout; ++ attr.retry_cnt = cmd->retry_cnt; ++ attr.rnr_retry = cmd->rnr_retry; ++ attr.alt_port_num = cmd->alt_port_num; ++ attr.alt_timeout = cmd->alt_timeout; ++ ++ ret = device->ib_dev->modify_qp(qp->ibqp, &attr, cmd->qp_attr_mask, &udata); ++ if (ret) { ++ print_err("ib_modify_qp returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->cap.max_send_wr = attr.cap.max_send_wr; ++ resp->cap.max_recv_wr = attr.cap.max_recv_wr; ++ resp->cap.max_send_sge = attr.cap.max_send_sge; ++ resp->cap.max_recv_sge = attr.cap.max_recv_sge; ++ resp->cap.max_inline_data = attr.cap.max_inline_data; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_query_qp(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_query_qp_cmd *cmd; ++ struct ibp_query_qp_resp *resp; ++ struct ibp_qp *qp; ++ struct ib_qp_attr qp_attr; ++ struct ib_qp_init_attr qp_init_attr; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_query_qp_cmd *) hdr; ++ qp = (struct ibp_qp *) cmd->qp; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ret = ib_query_qp(qp->ibqp, &qp_attr, cmd->qp_attr_mask, &qp_init_attr); ++ if (ret) { ++ print_err("ib_query_qp returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ resp = (struct ibp_query_qp_resp *) msg->data; ++ len += sizeof(*resp); ++ ++ resp->qp_state = qp_attr.qp_state; ++ resp->cur_qp_state = qp_attr.cur_qp_state; ++ resp->path_mtu = qp_attr.path_mtu; ++ resp->path_mig_state = qp_attr.path_mig_state; ++ resp->qkey = qp_attr.qkey; ++ resp->rq_psn = qp_attr.rq_psn; ++ resp->sq_psn = qp_attr.sq_psn; ++ resp->dest_qp_num = qp_attr.dest_qp_num; ++ resp->qp_access_flags = qp_attr.qp_access_flags; ++ ++ resp->init_cap.max_send_wr = 
qp_init_attr.cap.max_send_wr; ++ resp->init_cap.max_recv_wr = qp_init_attr.cap.max_recv_wr; ++ resp->init_cap.max_send_sge = qp_init_attr.cap.max_send_sge; ++ resp->init_cap.max_recv_sge = qp_init_attr.cap.max_recv_sge; ++ resp->init_cap.max_inline_data = qp_init_attr.cap.max_inline_data; ++ resp->init_create_flags = qp_init_attr.create_flags; ++ resp->init_sq_sig_type = qp_init_attr.sq_sig_type; ++ ++ resp->cap.max_send_wr = qp_attr.cap.max_send_wr; ++ resp->cap.max_recv_wr = qp_attr.cap.max_recv_wr; ++ resp->cap.max_send_sge = qp_attr.cap.max_send_sge; ++ resp->cap.max_recv_sge = qp_attr.cap.max_recv_sge; ++ resp->cap.max_inline_data = qp_attr.cap.max_inline_data; ++ ++ resp->ah.grh.dgid_subnet_prefix = ++ qp_attr.ah_attr.grh.dgid.global.subnet_prefix; ++ resp->ah.grh.dgid_interface_id = ++ qp_attr.ah_attr.grh.dgid.global.interface_id; ++ resp->ah.grh.flow_label = qp_attr.ah_attr.grh.flow_label; ++ resp->ah.grh.sgid_index = qp_attr.ah_attr.grh.sgid_index; ++ resp->ah.grh.hop_limit = qp_attr.ah_attr.grh.hop_limit; ++ resp->ah.grh.traffic_class = qp_attr.ah_attr.grh.traffic_class; ++ resp->ah.dlid = qp_attr.ah_attr.dlid; ++ resp->ah.sl = qp_attr.ah_attr.sl; ++ resp->ah.src_path_bits = qp_attr.ah_attr.src_path_bits; ++ resp->ah.static_rate = qp_attr.ah_attr.static_rate; ++ resp->ah.ah_flags = qp_attr.ah_attr.ah_flags; ++ resp->ah.port_num = qp_attr.ah_attr.port_num; ++ ++ resp->alt_ah.grh.dgid_subnet_prefix = ++ qp_attr.alt_ah_attr.grh.dgid.global.subnet_prefix; ++ resp->alt_ah.grh.dgid_interface_id = ++ qp_attr.alt_ah_attr.grh.dgid.global.interface_id; ++ resp->alt_ah.grh.flow_label = qp_attr.alt_ah_attr.grh.flow_label; ++ resp->alt_ah.grh.sgid_index = qp_attr.alt_ah_attr.grh.sgid_index; ++ resp->alt_ah.grh.hop_limit = qp_attr.alt_ah_attr.grh.hop_limit; ++ resp->alt_ah.grh.traffic_class = qp_attr.alt_ah_attr.grh.traffic_class; ++ resp->alt_ah.dlid = qp_attr.alt_ah_attr.dlid; ++ resp->alt_ah.sl = qp_attr.alt_ah_attr.sl; ++ resp->alt_ah.src_path_bits = qp_attr.alt_ah_attr.src_path_bits; ++ resp->alt_ah.static_rate = qp_attr.alt_ah_attr.static_rate; ++ resp->alt_ah.ah_flags = qp_attr.alt_ah_attr.ah_flags; ++ resp->alt_ah.port_num = qp_attr.alt_ah_attr.port_num; ++ ++ resp->pkey_index = qp_attr.pkey_index; ++ resp->alt_pkey_index = qp_attr.alt_pkey_index; ++ resp->en_sqd_async_notify = qp_attr.en_sqd_async_notify; ++ resp->sq_draining = qp_attr.sq_draining; ++ resp->max_rd_atomic = qp_attr.max_rd_atomic; ++ resp->max_dest_rd_atomic = qp_attr.max_dest_rd_atomic; ++ resp->min_rnr_timer = qp_attr.min_rnr_timer; ++ resp->port_num = qp_attr.port_num; ++ resp->timeout = qp_attr.timeout; ++ resp->retry_cnt = qp_attr.retry_cnt; ++ resp->rnr_retry = qp_attr.rnr_retry; ++ resp->alt_port_num = qp_attr.alt_port_num; ++ resp->alt_timeout = qp_attr.alt_timeout; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_destroy_qp(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_queued_response_msg *msg; ++ struct ibp_destroy_qp_cmd *cmd; ++ struct ib_uobject *uobj; ++ struct ibp_qp *qp; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_destroy_qp_cmd *) hdr; ++ qp = (struct ibp_qp *) cmd->qp; ++ msg = (struct ibp_queued_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ uobj = qp->ibqp->uobject; ++ ++ ret = ib_destroy_qp(qp->ibqp); ++ if (ret) { ++ 
print_err("ib_destroy_qp returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++ kfree(qp); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret); ++ return ibp_queue_response(client, msg); ++} ++ ++static void ibp_ibcq_event(struct ib_event *ibevent, void *cq_context) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_client *client; ++ struct ibp_event *event; ++ struct ib_uobject *uobj; ++ ++ event = kmalloc(sizeof(*event), GFP_ATOMIC); ++ if (!event) { ++ print_err("kalloc failed\n"); ++ return; ++ } ++ ++ uobj = (struct ib_uobject *) ibevent->element.cq->uobject; ++ ucontext = (void *) uobj->user_handle; ++ client = ucontext->client; ++ ++ event->client = client; ++ event->context = (uintptr_t) cq_context; ++ event->type = ibevent->event; ++ event->ibdev = ucontext->ibdev; ++ ++ INIT_WORK(&event->work, ibp_async_event); ++ queue_work(client->workqueue, &event->work); ++} ++ ++static void ibp_cq_comp(struct work_struct *work) ++{ ++ struct ibp_comp *comp; ++ struct ibp_cq_comp_msg msg; ++ ++ comp = container_of(work, struct ibp_comp, work); ++ ++ IBP_INIT_MSG(NULL, &msg, sizeof(msg), CQ_COMP); ++ ++ msg.data.cq_context = (uintptr_t) comp->cq_context; ++ ++ ibp_send(comp->client->ep, &msg, sizeof(msg)); ++ ++ ibp_add_to_stack(c_stack, (void *) comp); ++} ++ ++static void ibp_ibcq_comp(struct ib_cq *ibcq, void *cq_context) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_client *client; ++ struct ibp_comp *comp; ++ ++ ucontext = (struct ibp_ucontext *) ibcq->uobject->user_handle; ++ ++ if (ucontext->ibucontext->closing) { ++ print_dbg("ignoring cq completion, connection closing\n"); ++ return; ++ } ++ ++ comp = (struct ibp_comp *) ++ ibp_pull_from_stack(c_stack, sizeof(*comp), GFP_ATOMIC); ++ if (!comp) { ++ print_err("kalloc failed\n"); ++ return; ++ } ++ ++ client = ucontext->client; ++ ++ comp->client = client; ++ comp->cq_context = cq_context; ++ ++ INIT_WORK(&comp->work, ibp_cq_comp); ++ queue_work(client->workqueue, &comp->work); ++} ++ ++static int ibp_cmd_create_cq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_create_cq_cmd *cmd; ++ struct ibp_create_cq_resp *resp; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_udata udata; ++ struct ib_cq *cq; ++ size_t len; ++ size_t outlen; ++ int ret = 0; ++#ifdef MOFED ++ struct ib_cq_init_attr attr; ++#endif ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_create_cq_cmd *) hdr; ++ ucontext = (struct ibp_ucontext *) cmd->ucontext; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_create_cq_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ goto send_resp; ++ } ++ ++#ifdef MOFED ++ memset(&attr, 0, sizeof(attr)); ++ attr.cqe = cmd->cqe; ++ attr.comp_vector = cmd->vector; ++ ++ cq = device->ib_dev->create_cq(device->ib_dev, &attr, ++ ucontext->ibucontext, &udata); ++#else ++ cq = device->ib_dev->create_cq(device->ib_dev, (int) cmd->cqe, ++ (int) cmd->vector, ++ ucontext->ibucontext, &udata); ++#endif ++ if (IS_ERR(cq)) { ++ ret = PTR_ERR(cq); ++ print_err("ib_create_cq returned %d\n", 
ret); ++ /* ++ * Clear uobj's user_handle as destroy_uobj tries to list_del ++ * uobj from the list and uobj has NOT been added yet ++ */ ++ uobj->user_handle = 0; ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ } ++ ++ cq->device = device->ib_dev; ++ cq->event_handler = ibp_ibcq_event; ++ cq->comp_handler = ibp_ibcq_comp; ++ cq->cq_context = (void *) cmd->cq_context; ++ atomic_set(&cq->usecnt, 0); ++ ++ cq->uobject = uobj; ++ uobj->object = cq; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->cq_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->cq = (uintptr_t)cq; ++ resp->cqe = cq->cqe; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_destroy_cq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_queued_response_msg *msg; ++ struct ibp_destroy_cq_cmd *cmd; ++ struct ib_uobject *uobj; ++ struct ib_cq *cq; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_destroy_cq_cmd *) hdr; ++ cq = (struct ib_cq *) cmd->cq; ++ msg = (struct ibp_queued_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ uobj = cq->uobject; ++ ++ ret = ib_destroy_cq(cq); ++ if (unlikely(ret == -EBUSY)) { ++ msleep(100); ++ ret = ib_destroy_cq(cq); ++ } ++ if (ret) { ++ print_err("ib_destroy_cq returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, QUEUED_RESPONSE, hdr->request, ret); ++ return ibp_queue_response(client, msg); ++} ++ ++static int ibp_cmd_resize_cq(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_resize_cq_cmd *cmd; ++ struct ibp_resize_cq_resp *resp; ++ struct ib_cq *cq; ++ struct ib_udata udata; ++ size_t len; ++ size_t outlen; ++ int ret; ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_resize_cq_cmd *) hdr; ++ cq = (struct ib_cq *) cmd->cq; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_resize_cq_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ ret = device->ib_dev->resize_cq ? 
++ device->ib_dev->resize_cq(cq, (int) cmd->cqe, &udata) : -ENOSYS; ++ if (ret) { ++ print_err("ib_resize_cq returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->cqe = cq->cqe; ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_reg_user_mr(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_reg_user_mr_cmd *cmd; ++ struct ibp_reg_user_mr_resp *resp; ++ struct ibp_mr *mr; ++ struct ibp_ucontext *ucontext; ++ struct ib_uobject *uobj; ++ struct ib_udata udata; ++ struct ib_pd *pd; ++ size_t len; ++ size_t outlen; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_reg_user_mr_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ resp = (struct ibp_reg_user_mr_resp *) msg->data; ++ len = hdr->length - sizeof(*cmd); ++ outlen = MAX_MSG_SIZE - sizeof(*msg) - sizeof(*resp); ++ ++ INIT_UDATA(&udata, cmd->data, resp->data, len, outlen); ++ ++ len = sizeof(*msg); ++ ++ mr = kzalloc(sizeof(*mr), GFP_KERNEL); ++ if (!mr) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ ++ pd = (struct ib_pd *) cmd->pd; ++ ++ ucontext = (struct ibp_ucontext *) pd->uobject->user_handle; ++ ++ mr->reg = ibp_reg_buf(ucontext, cmd->hca_va, cmd->scif_addr, ++ cmd->length, cmd->offset, cmd->access); ++ if (IS_ERR(mr->reg)) { ++ ret = PTR_ERR(mr->reg); ++ print_err("ibp_reg_buf returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ uobj = ibp_create_uobj(ucontext); ++ if (IS_ERR(uobj)) { ++ ret = PTR_ERR(uobj); ++ print_err("ibp_create_uobj returned %d\n", ret); ++ kref_put(&mr->reg->ref, ibp_dereg_buf); ++ goto send_resp; ++ } ++ ++#ifdef MOFED ++ mr->ibmr = pd->device->reg_user_mr(pd, cmd->hca_va, cmd->length, ++ cmd->hca_va, cmd->access, &udata, 0); ++#else ++ mr->ibmr = pd->device->reg_user_mr(pd, cmd->hca_va, cmd->length, ++ cmd->hca_va, cmd->access, &udata); ++#endif ++ if (IS_ERR(mr->ibmr)) { ++ ret = PTR_ERR(mr->ibmr); ++ print_err("ib_reg_user_mr returned %d\n", ret); ++ kref_put(&mr->reg->ref, ibp_dereg_buf); ++ ibp_destroy_uobj(uobj); ++ goto send_resp; ++ } ++ ++ mr->ibmr->pd = pd; ++ mr->ibmr->device = pd->device; ++ atomic_inc(&pd->usecnt); ++ atomic_set(&mr->ibmr->usecnt, 0); ++ ++ mr->ibmr->uobject = uobj; ++ uobj->object = mr; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&uobj->list, &ucontext->ibucontext->mr_list); ++ mutex_unlock(&ucontext->mutex); ++ ++ len += sizeof(*resp); ++ len += outlen - udata.outlen; /* add driver private data */ ++ ++ resp->mr = (uintptr_t) mr; ++ resp->lkey = mr->ibmr->lkey; ++ resp->rkey = mr->ibmr->rkey; ++ ++send_resp: ++ if (ret) ++ kfree(mr); ++ ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_dereg_mr(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_dereg_mr_cmd *cmd; ++ struct ibp_mr *mr; ++ struct ib_uobject *uobj; ++ size_t len; ++ int ret; ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_dereg_mr_cmd *) hdr; ++ mr = (struct ibp_mr *) cmd->mr; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ if (IS_NULL_OR_ERR(mr)) 
{ ++ print_err("Invalid mr %p\n", mr); ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ uobj = mr->ibmr->uobject; ++ ++ ret = ib_dereg_mr(mr->ibmr); ++ if (unlikely(ret == -EBUSY)) { ++ msleep(100); ++ ret = ib_dereg_mr(mr->ibmr); ++ } ++ if (ret) { ++ print_err("ib_dereg_mr returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ ibp_destroy_uobj(uobj); ++ ++ if (mr->reg) ++ kref_put(&mr->reg->ref, ibp_dereg_buf); ++ ++ kfree(mr); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_attach_mcast(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_attach_mcast_cmd *cmd; ++ struct ibp_mcast_entry *mcast; ++ struct ibp_ucontext *ucontext; ++ struct ibp_qp *qp; ++ union ib_gid gid; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_attach_mcast_cmd *) hdr; ++ qp = (struct ibp_qp *) cmd->qp; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ucontext = (struct ibp_ucontext *) qp->ibqp->uobject->user_handle; ++ ++ mcast = kzalloc(sizeof *mcast, GFP_KERNEL); ++ if (!mcast) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ ++ gid.global.subnet_prefix = cmd->subnet_prefix; ++ gid.global.interface_id = cmd->interface_id; ++ ++ ret = ib_attach_mcast(qp->ibqp, &gid, cmd->lid); ++ if (ret) { ++ print_err("ib_attach_mcast returned %d\n", ret); ++ kfree(mcast); ++ goto send_resp; ++ } ++ ++ mcast->lid = cmd->lid; ++ mcast->gid.global.subnet_prefix = cmd->subnet_prefix; ++ mcast->gid.global.interface_id = cmd->interface_id; ++ ++ mutex_lock(&ucontext->mutex); ++ list_add_tail(&mcast->list, &qp->mcast); ++ mutex_unlock(&ucontext->mutex); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static int ibp_cmd_detach_mcast(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) ++{ ++ struct ibp_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_detach_mcast_cmd *cmd; ++ struct ibp_mcast_entry *mcast; ++ struct ibp_ucontext *ucontext; ++ struct ibp_qp *qp; ++ union ib_gid gid; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ device = (struct ibp_device *) hdr->device; ++ cmd = (struct ibp_detach_mcast_cmd *) hdr; ++ qp = (struct ibp_qp *) cmd->qp; ++ msg = (struct ibp_verb_response_msg *) tx_buf; ++ len = sizeof(*msg); ++ ++ ucontext = (struct ibp_ucontext *) qp->ibqp->uobject->user_handle; ++ ++ gid.global.subnet_prefix = cmd->subnet_prefix; ++ gid.global.interface_id = cmd->interface_id; ++ ++ ret = ib_detach_mcast(qp->ibqp, &gid, cmd->lid); ++ if (ret) { ++ print_err("ib_detach_mcast returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ mutex_lock(&ucontext->mutex); ++ list_for_each_entry(mcast, &qp->mcast, list) ++ if (cmd->lid == mcast->lid && ++ !memcmp(&gid , mcast->gid.raw, sizeof mcast->gid.raw)) { ++ list_del(&mcast->list); ++ kfree(mcast); ++ break; ++ } ++ mutex_unlock(&ucontext->mutex); ++ ++send_resp: ++ IBP_INIT_RESP(device, msg, len, VERB_RESPONSE, hdr->request, ret); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static void ibp_detach_mcast(struct ibp_qp *qp) ++{ ++ struct ibp_mcast_entry *mcast, *tmp; ++ ++ list_for_each_entry_safe(mcast, tmp, &qp->mcast, list) { ++ ib_detach_mcast(qp->ibqp, &mcast->gid, mcast->lid); ++ 
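/* Detach issued; drop the local bookkeeping entry. */ ++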
list_del(&mcast->list); ++ kfree(mcast); ++ } ++} ++ ++static void ibp_destroy_ucontext(struct ibp_ucontext *ucontext) ++{ ++ struct ib_ucontext *ibuctx; ++ struct ib_uobject *uobj; ++ struct ib_uobject *tmp; ++ struct ibp_mmap *mmap; ++ struct ibp_mmap *tmp_mmap; ++ ++ ibuctx = ucontext->ibucontext; ++ if (!ibuctx) ++ goto out; ++ ++ ibuctx->closing = 1; ++ ++ synchronize_sched(); ++ ++ down_write(&list_rwsem); ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->ah_list, list) { ++ struct ib_ah *ibah = uobj->object; ++ ib_destroy_ah(ibah); ++ ibp_destroy_uobj(uobj); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->qp_list, list) { ++ struct ibp_qp *qp = uobj->object; ++ ibp_detach_mcast(qp); ++ ib_destroy_qp(qp->ibqp); ++ ibp_destroy_uobj(uobj); ++ kfree(qp); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->cq_list, list) { ++ struct ib_cq *ibcq = uobj->object; ++ ib_destroy_cq(ibcq); ++ ibp_destroy_uobj(uobj); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->srq_list, list) { ++ struct ib_srq *ibsrq = uobj->object; ++ ib_destroy_srq(ibsrq); ++ ibp_destroy_uobj(uobj); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->mr_list, list) { ++ struct ibp_mr *mr = uobj->object; ++ ib_dereg_mr(mr->ibmr); ++ ibp_destroy_uobj(uobj); ++ kref_put(&mr->reg->ref, ibp_dereg_buf); ++ kfree(mr); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->xrcd_list, list) { ++ struct ib_xrcd *ibxrcd = uobj->object; ++ ib_dealloc_xrcd(ibxrcd); ++ ibp_destroy_uobj(uobj); ++ } ++ ++ list_for_each_entry_safe(uobj, tmp, &ibuctx->pd_list, list) { ++ struct ib_pd *ibpd = uobj->object; ++ ib_dealloc_pd(ibpd); ++ ibp_destroy_uobj(uobj); ++ } ++ ++ up_write(&list_rwsem); ++ ++ synchronize_sched(); ++ ++ ibuctx->device->dealloc_ucontext(ibuctx); ++out: ++ if (ucontext->ibdev) ++ ib_unregister_event_handler(&ucontext->event_handler); ++ ++ list_for_each_entry_safe(mmap, tmp_mmap, &ucontext->mmap_list, list) { ++ ibp_scif_unregister(ucontext->client, mmap); ++ ++ if (!IS_NULL_OR_ERR(current) && !IS_NULL_OR_ERR(current->mm)) { ++ MUNMAP(current->mm, mmap->vaddr, mmap->len); ++ } ++ kfree(mmap); ++ } ++ ++ while (!RB_EMPTY_ROOT(&ucontext->reg_tree)) { ++ struct ibp_reg *reg; ++ reg = rb_entry(ucontext->reg_tree.rb_node, struct ibp_reg, ++ node); ++ kref_put(®->ref, ibp_dereg_buf); ++ } ++ ++ ibp_put_device(ucontext->device); ++ fput(ucontext->filp); ++ kfree(ucontext); ++} ++ ++void ibp_cleanup_ucontext(struct list_head *ucontext_list) ++{ ++ struct ibp_ucontext *ucontext; ++ struct ibp_ucontext *next; ++ ++ list_for_each_entry_safe(ucontext, next, ucontext_list, list) ++ ibp_destroy_ucontext(ucontext); ++} ++ ++static int (*ibp_msg_table[])(struct ibp_client *client, ++ struct ibp_msg_header *hdr, void *tx_buf) = { ++ [IBP_VERB_GET_PROTOCOL_STATS] = ibp_cmd_not_supported, ++ [IBP_VERB_QUERY_DEVICE] = ibp_cmd_query_device, ++ [IBP_VERB_QUERY_PORT] = ibp_cmd_query_port, ++ [IBP_VERB_GET_LINK_LAYER] = ibp_cmd_not_supported, ++ [IBP_VERB_QUERY_GID] = ibp_cmd_query_gid, ++ [IBP_VERB_QUERY_PKEY] = ibp_cmd_query_pkey, ++ [IBP_VERB_MODIFY_DEVICE] = ibp_cmd_not_supported, ++ [IBP_VERB_MODIFY_PORT] = ibp_cmd_not_supported, ++ [IBP_VERB_ALLOC_UCONTEXT] = ibp_cmd_alloc_ucontext, ++ [IBP_VERB_DEALLOC_UCONTEXT] = ibp_cmd_dealloc_ucontext, ++ [IBP_VERB_REG_BUF] = ibp_cmd_reg_buf, ++ [IBP_VERB_DEREG_BUF] = ibp_cmd_dereg_buf, ++ [IBP_VERB_MMAP] = ibp_cmd_mmap, ++ [IBP_VERB_UNMMAP] = ibp_cmd_unmmap, ++ [IBP_VERB_ALLOC_PD] = ibp_cmd_alloc_pd, ++ [IBP_VERB_DEALLOC_PD] = ibp_cmd_dealloc_pd, ++ [IBP_VERB_CREATE_AH] = 
ibp_cmd_create_ah, ++ [IBP_VERB_MODIFY_AH] = ibp_cmd_not_supported, ++ [IBP_VERB_QUERY_AH] = ibp_cmd_query_ah, ++ [IBP_VERB_DESTROY_AH] = ibp_cmd_destroy_ah, ++ [IBP_VERB_CREATE_SRQ] = ibp_cmd_create_srq, ++ [IBP_VERB_MODIFY_SRQ] = ibp_cmd_modify_srq, ++ [IBP_VERB_QUERY_SRQ] = ibp_cmd_query_srq, ++ [IBP_VERB_DESTROY_SRQ] = ibp_cmd_destroy_srq, ++ [IBP_VERB_POST_SRQ_RECV] = ibp_cmd_not_supported, ++ [IBP_VERB_CREATE_QP] = ibp_cmd_create_qp, ++ [IBP_VERB_MODIFY_QP] = ibp_cmd_modify_qp, ++ [IBP_VERB_QUERY_QP] = ibp_cmd_query_qp, ++ [IBP_VERB_DESTROY_QP] = ibp_cmd_destroy_qp, ++ [IBP_VERB_POST_SEND] = ibp_cmd_not_supported, ++ [IBP_VERB_POST_RECV] = ibp_cmd_not_supported, ++ [IBP_VERB_CREATE_CQ] = ibp_cmd_create_cq, ++ [IBP_VERB_MODIFY_CQ] = ibp_cmd_not_supported, ++ [IBP_VERB_DESTROY_CQ] = ibp_cmd_destroy_cq, ++ [IBP_VERB_RESIZE_CQ] = ibp_cmd_resize_cq, ++ [IBP_VERB_POLL_CQ] = ibp_cmd_not_supported, ++ [IBP_VERB_PEEK_CQ] = ibp_cmd_not_supported, ++ [IBP_VERB_REQ_NOTIFY_CQ] = ibp_cmd_not_supported, ++ [IBP_VERB_REQ_NCOMP_NOTIF] = ibp_cmd_not_supported, ++ [IBP_VERB_GET_DMA_MR] = ibp_cmd_not_supported, ++ [IBP_VERB_REG_PHYS_MR] = ibp_cmd_not_supported, ++ [IBP_VERB_REG_USER_MR] = ibp_cmd_reg_user_mr, ++ [IBP_VERB_QUERY_MR] = ibp_cmd_not_supported, ++ [IBP_VERB_DEREG_MR] = ibp_cmd_dereg_mr, ++ [IBP_VERB_ALLOC_FAST_REG_MR] = ibp_cmd_not_supported, ++ [IBP_VERB_ALLOC_FAST_REG_PAGE_LIST] = ibp_cmd_not_supported, ++ [IBP_VERB_FREE_FAST_REG_PAGE_LIST] = ibp_cmd_not_supported, ++ [IBP_VERB_REREG_PHYS_MR] = ibp_cmd_not_supported, ++ [IBP_VERB_ALLOC_MW] = ibp_cmd_not_supported, ++ [IBP_VERB_BIND_MW] = ibp_cmd_not_supported, ++ [IBP_VERB_DEALLOC_MW] = ibp_cmd_not_supported, ++ [IBP_VERB_ALLOC_FMR] = ibp_cmd_not_supported, ++ [IBP_VERB_MAP_PHYS_FMR] = ibp_cmd_not_supported, ++ [IBP_VERB_UNMAP_FMR] = ibp_cmd_not_supported, ++ [IBP_VERB_DEALLOC_FMR] = ibp_cmd_not_supported, ++ [IBP_VERB_ATTACH_MCAST] = ibp_cmd_attach_mcast, ++ [IBP_VERB_DETACH_MCAST] = ibp_cmd_detach_mcast, ++ [IBP_VERB_PROCESS_MAD] = ibp_cmd_not_supported, ++ [IBP_VERB_ALLOC_XRCD] = ibp_cmd_not_supported, ++ [IBP_VERB_DEALLOC_XRCD] = ibp_cmd_not_supported, ++}; ++ ++int ibp_init() ++{ ++ a_stack = ibp_init_stack(); ++ c_stack = ibp_init_stack(); ++ o_stack = ibp_init_stack(); ++ ++ if (!a_stack || !c_stack || !o_stack) { ++ print_err("stack allocation failed\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++void ibp_cleanup() ++{ ++ ibp_clear_stack(a_stack); ++ ibp_clear_stack(c_stack); ++ ibp_clear_stack(o_stack); ++} ++ ++int ibp_process_recvs(struct ibp_client *client, void *rx_buf, void *tx_buf) ++{ ++ struct ibp_msg_header *hdr; ++ int ret; ++ ++ hdr = (struct ibp_msg_header *) rx_buf; ++ ++ for (;;) { ++ wait_event_interruptible(client->rx_wait_queue, ++ !atomic_xchg(&client->rx_in_process, ++ 1)); ++ ++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr)); ++ if (ret) ++ goto err; ++ ++ if (hdr->length > MAX_MSG_SIZE) { ++ print_err("message too large, len %u max %lu\n", ++ hdr->length, MAX_MSG_SIZE); ++ ret = -EMSGSIZE; ++ goto err; ++ } ++ ++ ret = ibp_recv(client->ep, hdr->data, ++ hdr->length - sizeof(*hdr)); ++ if (ret) ++ goto err; ++ ++ atomic_set(&client->rx_in_process, 0); ++ wake_up_interruptible(&client->rx_wait_queue); ++ ++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) || ++ !ibp_msg_table[hdr->opcode]) { ++ ibp_cmd_bad_request(client, hdr, tx_buf); ++ continue; ++ } ++ ++ ret = ibp_msg_table[hdr->opcode](client, hdr, tx_buf); ++ if (ret) ++ goto err; ++ } ++ ++ goto out; ++err: ++ atomic_set(&client->rx_in_process, 0); 
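++ /* on error, clear the rx_in_process flag and wake any waiting receiver before returning */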
++ wake_up_interruptible(&client->rx_wait_queue); ++ ++out: ++ return ret; ++} +diff -urN a6/drivers/infiniband/ibp/drv/stack.c a7/drivers/infiniband/ibp/drv/stack.c +--- a6/drivers/infiniband/ibp/drv/stack.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/stack.c 2015-02-23 10:01:30.292769309 -0800 +@@ -0,0 +1,102 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include "common.h" ++#include "stack.h" ++ ++static DEFINE_SPINLOCK(stack_lock); ++ ++struct ibp_stack *ibp_init_stack(void) ++{ ++ struct ibp_stack *s; ++ ++ s = kzalloc(sizeof(struct ibp_stack), GFP_KERNEL); ++ if (s) ++ s->top_pointer = &s->base[0]; ++ ++ return s; ++} ++ ++void ibp_clear_stack(struct ibp_stack *s) ++{ ++ while (s->top_pointer != s->base) { ++ s->top_pointer--; ++ kfree(*s->top_pointer); ++ } ++ kfree(s); ++} ++ ++void ibp_add_to_stack(struct ibp_stack *s, void *p) ++{ ++ spin_lock_irq(&stack_lock); ++ ++ if (unlikely(++s->sample_cnt == STACK_GC_SAMPLE)) { ++ s->sample_cnt = 0; ++ if (unlikely(++s->gc_cnt == STACK_GC_RATE)) { ++ s->gc_cnt = 0; ++ while (s->current_count > s->high_water_mark) { ++ s->top_pointer--; ++ s->current_count--; ++ kfree(*s->top_pointer); ++ } ++ } else if (s->high_water_mark < s->current_count) ++ s->high_water_mark = s->current_count; ++ } ++ ++ if (likely(s->current_count < MAX_STACK)) { ++ *s->top_pointer++ = p; ++ s->current_count++; ++ } else ++ kfree(p); ++ ++ spin_unlock_irq(&stack_lock); ++} ++ ++void *ibp_pull_from_stack(struct ibp_stack *s, size_t size, int gfp_mask) ++{ ++ void *p; ++ unsigned long flag; ++ ++ spin_lock_irqsave(&stack_lock, flag); ++ ++ if (s->top_pointer == s->base) ++ p = kmalloc(size, gfp_mask); ++ else { ++ s->current_count--; ++ s->top_pointer--; ++ p = *s->top_pointer; ++ } ++ ++ spin_unlock_irqrestore(&stack_lock, flag); ++ ++ return p; ++} +diff -urN a6/drivers/infiniband/ibp/drv/stack.h a7/drivers/infiniband/ibp/drv/stack.h +--- a6/drivers/infiniband/ibp/drv/stack.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/drv/stack.h 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. 
All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef _IBP_STACK_H_ ++#define _IBP_STACK_H_ ++ ++#define STACK_GC_SAMPLE 5 ++#define STACK_GC_RATE 10 ++#define MAX_STACK 128 ++ ++struct ibp_stack { ++ int current_count; ++ int high_water_mark; ++ int gc_cnt; ++ int sample_cnt; ++ void **top_pointer; ++ void *base[MAX_STACK+1]; ++}; ++ ++struct ibp_stack *ibp_init_stack(void); ++ ++void ibp_add_to_stack(struct ibp_stack *s, void *p); ++ ++void *ibp_pull_from_stack(struct ibp_stack *s, size_t size, int gfp_mask); ++ ++void ibp_clear_stack(struct ibp_stack *s); ++ ++#endif /* _IBP_STACK_H_ */ +diff -urN a6/drivers/infiniband/ibp/Kconfig a7/drivers/infiniband/ibp/Kconfig +--- a6/drivers/infiniband/ibp/Kconfig 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/Kconfig 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,16 @@ ++config IBP_SERVER ++ tristate "CCL Direct IB Server drivers" ++ ---help--- ++ Server drivers for CCL Direct including server proxies for ++ hw drivers, and core drivers ib_sa and ib_cm. ++ Also includes is a kernel mode test module ++ ++ To compile this driver as a module, choose M here. ++ If unsure, say N. ++ ++config IBP_DEBUG ++ bool "CCL Direct debugging" ++ depends on IBP_SERVER ++ default y ++ ---help--- ++ This option causes debug code to be compiled into the CCL Direct drivers. +diff -urN a6/drivers/infiniband/ibp/Makefile a7/drivers/infiniband/ibp/Makefile +--- a6/drivers/infiniband/ibp/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/Makefile 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_IBP_SERVER) += drv/ ++obj-$(CONFIG_IBP_SERVER) += cm/ ++obj-$(CONFIG_IBP_SERVER) += sa/ +diff -urN a6/drivers/infiniband/ibp/sa/common.h a7/drivers/infiniband/ibp/sa/common.h +--- a6/drivers/infiniband/ibp/sa/common.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/common.h 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,108 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. 
You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef COMMON_H ++#define COMMON_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define DRV_DESC "CCL Direct SA " DRV_ROLE ++#define DRV_VERSION "1.0" ++#define DRV_BASE "ibp_sa" ++#define PFX DRV_BASE "_" ++#define DRV_PFX DRV_NAME ": " ++ ++#define DRV_COPYRIGHT "Copyright (c) 2011-2013 Intel Corporation" ++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION "\n" DRV_COPYRIGHT "\n" ++ ++#define MODULE_PARAM(name, var, type, value, desc) \ ++ type var = value; \ ++ module_param_named(name, var, type, 0644); \ ++ MODULE_PARM_DESC(name, desc) ++ ++#ifdef IBP_DEBUG ++extern int debug_level; ++#endif ++ ++enum { ++ IBP_DEBUG_NONE, ++ IBP_DEBUG_TARGETED, ++ IBP_DEBUG_VERBOSE, ++}; ++ ++#define _PRINTK(l, f, arg...) \ ++ printk(l DRV_PFX "%s(%d) " f, __func__, __LINE__, ##arg) ++ ++#ifdef IBP_DEBUG ++#define PRINTK(dbg, l, f, arg...) \ ++ do { \ ++ if (debug_level >= dbg) \ ++ printk(l DRV_PFX "%s(%d) " f, \ ++ __func__, __LINE__, ##arg); \ ++ } while (0) ++#else ++#define PRINTK(dbg, l, f, arg...) do { } while (0) ++#endif ++ ++#define print_dbg(f, arg...) PRINTK(IBP_DEBUG_TARGETED, KERN_DEBUG, f, ##arg) ++#define print_err(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#define print_info(f, arg...) pr_info(f, ##arg) ++ ++#if 0 ++#define FORCED_FUNCTION_TRACING ++#endif ++ ++#ifdef FORCED_FUNCTION_TRACING ++#define print_trace(f, arg...) _PRINTK(KERN_ERR, f, ##arg) ++#else ++#define print_trace(f, arg...) PRINTK(IBP_DEBUG_VERBOSE, KERN_ERR, f, ##arg) ++#endif ++ ++#ifndef IBP_SA_PORT /* unique scif port for this service */ ++#define IBP_SA_PORT SCIF_OFED_PORT_4 ++#endif ++ ++#define IS_NULL_OR_ERR(p) (!(p) || IS_ERR_VALUE((unsigned long)p)) ++ ++int ibp_send(scif_epd_t ep, void *buf, size_t len); ++int ibp_recv(scif_epd_t ep, void *buf, size_t len); ++ ++#endif /* COMMON_H */ +diff -urN a6/drivers/infiniband/ibp/sa/ibp-abi.h a7/drivers/infiniband/ibp/sa/ibp-abi.h +--- a6/drivers/infiniband/ibp/sa/ibp-abi.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/ibp-abi.h 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,101 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. 
++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef IBP_ABI_H ++#define IBP_ABI_H ++ ++#include ++#include ++#include ++ ++/* Increment this value if any changes break compatibility. */ ++#define IBP_CM_ABI_VERSION 1 ++#define MAX_MSG_SIZE PAGE_SIZE ++ ++/* Client to server message enums. */ ++enum { ++ /* have callback */ ++ IBP_SA_PATH_REC_GET, ++ IBP_SA_JOIN_MCAST, ++ ++ /* no callback */ ++ IBP_SA_FREE_MCAST, ++ IBP_SA_GET_MCMEMBER_REC, ++ IBP_SA_REGISTER_CLIENT, ++ IBP_SA_UNREGISTER_CLIENT, ++ IBP_SA_CANCEL_QUERY, ++ IBP_INIT_AH_FROM_PATH, ++ IBP_INIT_AH_FROM_MCMEMBER, ++#if 0 ++ /* not used or local to client */ ++ IBP_SA_SERVICE_REC_QUERY, ++ IBP_SA_UNPACK_PATH, ++#endif ++}; ++ ++/* Server to client message enums. */ ++enum { ++ IBP_CALLBACK, ++ IBP_RESPONSE, ++}; ++ ++enum { ++ PATH_REC_GET_CB, ++ JOIN_MCAST_CB, ++}; ++ ++/* ++ * Make sure that all structs defined in this file are laid out to pack ++ * the same way on different architectures to avoid incompatibility. ++ * ++ * Specifically: ++ * - Do not use pointer types -- pass pointers in a u64 instead. ++ * - Make sure that any structure larger than 4 bytes is padded ++ * to a multiple of 8 bytes; otherwise the structure size may ++ * be different between architectures. ++ */ ++ ++struct ibp_msg_header { /* present in all messages */ ++ u32 opcode; ++ u32 length; ++ u32 status; ++ u32 reserved; ++ u64 request; ++ u64 data[0]; ++}; ++ ++struct ibp_verb_response_msg { ++ struct ibp_msg_header header; ++ u64 data[0]; ++}; ++ ++#endif /* IBP_ABI_H */ +diff -urN a6/drivers/infiniband/ibp/sa/ibp_exports.h a7/drivers/infiniband/ibp/sa/ibp_exports.h +--- a6/drivers/infiniband/ibp/sa/ibp_exports.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/ibp_exports.h 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. 
You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef IBP_EXPORTS_H ++#define IBP_EXPORTS_H ++ ++#include ++ ++/* ++ ibp_resolve_ib_device - Return the host ib_device handle ++ @ibdev:Card IB device ++ ++ Upper level drivers may require the host ib_device handle associated ++ with the card ib_device. This routine resolves the card ib_device to ++ the cooresponding host ib_device handle. A value of 0 is returned if ++ no match was found. ++*/ ++u64 ibp_resolve_ib_device(struct ib_device *ibdev); ++ ++#endif /* IBP_EXPORTS_H */ +diff -urN a6/drivers/infiniband/ibp/sa/Makefile a7/drivers/infiniband/ibp/sa/Makefile +--- a6/drivers/infiniband/ibp/sa/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/Makefile 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,21 @@ ++KDIR ?= /lib/modules/`uname -r`/build ++ ++obj-$(CONFIG_IBP_SERVER) += ibp_sa_server.o ++ ++ccflags-$(CONFIG_IBP_DEBUG) += -g -DIBP_DEBUG ++ ++ibp_sa_server-y := server.o \ ++ server_msg.o \ ++ sa_server_msg.o ++ ++default: ++ $(MAKE) -C $(KDIR) M=`pwd` ++ ++modules_install: ++ $(MAKE) -C $(KDIR) M=`pwd` modules_install ++ ++clean: ++ rm -rf *.ko *.o .*.ko.cmd .*.o.cmd *.mod.c Module.* modules.order .tmp_versions ++ ++unix: ++ dos2unix *.[ch] Kconfig Makefile +diff -urN a6/drivers/infiniband/ibp/sa/sa_ibp_abi.h a7/drivers/infiniband/ibp/sa/sa_ibp_abi.h +--- a6/drivers/infiniband/ibp/sa/sa_ibp_abi.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/sa_ibp_abi.h 2015-02-23 10:01:30.293769309 -0800 +@@ -0,0 +1,251 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. 
++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#ifndef SA_IBP_ABI_H ++#define SA_IBP_ABI_H ++ ++#include ++#include ++#include ++ ++/* Increment this value if any changes break compatibility. */ ++#define IBP_SA_ABI_VERSION 1 ++ ++/* ++ * Make sure that all structs defined in this file are laid out to pack ++ * the same way on different architectures to avoid incompatibility. ++ * ++ * Specifically: ++ * - Do not use pointer types -- pass pointers in a u64 instead. ++ * - Make sure that any structure larger than 4 bytes is padded ++ * to a multiple of 8 bytes; otherwise the structure size may ++ * be different between architectures. ++ */ ++ ++struct cb_header { ++ u64 cb_type; ++ u64 status; ++ u64 ibp_client; ++}; ++ ++struct ibp_sa_path_rec { ++ __be64 service_id; ++ u64 dgid_prefix; ++ u64 dgid_id; ++ u64 sgid_prefix; ++ u64 sgid_id; ++ __be16 dlid; ++ __be16 slid; ++ u32 raw_traffic; ++ __be32 flow_label; ++ u8 hop_limit; ++ u8 traffic_class; ++ u32 reversible; ++ u8 numb_path; ++ __be16 pkey; ++ __be16 qos_class; ++ u8 sl; ++ u8 mtu_selector; ++ u8 mtu; ++ u8 rate_selector; ++ u8 rate; ++ u8 packet_life_time_selector; ++ u8 packet_life_time; ++ u8 preference; ++}; ++ ++struct path_rec_data { ++ u64 entry; ++ u64 query; ++ struct ibp_sa_path_rec resp; ++ u8 reserved[1]; ++}; ++ ++struct ibp_sa_mcmember_rec { ++ u64 mgid_prefix; ++ u64 mgid_id; ++ u64 port_gid_prefix; ++ u64 port_gid_id; ++ __be32 qkey; ++ __be16 mlid; ++ u8 mtu_selector; ++ u8 mtu; ++ u8 traffic_class; ++ __be16 pkey; ++ u8 rate_selector; ++ u8 rate; ++ u8 packet_life_time_selector; ++ u8 packet_life_time; ++ u8 sl; ++ __be32 flow_label; ++ u8 hop_limit; ++ u8 scope; ++ u8 join_state; ++ u64 proxy_join; ++ u8 reserved[1]; ++}; ++ ++struct mc_join_data { ++ u64 mcentry; ++ u64 ibp_mcast; ++ struct ibp_sa_mcmember_rec rec; ++}; ++ ++struct callback_msg { ++ struct cb_header header; ++ union { ++ struct path_rec_data path_rec; ++ struct mc_join_data mc_join; ++ } u; ++}; ++ ++struct ibp_callback_msg { ++ struct ibp_msg_header header; ++ u8 data[0]; ++}; ++ ++struct ibp_sa_path_rec_get_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_client; ++ u64 entry; ++ u64 query; ++ u64 device; ++ u64 port_num; ++ u64 comp_mask; ++ u64 timeout_ms; ++ u64 gfp_mask; ++ struct ibp_sa_path_rec rec; ++}; ++ ++struct ibp_sa_path_rec_get_resp { ++ u64 sa_query; ++ u64 query_id; ++}; ++ ++struct ibp_sa_register_client_cmd { ++ struct ibp_msg_header header; ++}; ++ ++struct ibp_sa_register_client_resp { ++ u64 ibp_client; ++}; ++ ++struct ibp_sa_unregister_client_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_client; ++}; ++ ++struct ibp_sa_cancel_query_cmd { ++ struct ibp_msg_header header; ++ u64 id; ++ u64 client; ++}; ++ ++struct ibp_init_ah_from_path_cmd { ++ struct ibp_msg_header header; ++ u64 device; ++ u8 port_num; ++ struct ibp_sa_path_rec 
rec; ++}; ++ ++struct ibp_ah_attr { ++ u64 dgid_prefix; ++ u64 dgid_id; ++ u32 flow_label; ++ u8 sgid_index; ++ u8 hop_limit; ++ u8 traffic_class; ++ u16 dlid; ++ u8 sl; ++ u8 src_path_bits; ++ u8 static_rate; ++ u8 ah_flags; ++ u8 port_num; ++}; ++struct ibp_init_ah_from_path_resp { ++ struct ibp_ah_attr attr; ++}; ++ ++struct ibp_init_ah_from_mcmember_cmd { ++ struct ibp_msg_header header; ++ u64 device; ++ u8 port_num; ++ struct ib_sa_mcmember_rec rec; ++}; ++ ++struct ibp_init_ah_from_mcmember_resp { ++ struct ibp_ah_attr attr; ++}; ++ ++struct ibp_sa_join_multicast_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_client; ++ u64 mcentry; ++ u64 device; ++ u8 port_num; ++ u64 comp_mask; ++ u64 gfp_mask; ++ struct ib_sa_mcmember_rec rec; ++}; ++ ++struct ibp_sa_join_multicast_resp { ++ u64 ibp_mcast; ++}; ++ ++struct ibp_sa_free_multicast_cmd { ++ struct ibp_msg_header header; ++ u64 ibp_mcast; ++}; ++ ++struct ibp_sa_get_mcmember_rec_cmd { ++ struct ibp_msg_header header; ++ u64 device; ++ u8 port_num; ++ u64 subnet_prefix; ++ u64 interface_id; ++}; ++ ++struct ibp_sa_get_mcmember_rec_resp { ++ struct ib_sa_mcmember_rec rec; ++}; ++ ++struct ibp_sa_event { ++ enum ib_event_type event_type; ++ u64 ibp_client; ++ union { ++ __u32 send_status; ++ } u; ++ u64 data_length; ++ u8 data[0]; ++}; ++ ++#endif /* SA_IBP_ABI_H */ +diff -urN a6/drivers/infiniband/ibp/sa/sa_server_msg.c a7/drivers/infiniband/ibp/sa/sa_server_msg.c +--- a6/drivers/infiniband/ibp/sa/sa_server_msg.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/sa_server_msg.c 2015-02-23 10:01:30.294769309 -0800 +@@ -0,0 +1,970 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#include "server.h" ++ ++LIST_HEAD(sa_entry_list); ++LIST_HEAD(query_list); ++LIST_HEAD(mcast_list); ++ ++static void free_query_list(struct sa_query_entry *entry) ++{ ++ if (entry) { ++ down_write(&list_rwsem); ++ ++ list_del(&entry->list); ++ ++ up_write(&list_rwsem); ++ } ++} ++ ++static struct sa_query_entry *add_query_list(struct ibp_client *client) ++{ ++ struct sa_query_entry *entry; ++ ++ print_trace("in\n"); ++ ++ entry = kzalloc(sizeof(struct sa_query_entry), GFP_KERNEL); ++ if (!entry) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ entry->ibp_client = client; ++ ++ down_write(&list_rwsem); ++ ++ list_add(&entry->list, &query_list); ++ ++ up_write(&list_rwsem); ++ ++ return entry; ++} ++ ++static struct sa_query_entry *find_query_entry(struct ib_sa_client *client) ++{ ++ struct sa_query_entry *query = NULL; ++ ++ down_read(&list_rwsem); ++ ++ list_for_each_entry(query, &query_list, list) ++ if (query->sa_client == client) ++ goto out; ++ ++ print_err("Could not find sa_query_entry\n"); ++ ++out: ++ up_read(&list_rwsem); ++ ++ return query; ++} ++ ++static struct sa_entry *find_sa_entry(struct ib_sa_client *ib_client) ++{ ++ struct sa_entry *entry = NULL; ++ ++ down_read(&list_rwsem); ++ ++ list_for_each_entry(entry, &sa_entry_list, list) ++ if (&entry->ib_client == ib_client) ++ goto out; ++ ++ print_err("Could not find sa_entry\n"); ++ ++out: ++ up_read(&list_rwsem); ++ ++ return entry; ++} ++ ++/* Translate from server side "true" SA client to proxied SA client on the ++ * client ++ */ ++static struct ib_sa_client *find_ibp_client(struct ibp_client *ibp_client) ++{ ++ struct sa_entry *entry; ++ struct ib_sa_client *client = NULL; ++ ++ down_read(&list_rwsem); ++ ++ list_for_each_entry(entry, &sa_entry_list, list) ++ if (entry->client == ibp_client) { ++ client = &entry->ib_client; ++ goto out; ++ } ++ ++ print_err("Could not find proxied sa_client %p\n", ibp_client); ++ ++out: ++ up_read(&list_rwsem); ++ ++ return client; ++} ++ ++int ibp_cmd_sa_register_client(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct sa_entry *entry; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_sa_register_client_resp *resp; ++ size_t len; ++ int status = 0; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = kzalloc((sizeof(struct sa_entry)), GFP_KERNEL); ++ if (!entry) { ++ print_err("kzalloc failed\n"); ++ status = -ENOMEM; ++ goto send_resp; ++ } ++ ++ entry->client = ibp_client; ++ ++ len += sizeof(*resp); ++ ++ resp = (struct ibp_sa_register_client_resp *) msg->data; ++ ++ resp->ibp_client = (u64) &entry->ib_client; ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, status); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) { ++ kfree(entry); ++ print_err("ibp_send returned %d\n", ret); ++ return ret; ++ } ++ if (status) ++ return status; ++ ++ ib_sa_register_client(&entry->ib_client); ++ ++ down_write(&list_rwsem); ++ list_add(&entry->list, &sa_entry_list); ++ up_write(&list_rwsem); ++ ++ return 0; ++} ++ ++int ibp_cmd_sa_unregister_client(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct sa_entry *entry; ++ struct ibp_sa_unregister_client_cmd *cmd; ++ struct ibp_verb_response_msg *msg; ++ struct ib_sa_client *client; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_unregister_client_cmd *) hdr; ++ client = (struct ib_sa_client *) cmd->ibp_client; ++ 
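++ /* cmd->ibp_client holds the &sa_entry->ib_client pointer handed out by ibp_cmd_sa_register_client */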
msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = find_sa_entry(client); ++ if (!entry) { ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ down_write(&list_rwsem); ++ list_del(&entry->list); ++ up_write(&list_rwsem); ++ ++ ib_sa_unregister_client(&entry->ib_client); ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++int ibp_cmd_sa_cancel_query(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct sa_query_entry *entry; ++ struct ibp_sa_cancel_query_cmd *cmd; ++ struct ibp_verb_response_msg *msg; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_cancel_query_cmd *) hdr; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = find_query_entry((struct ib_sa_client *) cmd->client); ++ if (!entry) { ++ ret = -EINVAL; ++ goto send_resp; ++ } ++ ++ ib_sa_cancel_query(cmd->id, entry->query); ++ ++ free_query_list(entry); ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++int ibp_cmd_init_ah_from_path(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ib_device *device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_init_ah_from_path_cmd *cmd; ++ struct ibp_init_ah_from_path_resp *resp; ++ struct ib_sa_path_rec rec; ++ struct ib_ah_attr attr; ++ size_t len; ++ u8 port_num; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_init_ah_from_path_cmd *) hdr; ++ device = (struct ib_device *) cmd->device; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ port_num = cmd->port_num; ++ ++ rec.service_id = cmd->rec.service_id; ++ rec.dgid.global.interface_id ++ = cmd->rec.dgid_id; ++ rec.dgid.global.subnet_prefix ++ = cmd->rec.dgid_prefix; ++ rec.sgid.global.interface_id ++ = cmd->rec.sgid_id; ++ rec.sgid.global.subnet_prefix ++ = cmd->rec.sgid_prefix; ++ rec.dlid = cmd->rec.dlid; ++ rec.slid = cmd->rec.slid; ++ rec.raw_traffic = cmd->rec.raw_traffic; ++ rec.flow_label = cmd->rec.flow_label; ++ rec.hop_limit = cmd->rec.hop_limit; ++ rec.traffic_class = cmd->rec.traffic_class; ++ rec.reversible = cmd->rec.reversible; ++ rec.numb_path = cmd->rec.numb_path; ++ rec.pkey = cmd->rec.pkey; ++ rec.qos_class = cmd->rec.qos_class; ++ rec.sl = cmd->rec.sl; ++ rec.mtu_selector = cmd->rec.mtu_selector; ++ rec.mtu = cmd->rec.mtu; ++ rec.rate_selector = cmd->rec.rate_selector; ++ rec.rate = cmd->rec.rate; ++ rec.packet_life_time_selector ++ = cmd->rec.packet_life_time_selector; ++ rec.packet_life_time = cmd->rec.packet_life_time; ++ rec.preference = cmd->rec.preference; ++ ++ ret = ib_init_ah_from_path(device, port_num, &rec, &attr); ++ if (ret) ++ print_err("init_ah_from_path returned %d\n", ret); ++ ++ len += sizeof(*resp); ++ resp = (struct ibp_init_ah_from_path_resp *) msg->data; ++ ++ resp->attr.dgid_prefix = attr.grh.dgid.global.subnet_prefix; ++ resp->attr.dgid_id = attr.grh.dgid.global.interface_id; ++ resp->attr.flow_label = attr.grh.flow_label; ++ resp->attr.sgid_index = attr.grh.sgid_index; ++ resp->attr.hop_limit = attr.grh.hop_limit; ++ resp->attr.traffic_class ++ = attr.grh.traffic_class; ++ resp->attr.dlid = attr.dlid; ++ resp->attr.sl = attr.sl; ++ resp->attr.src_path_bits ++ = 
attr.src_path_bits; ++ resp->attr.static_rate = attr.static_rate; ++ resp->attr.ah_flags = attr.ah_flags; ++ resp->attr.port_num = attr.port_num; ++ ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++int ibp_cmd_init_ah_from_mcmember(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ib_device *device; ++ struct ibp_init_ah_from_mcmember_cmd *cmd; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_init_ah_from_mcmember_resp *resp; ++ struct ib_sa_mcmember_rec rec; ++ struct ib_ah_attr attr; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_init_ah_from_mcmember_cmd *) hdr; ++ device = (struct ib_device *) cmd->device; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ rec.mgid.global.subnet_prefix = cmd->rec.mgid.global.subnet_prefix; ++ rec.mgid.global.interface_id = cmd->rec.mgid.global.interface_id; ++ rec.port_gid.global.subnet_prefix ++ = cmd->rec.port_gid.global.subnet_prefix; ++ rec.port_gid.global.interface_id ++ = cmd->rec.port_gid.global.interface_id; ++ rec.qkey = cmd->rec.qkey; ++ rec.mlid = cmd->rec.mlid; ++ rec.mtu_selector = cmd->rec.mtu_selector; ++ rec.mtu = cmd->rec.mtu; ++ rec.traffic_class = cmd->rec.traffic_class; ++ rec.pkey = cmd->rec.pkey; ++ rec.rate_selector = cmd->rec.rate_selector; ++ rec.rate = cmd->rec.rate; ++ rec.packet_life_time_selector ++ = cmd->rec.packet_life_time_selector; ++ rec.packet_life_time = cmd->rec.packet_life_time; ++ rec.sl = cmd->rec.sl; ++ rec.flow_label = cmd->rec.flow_label; ++ rec.hop_limit = cmd->rec.hop_limit; ++ rec.scope = cmd->rec.scope; ++ rec.join_state = cmd->rec.join_state; ++ rec.proxy_join = cmd->rec.proxy_join; ++ ++ ret = ib_init_ah_from_mcmember(device, cmd->port_num, &rec, &attr); ++ if (ret) { ++ print_err("ib_init_ah_from_mcmember returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ resp = (struct ibp_init_ah_from_mcmember_resp *) msg->data; ++ ++ resp->attr.dgid_prefix = attr.grh.dgid.global.subnet_prefix; ++ resp->attr.dgid_id = attr.grh.dgid.global.interface_id; ++ resp->attr.flow_label = attr.grh.flow_label; ++ resp->attr.sgid_index = attr.grh.sgid_index; ++ resp->attr.hop_limit = attr.grh.hop_limit; ++ resp->attr.traffic_class ++ = attr.grh.traffic_class; ++ resp->attr.dlid = attr.dlid; ++ resp->attr.sl = attr.sl; ++ resp->attr.src_path_bits ++ = attr.src_path_bits; ++ resp->attr.static_rate = attr.static_rate; ++ resp->attr.ah_flags = attr.ah_flags; ++ resp->attr.port_num = attr.port_num; ++ ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++static void ibp_send_callback(struct work_struct *work) ++{ ++ struct callback_work *cb_work; ++ struct ibp_callback_msg *msg; ++ struct cb_header *header; ++ struct ibp_client *client; ++ size_t len; ++ int data_length; ++ int cb_type; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cb_work = (struct callback_work *) work; ++ len = sizeof(*msg); ++ ++ if (!cb_work) { ++ print_err("Invalid callback work_struct\n"); ++ return; ++ } ++ ++ header = &cb_work->msg.header; ++ cb_type = header->cb_type; ++ ++ client = cb_work->client; ++ if (!client) { ++ print_err("Invalid callback client\n"); ++ goto err; ++ } ++ if (!client->ep) { ++ print_err("Invalid callback client ep\n"); ++ 
goto err; ++ } ++ if (cb_work->data->ret) { ++ print_err("caller failed to send msg to card\n"); ++ goto err; ++ } ++ ++ data_length = cb_work->length; ++ ++ if (cb_type == PATH_REC_GET_CB) { ++ ret = sizeof(struct path_rec_data) + sizeof(struct cb_header); ++ if (data_length != ret) { ++ print_err("Invalid data length %d, expecting %d\n", ++ data_length, ret); ++ goto err; ++ } ++ } else if (cb_type == JOIN_MCAST_CB) { ++ ret = sizeof(struct mc_join_data) + sizeof(struct cb_header); ++ if (data_length != ret) { ++ print_err("Invalid data length %d, expecting %d\n", ++ data_length, ret); ++ goto err; ++ } ++ } else { ++ print_err("Invalid callback type %d\n", cb_type); ++ goto err; ++ } ++ ++ len += data_length; ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ print_err("kzmalloc failed\n"); ++ goto err; ++ } ++ IBP_INIT_MSG(msg, len, CALLBACK); ++ ++ memcpy(msg->data, &cb_work->msg, data_length); ++ ++ /* wait for host to send message to card before processing cb */ ++ mutex_lock(&cb_work->data->lock); ++ ++ ret = ibp_send(client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ mutex_unlock(&cb_work->data->lock); ++ ++ kfree(msg); ++err: ++ if (cb_type == PATH_REC_GET_CB) ++ kfree(cb_work->data); ++ ++ kfree(cb_work); ++} ++ ++static void path_rec_get_callback(int status, struct ib_sa_path_rec *resp, ++ void *context) ++{ ++ struct path_rec_cb_data *data; ++ struct sa_query_entry *entry; ++ struct ibp_client *client; ++ struct ib_sa_client *ib_client; ++ struct callback_work *cb_work; ++ struct cb_header *header; ++ struct path_rec_data *path_rec; ++ ++ print_trace("in\n"); ++ ++ data = (struct path_rec_cb_data *) context; ++ entry = data->entry; ++ client = entry->ibp_client; ++ ++ cb_work = kzalloc(sizeof(struct callback_work), GFP_KERNEL); ++ if (!cb_work) { ++ print_err("kzalloc failed\n"); ++ goto err1; ++ } ++ ++ ib_client = find_ibp_client(client); ++ if (!ib_client) { ++ print_err("Could not find client for event handler\n"); ++ goto err2; ++ } ++ ++ if (!entry->query) { ++ print_err("Callback occurred before call returned\n"); ++ goto err2; ++ } ++ ++ cb_work->data = (struct generic_cb_data *) data; ++ cb_work->client = client; ++ cb_work->length = sizeof(*header) + sizeof(*path_rec); ++ ++ header = &cb_work->msg.header; ++ header->cb_type = PATH_REC_GET_CB; ++ header->status = status; ++ header->ibp_client = (u64) ib_client; ++ ++ path_rec = &cb_work->msg.u.path_rec; ++ path_rec->entry = data->ibp_entry; ++ path_rec->query = data->ibp_query; ++ ++ if (status) { ++ print_err("callback status %d\n", status); ++ // XXX How is data deallocated in error cases? 
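++ /* queue the callback anyway so the card-side caller sees the error status */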
++ goto queue_work; ++ } ++ ++ path_rec->resp.service_id = resp->service_id; ++ path_rec->resp.dgid_prefix = resp->dgid.global.subnet_prefix; ++ path_rec->resp.dgid_id = resp->dgid.global.interface_id; ++ path_rec->resp.sgid_prefix = resp->sgid.global.subnet_prefix; ++ path_rec->resp.sgid_id = resp->sgid.global.interface_id; ++ path_rec->resp.dlid = resp->dlid; ++ path_rec->resp.slid = resp->slid; ++ path_rec->resp.raw_traffic = resp->raw_traffic; ++ path_rec->resp.flow_label = resp->flow_label; ++ path_rec->resp.hop_limit = resp->hop_limit; ++ path_rec->resp.traffic_class = resp->traffic_class; ++ path_rec->resp.reversible = resp->reversible; ++ path_rec->resp.numb_path = resp->numb_path; ++ path_rec->resp.pkey = resp->pkey; ++ path_rec->resp.qos_class = resp->qos_class; ++ path_rec->resp.sl = resp->sl; ++ path_rec->resp.mtu_selector = resp->mtu_selector; ++ path_rec->resp.mtu = resp->mtu; ++ path_rec->resp.rate_selector = resp->rate_selector; ++ path_rec->resp.rate = resp->rate; ++ path_rec->resp.packet_life_time_selector ++ = resp->packet_life_time_selector; ++ path_rec->resp.packet_life_time = resp->packet_life_time; ++ path_rec->resp.preference = resp->preference; ++ ++queue_work: ++ free_query_list(entry); ++ ++ INIT_WORK(&cb_work->work, ibp_send_callback); ++ queue_work(client->workqueue, &cb_work->work); ++ return; ++err2: ++ kfree(cb_work); ++err1: ++ kfree(data); ++ return; ++} ++ ++int ibp_cmd_sa_path_rec_get(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ib_device *ib_device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_sa_path_rec_get_cmd *cmd; ++ struct ibp_sa_path_rec_get_resp *resp; ++ struct ib_sa_client *client; ++ struct ib_sa_query *sa_query; ++ struct sa_query_entry *entry; ++ struct path_rec_cb_data *data = NULL; ++ struct ib_sa_path_rec rec; ++ size_t len; ++ int query_id; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_path_rec_get_cmd *) hdr; ++ ib_device = (struct ib_device *) cmd->device; ++ client = (struct ib_sa_client *) cmd->ibp_client; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ entry = add_query_list(ibp_client); ++ if (IS_ERR(entry)) { ++ ret = PTR_ERR(entry); ++ goto send_resp; ++ } ++ ++ data = kzalloc(sizeof(*data), GFP_KERNEL); ++ if (!data) { ++ free_query_list(entry); ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ ++ data->entry = entry; ++ data->ibp_entry = cmd->entry; ++ data->ibp_query = cmd->query; ++ ++ rec.service_id = cmd->rec.service_id; ++ rec.dgid.global.interface_id ++ = cmd->rec.dgid_id; ++ rec.dgid.global.subnet_prefix ++ = cmd->rec.dgid_prefix; ++ rec.sgid.global.interface_id ++ = cmd->rec.sgid_id; ++ rec.sgid.global.subnet_prefix ++ = cmd->rec.sgid_prefix; ++ rec.dlid = cmd->rec.dlid; ++ rec.slid = cmd->rec.slid; ++ rec.raw_traffic = cmd->rec.raw_traffic; ++ rec.flow_label = cmd->rec.flow_label; ++ rec.hop_limit = cmd->rec.hop_limit; ++ rec.traffic_class = cmd->rec.traffic_class; ++ rec.reversible = cmd->rec.reversible; ++ rec.numb_path = cmd->rec.numb_path; ++ rec.pkey = cmd->rec.pkey; ++ rec.qos_class = cmd->rec.qos_class; ++ rec.sl = cmd->rec.sl; ++ rec.mtu_selector = cmd->rec.mtu_selector; ++ rec.mtu = cmd->rec.mtu; ++ rec.rate_selector = cmd->rec.rate_selector; ++ rec.rate = cmd->rec.rate; ++ rec.packet_life_time_selector ++ = cmd->rec.packet_life_time_selector; ++ rec.packet_life_time = cmd->rec.packet_life_time; ++ rec.preference = cmd->rec.preference; ++ ++ mutex_init(&data->lock); ++ 
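++ /* hold data->lock until the response below is sent; ibp_send_callback() waits on it before forwarding the callback */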
mutex_lock(&data->lock); ++ ++ query_id = ib_sa_path_rec_get(client, ib_device, cmd->port_num, &rec, ++ cmd->comp_mask, cmd->timeout_ms, ++ GFP_KERNEL, path_rec_get_callback, data, ++ &sa_query); ++ if (query_id < 0) { ++ ret = query_id; ++ print_err("ib_sa_path_rec_get returned %d\n", ret); ++ free_query_list(entry); ++ mutex_unlock(&data->lock); ++ kfree(data); ++ data = NULL; ++ goto send_resp; ++ } ++ entry->query = sa_query; ++ entry->sa_client = client; ++ entry->id = query_id; ++ ++ len += sizeof(*resp); ++ resp = (struct ibp_sa_path_rec_get_resp *) msg->data; ++ resp->query_id = query_id; ++ resp->sa_query = (u64)sa_query; ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ ++ if (data) { ++ data->ret = ret; ++ mutex_unlock(&data->lock); ++ } ++ ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++static int sa_join_callback(int status, struct ib_sa_multicast *multicast) ++{ ++ struct join_mcast_cb_data *data; ++ struct ibp_client *client; ++ struct ib_sa_client *ib_client; ++ struct callback_work *cb_work; ++ struct cb_header *header; ++ struct mc_join_data *mc_join; ++ struct ib_sa_mcmember_rec *ib_rec; ++ struct ibp_sa_mcmember_rec *ibp_rec; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ data = (struct join_mcast_cb_data *) multicast->context; ++ ++ if (status == -ENETRESET) ++ goto err1; ++ ++ cb_work = kzalloc(sizeof(struct callback_work), GFP_KERNEL); ++ if (!cb_work) { ++ print_err("kzalloc failed\n"); ++ ret = -ENOMEM; ++ goto err1; ++ } ++ ++ client = data->client; ++ ++ ib_client = find_ibp_client(client); ++ if (!ib_client) { ++ print_err("Could not find client for event handler\n"); ++ ret = -EINVAL; ++ goto err2; ++ } ++ ++ cb_work->data = (struct generic_cb_data *) data; ++ cb_work->client = client; ++ cb_work->length = sizeof(*header) + sizeof(*mc_join); ++ ++ header = &cb_work->msg.header; ++ header->cb_type = JOIN_MCAST_CB; ++ header->status = status; ++ header->ibp_client = (u64) ib_client; ++ ++ mc_join = &cb_work->msg.u.mc_join; ++ mc_join->ibp_mcast = (u64) multicast; ++ mc_join->mcentry = data->mcentry; ++ ++ if (status) { ++ print_err("callback status %d\n", status); ++ goto queue_work; ++ } ++ ++ ib_rec = &multicast->rec; ++ ibp_rec = &mc_join->rec; ++ ++ ibp_rec->mgid_prefix = ib_rec->mgid.global.subnet_prefix; ++ ibp_rec->mgid_id = ib_rec->mgid.global.interface_id; ++ ibp_rec->port_gid_prefix = ib_rec->port_gid.global.subnet_prefix; ++ ibp_rec->port_gid_id = ib_rec->port_gid.global.interface_id; ++ ibp_rec->qkey = ib_rec->qkey; ++ ibp_rec->mlid = ib_rec->mlid; ++ ibp_rec->mtu_selector = ib_rec->mtu_selector; ++ ibp_rec->mtu = ib_rec->mtu; ++ ibp_rec->traffic_class = ib_rec->traffic_class; ++ ibp_rec->pkey = ib_rec->pkey; ++ ibp_rec->rate_selector = ib_rec->rate_selector; ++ ibp_rec->rate = ib_rec->rate; ++ ibp_rec->packet_life_time_selector ++ = ib_rec->packet_life_time_selector; ++ ibp_rec->packet_life_time = ib_rec->packet_life_time; ++ ibp_rec->sl = ib_rec->sl; ++ ibp_rec->flow_label = ib_rec->flow_label; ++ ibp_rec->hop_limit = ib_rec->hop_limit; ++ ibp_rec->join_state = ib_rec->join_state; ++ ibp_rec->proxy_join = ib_rec->proxy_join; ++ ++queue_work: ++ INIT_WORK(&cb_work->work, ibp_send_callback); ++ queue_work(client->workqueue, &cb_work->work); ++ return 0; ++err2: ++ kfree(cb_work); ++err1: ++ return ret; ++} ++ ++int ibp_cmd_sa_join_multicast(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ib_device *ib_device; ++ 
struct ibp_verb_response_msg *msg; ++ struct ibp_sa_join_multicast_cmd *cmd; ++ struct ibp_sa_join_multicast_resp *resp; ++ struct ib_sa_client *client; ++ struct ib_sa_multicast *multicast; ++ struct join_mcast_cb_data *data; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_join_multicast_cmd *) hdr; ++ ib_device = (struct ib_device *) cmd->device; ++ client = (struct ib_sa_client *) cmd->ibp_client; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ data = kzalloc(sizeof(*data), GFP_KERNEL); ++ if (!data) { ++ ret = -ENOMEM; ++ goto send_resp; ++ } ++ ++ data->client = ibp_client; ++ data->mcentry = cmd->mcentry; ++ ++ mutex_init(&data->lock); ++ mutex_lock(&data->lock); ++ ++ down_write(&list_rwsem); ++ list_add(&data->list, &mcast_list); ++ up_write(&list_rwsem); ++ ++ multicast = ib_sa_join_multicast(client, ib_device, ++ cmd->port_num, &cmd->rec, ++ cmd->comp_mask, GFP_KERNEL, ++ sa_join_callback, data); ++ ++ if (IS_ERR(multicast)) { ++ ret = PTR_ERR(multicast); ++ print_err("ib_sa_join_multicast returned %d\n", ret); ++ mutex_unlock(&data->lock); ++ down_write(&list_rwsem); ++ list_del(&data->list); ++ up_write(&list_rwsem); ++ kfree(data); ++ data = NULL; ++ goto send_resp; ++ } ++ data->mcast = multicast; ++ ++ len += sizeof(*resp); ++ resp = (struct ibp_sa_join_multicast_resp *) msg->data; ++ ++ resp->ibp_mcast = (u64) multicast; ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ ++ if (data) { ++ data->ret = ret; ++ mutex_unlock(&data->lock); ++ } ++ ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++int ibp_cmd_sa_free_multicast(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ibp_verb_response_msg *msg; ++ struct ibp_sa_free_multicast_cmd *cmd; ++ struct ib_sa_multicast *multicast; ++ struct join_mcast_cb_data *data; ++ size_t len; ++ int ret = 0; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_free_multicast_cmd *) hdr; ++ multicast = (struct ib_sa_multicast *) cmd->ibp_mcast; ++ data = (struct join_mcast_cb_data *) multicast->context; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ ib_sa_free_multicast(multicast); ++ ++ down_write(&list_rwsem); ++ list_del(&data->list); ++ up_write(&list_rwsem); ++ ++ kfree(data); ++ ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} ++ ++int ibp_cmd_sa_get_mcmember_rec(struct ibp_client *ibp_client, ++ struct ibp_msg_header *hdr) ++{ ++ struct ib_device *ib_device; ++ struct ibp_verb_response_msg *msg; ++ struct ibp_sa_get_mcmember_rec_cmd *cmd; ++ struct ibp_sa_get_mcmember_rec_resp *resp; ++ struct ib_sa_mcmember_rec rec; ++ union ib_gid mgid; ++ size_t len; ++ int ret; ++ ++ print_trace("in\n"); ++ ++ cmd = (struct ibp_sa_get_mcmember_rec_cmd *) hdr; ++ ib_device = (struct ib_device *) cmd->device; ++ msg = (struct ibp_verb_response_msg *) ibp_client->tx_buf; ++ len = sizeof(*msg); ++ ++ mgid.global.subnet_prefix = cmd->subnet_prefix; ++ mgid.global.interface_id = cmd->interface_id; ++ ++ ret = ib_sa_get_mcmember_rec(ib_device, cmd->port_num, &mgid, &rec); ++ if (ret) { ++ print_err("ib_sa_get_mcmember_rec returned %d\n", ret); ++ goto send_resp; ++ } ++ ++ len += sizeof(*resp); ++ resp = (struct ibp_sa_get_mcmember_rec_resp *) msg->data; ++ ++ 
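++ /* copy the queried mcmember record into the response, field by field */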
resp->rec.mgid.global.subnet_prefix ++ = rec.mgid.global.subnet_prefix; ++ resp->rec.mgid.global.interface_id ++ = rec.mgid.global.interface_id; ++ resp->rec.port_gid.global.subnet_prefix ++ = rec.port_gid.global.subnet_prefix; ++ resp->rec.port_gid.global.interface_id ++ = rec.port_gid.global.interface_id; ++ resp->rec.qkey = rec.qkey; ++ resp->rec.mlid = rec.mlid; ++ resp->rec.mtu_selector = rec.mtu_selector; ++ resp->rec.mtu = rec.mtu; ++ resp->rec.traffic_class = rec.traffic_class; ++ resp->rec.pkey = rec.pkey; ++ resp->rec.rate_selector = rec.rate_selector; ++ resp->rec.rate = rec.rate; ++ resp->rec.packet_life_time_selector ++ = rec.packet_life_time_selector; ++ resp->rec.packet_life_time ++ = rec.packet_life_time; ++ resp->rec.sl = rec.sl; ++ resp->rec.flow_label = rec.flow_label; ++ resp->rec.hop_limit = rec.hop_limit; ++ resp->rec.scope = rec.scope; ++ resp->rec.join_state = rec.join_state; ++ resp->rec.proxy_join = rec.proxy_join; ++ ++send_resp: ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, ret); ++ ++ ret = ibp_send(ibp_client->ep, msg, len); ++ if (ret) ++ print_err("ibp_send returned %d\n", ret); ++ ++ return ret; ++} +diff -urN a6/drivers/infiniband/ibp/sa/sa_table.h a7/drivers/infiniband/ibp/sa/sa_table.h +--- a6/drivers/infiniband/ibp/sa/sa_table.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/sa_table.h 2015-02-23 10:01:30.294769309 -0800 +@@ -0,0 +1,131 @@ ++/*" ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#define PATH_REC_FIELD(field) \ ++ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \ ++ .struct_size_bytes = sizeof((struct ib_sa_path_rec *) 0)->field, \ ++ .field_name = "sa_path_rec:" #field ++ ++static const struct ib_field path_rec_table[] = { ++ { PATH_REC_FIELD(service_id), ++ .offset_words = 0, ++ .offset_bits = 0, ++ .size_bits = 64 }, ++ { PATH_REC_FIELD(dgid), ++ .offset_words = 2, ++ .offset_bits = 0, ++ .size_bits = 128 }, ++ { PATH_REC_FIELD(sgid), ++ .offset_words = 6, ++ .offset_bits = 0, ++ .size_bits = 128 }, ++ { PATH_REC_FIELD(dlid), ++ .offset_words = 10, ++ .offset_bits = 0, ++ .size_bits = 16 }, ++ { PATH_REC_FIELD(slid), ++ .offset_words = 10, ++ .offset_bits = 16, ++ .size_bits = 16 }, ++ { PATH_REC_FIELD(raw_traffic), ++ .offset_words = 11, ++ .offset_bits = 0, ++ .size_bits = 1 }, ++ { RESERVED, ++ .offset_words = 11, ++ .offset_bits = 1, ++ .size_bits = 3 }, ++ { PATH_REC_FIELD(flow_label), ++ .offset_words = 11, ++ .offset_bits = 4, ++ .size_bits = 20 }, ++ { PATH_REC_FIELD(hop_limit), ++ .offset_words = 11, ++ .offset_bits = 24, ++ .size_bits = 8 }, ++ { PATH_REC_FIELD(traffic_class), ++ .offset_words = 12, ++ .offset_bits = 0, ++ .size_bits = 8 }, ++ { PATH_REC_FIELD(reversible), ++ .offset_words = 12, ++ .offset_bits = 8, ++ .size_bits = 1 }, ++ { PATH_REC_FIELD(numb_path), ++ .offset_words = 12, ++ .offset_bits = 9, ++ .size_bits = 7 }, ++ { PATH_REC_FIELD(pkey), ++ .offset_words = 12, ++ .offset_bits = 16, ++ .size_bits = 16 }, ++ { PATH_REC_FIELD(qos_class), ++ .offset_words = 13, ++ .offset_bits = 0, ++ .size_bits = 12 }, ++ { PATH_REC_FIELD(sl), ++ .offset_words = 13, ++ .offset_bits = 12, ++ .size_bits = 4 }, ++ { PATH_REC_FIELD(mtu_selector), ++ .offset_words = 13, ++ .offset_bits = 16, ++ .size_bits = 2 }, ++ { PATH_REC_FIELD(mtu), ++ .offset_words = 13, ++ .offset_bits = 18, ++ .size_bits = 6 }, ++ { PATH_REC_FIELD(rate_selector), ++ .offset_words = 13, ++ .offset_bits = 24, ++ .size_bits = 2 }, ++ { PATH_REC_FIELD(rate), ++ .offset_words = 13, ++ .offset_bits = 26, ++ .size_bits = 6 }, ++ { PATH_REC_FIELD(packet_life_time_selector), ++ .offset_words = 14, ++ .offset_bits = 0, ++ .size_bits = 2 }, ++ { PATH_REC_FIELD(packet_life_time), ++ .offset_words = 14, ++ .offset_bits = 2, ++ .size_bits = 6 }, ++ { PATH_REC_FIELD(preference), ++ .offset_words = 14, ++ .offset_bits = 8, ++ .size_bits = 8 }, ++ { RESERVED, ++ .offset_words = 14, ++ .offset_bits = 16, ++ .size_bits = 48 }, ++}; +diff -urN a6/drivers/infiniband/ibp/sa/server.c a7/drivers/infiniband/ibp/sa/server.c +--- a6/drivers/infiniband/ibp/sa/server.c 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/server.c 2015-02-23 10:01:30.294769309 -0800 +@@ -0,0 +1,221 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. 
++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include "server.h" ++ ++MODULE_AUTHOR("Jerrie Coffman"); ++MODULE_AUTHOR("Phil Cayton"); ++MODULE_AUTHOR("Jay Sternberg"); ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++ ++MODULE_PARAM(port, port, int, IBP_SA_PORT, "Connection port"); ++MODULE_PARAM(backlog, backlog, int, 8, "Connection backlog"); ++MODULE_PARAM(timeout, timeout, int, 1000, "Listen/Poll time in milliseconds"); ++ ++#ifdef IBP_DEBUG ++MODULE_PARAM(debug_level, debug_level, int, 0, "Debug: 0-none, 1-some, 2-all"); ++#endif ++ ++struct rw_semaphore list_rwsem; ++ ++LIST_HEAD(client_list); ++ ++static struct task_struct *listen_thread; ++ ++static struct ibp_client *ibp_create_client(scif_epd_t ep, uint16_t node) ++{ ++ struct ibp_client *client; ++ int ret = -ENOMEM; ++ ++ client = kzalloc(sizeof(*client), GFP_KERNEL); ++ if (!client) { ++ print_err("kzalloc failed\n"); ++ return ERR_PTR(ret); ++ } ++ ++ client->ep = ep; ++ ++ client->rx_buf = (void *)__get_free_page(GFP_KERNEL); ++ if (!client->rx_buf) { ++ print_err("__get_free_page rx_buf failed\n"); ++ goto err0; ++ } ++ ++ client->tx_buf = (void *)__get_free_page(GFP_KERNEL); ++ if (!client->tx_buf) { ++ print_err("__get_free_page tx_buf failed\n"); ++ goto err1; ++ } ++ ++ client->workqueue = create_singlethread_workqueue(DRV_NAME); ++ if (!client->workqueue) { ++ print_err("create_singlethread_workqueue failed\n"); ++ goto err2; ++ } ++ ++ down_write(&list_rwsem); ++ list_add(&client->list, &client_list); ++ up_write(&list_rwsem); ++ ++ client->ibp_sa_client_thread = kthread_run(ibp_process_recvs, ++ client, DRV_NAME); ++ if (!client->ibp_sa_client_thread) { ++ print_err("create client thread failed\n"); ++ goto err3; ++ } ++ ++ return client; ++err3: ++ down_write(&list_rwsem); ++ list_del(&client->list); ++ up_write(&list_rwsem); ++ ++ destroy_workqueue(client->workqueue); ++err2: ++ free_page((uintptr_t)client->tx_buf); ++err1: ++ free_page((uintptr_t)client->rx_buf); ++err0: ++ kfree(client); ++ return ERR_PTR(ret); ++} ++ ++static int ibp_sa_listen(void *data) ++{ ++ struct ibp_client *client; ++ struct scif_pollepd listen; ++ struct scif_portID peer; ++ scif_epd_t ep; ++ int ret; ++ ++ listen.epd = scif_open(); ++ if (IS_NULL_OR_ERR(listen.epd)) { ++ print_err("scif_open failed\n"); ++ ret = -EIO; ++ goto err0; ++ } ++ listen.events = POLLIN; ++ ++ ret = scif_bind(listen.epd, port); ++ if (ret < 0) { ++ print_err("scif_bind returned %d\n", ret); ++ goto err1; ++ } ++ ++ ret = scif_listen(listen.epd, backlog); ++ if (ret) { ++ print_err("scif_listen returned %d\n", ret); ++ goto err1; ++ } ++ ++ while (!kthread_should_stop()) { ++ ++ schedule(); ++ ++ ret = scif_poll(&listen, 1, timeout); ++ if (ret == 0) /* timeout */ ++ continue; ++ if (ret < 0) { ++ print_err("scif_poll 
revents 0x%x\n", listen.revents); ++ continue; ++ } ++ ++ ret = scif_accept(listen.epd, &peer, &ep, 0); ++ if (ret) { ++ print_err("scif_accept returned %d\n", ret); ++ continue; ++ } ++ ++ print_dbg("accepted node %d port %d\n", peer.node, peer.port); ++ ++ client = ibp_create_client(ep, peer.node); ++ if (IS_ERR(client)) { ++ ret = PTR_ERR(client); ++ print_err("ibp_create_client returned %d\n", ret); ++ scif_close(ep); ++ } ++ } ++err1: ++ scif_close(listen.epd); ++err0: ++ return ret; ++} ++ ++static int __init ibp_sa_server_init(void) ++{ ++ int ret = 0; ++ ++ print_info(DRV_SIGNON); ++ ++ init_rwsem(&list_rwsem); ++ ++ /* Start a thread for inbound connections. */ ++ listen_thread = kthread_run(ibp_sa_listen, NULL, DRV_NAME); ++ if (IS_NULL_OR_ERR(listen_thread)) { ++ ret = PTR_ERR(listen_thread); ++ print_err("kthread_run returned %d\n", ret); ++ } ++ ++ return ret; ++} ++ ++static void __exit ibp_sa_server_exit(void) ++{ ++ struct ibp_client *client, *next; ++ struct completion done; ++ ++ kthread_stop(listen_thread); ++ ++ down_write(&list_rwsem); ++ list_for_each_entry_safe(client, next, &client_list, list) { ++ init_completion(&done); ++ client->done = &done; ++ ++ /* Close scif ep to unblock the client thread scif_recv */ ++ scif_close(client->ep); ++ ++ up_write(&list_rwsem); ++ ++ /* Wait for client thread to finish */ ++ wait_for_completion(&done); ++ ++ down_write(&list_rwsem); ++ } ++ up_write(&list_rwsem); ++ ++ print_info(DRV_DESC " unloaded\n"); ++} ++ ++module_init(ibp_sa_server_init); ++module_exit(ibp_sa_server_exit); +diff -urN a6/drivers/infiniband/ibp/sa/server.h a7/drivers/infiniband/ibp/sa/server.h +--- a6/drivers/infiniband/ibp/sa/server.h 1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/server.h 2015-02-23 10:01:30.294769309 -0800 +@@ -0,0 +1,172 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++ ++#ifndef SERVER_H ++#define SERVER_H ++ ++#include ++#include ++#include ++#include ++#include ++#include "ibp-abi.h" ++#include "sa_ibp_abi.h" ++#include "common.h" ++ ++#define DRV_ROLE "Server" ++#define DRV_NAME "ibp_sa_server" ++ ++extern int timeout; ++extern struct rw_semaphore list_rwsem; ++extern struct list_head client_list; ++extern struct list_head sa_entry_list; ++extern struct list_head query_list; ++extern struct list_head mcast_list; ++ ++struct ib_sa_sm_ah { ++ struct ib_ah *ah; ++ struct kref ref; ++ u16 pkey_index; ++ u8 src_path_mask; ++}; ++ ++struct ib_sa_port { ++ struct ib_mad_agent *agent; ++ struct ib_mad_agent *notice_agent; ++ struct ib_sa_sm_ah *sm_ah; ++ struct work_struct update_task; ++ spinlock_t ah_lock; ++ u8 port_num; ++ struct ib_device *device; ++}; ++ ++struct ib_sa_device { ++ int start_port, end_port; ++ struct ib_event_handler event_handler; ++ struct ib_sa_port port[0]; ++}; ++ ++struct ibp_client { ++ struct list_head list; ++ scif_epd_t ep; ++ void *rx_buf; ++ void *tx_buf; ++ struct completion *done; ++ struct workqueue_struct *workqueue; ++ struct task_struct *ibp_sa_client_thread; ++}; ++ ++struct sa_entry { ++ struct list_head list; ++ struct ib_sa_client ib_client; ++ struct ibp_client *client; ++}; ++ ++struct sa_query_entry { ++ struct list_head list; ++ int id; ++ struct ibp_client *ibp_client; ++ struct ib_sa_client *sa_client; ++ struct ib_sa_query *query; ++}; ++ ++struct path_rec_cb_data { ++ struct mutex lock; ++ int ret; ++ struct sa_query_entry *entry; ++ u64 ibp_entry; ++ u64 ibp_query; ++}; ++ ++struct join_mcast_cb_data { ++ struct mutex lock; ++ int ret; ++ struct ibp_client *client; ++ struct ib_sa_multicast *mcast; ++ struct list_head list; ++ u64 entry; ++ u64 mcentry; ++}; ++ ++struct generic_cb_data { ++ struct mutex lock; ++ int ret; ++}; ++ ++struct callback_work { ++ struct work_struct work; ++ struct ibp_client *client; ++ struct generic_cb_data *data; ++ int length; ++ struct callback_msg msg; ++}; ++ ++#define IBP_INIT_MSG(msg, size, op) \ ++ do { \ ++ (msg)->header.opcode = IBP_##op; \ ++ (msg)->header.length = (size); \ ++ (msg)->header.status = 0; \ ++ (msg)->header.reserved = 0; \ ++ (msg)->header.request = 0; \ ++ } while (0) ++ ++#define IBP_INIT_RESP(resp, size, op, req, stat) \ ++ do { \ ++ (resp)->header.opcode = IBP_##op; \ ++ (resp)->header.length = (size); \ ++ (resp)->header.status = (stat); \ ++ (resp)->header.reserved = 0; \ ++ (resp)->header.request = (req); \ ++ } while (0) ++ ++int ibp_process_recvs(void *p); ++ ++int ibp_cmd_sa_path_rec_get(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_register_client(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_unregister_client(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_cancel_query(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_init_ah_from_path(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_init_ah_from_mcmember(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_join_multicast(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_free_multicast(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++int ibp_cmd_sa_get_mcmember_rec(struct ibp_client *client, ++ struct ibp_msg_header *hdr); ++ ++#endif /* SERVER_H */ +diff -urN a6/drivers/infiniband/ibp/sa/server_msg.c a7/drivers/infiniband/ibp/sa/server_msg.c +--- a6/drivers/infiniband/ibp/sa/server_msg.c 
1969-12-31 16:00:00.000000000 -0800 ++++ a7/drivers/infiniband/ibp/sa/server_msg.c 2015-02-23 10:01:30.294769309 -0800 +@@ -0,0 +1,185 @@ ++/* ++ * Copyright (c) 2011-2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++ ++#include ++ ++#include "server.h" ++#include "sa_ibp_abi.h" ++ ++int ibp_send(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_send(ep, buf, (uint32_t)len, SCIF_SEND_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_send returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++int ibp_recv(scif_epd_t ep, void *buf, size_t len) ++{ ++ int ret; ++ ++ while (len) { ++ ret = scif_recv(ep, buf, (uint32_t)len, SCIF_RECV_BLOCK); ++ if (ret < 0) { ++ print_dbg("scif_recv returned %d\n", ret); ++ return ret; ++ } ++ buf += ret; ++ len -= ret; ++ } ++ ++ return 0; ++} ++ ++static int ++ibp_cmd_bad_request(struct ibp_client *client, struct ibp_msg_header *hdr) ++{ ++ struct ibp_verb_response_msg *msg; ++ size_t len; ++ int status = -EBADRQC; ++ ++ msg = (struct ibp_verb_response_msg *) client->tx_buf; ++ len = sizeof(*msg); ++ ++ print_dbg("opcode 0x%x\n", hdr->opcode); ++ ++ IBP_INIT_RESP(msg, len, RESPONSE, hdr->request, status); ++ return ibp_send(client->ep, msg, len); ++} ++ ++static void ++ibp_sa_destroy_client(struct ibp_client *client) ++{ ++ struct join_mcast_cb_data *mcast, *next_mcast; ++ struct sa_query_entry *query, *next_query; ++ struct sa_entry *sa, *next_sa; ++ ++ down_write(&list_rwsem); ++ list_del(&client->list); ++ list_for_each_entry_safe(mcast, next_mcast, &mcast_list, list) ++ if (mcast->client == client) { ++ ib_sa_free_multicast(mcast->mcast); ++ list_del(&mcast->list); ++ kfree(mcast); ++ } ++ list_for_each_entry_safe(query, next_query, &query_list, list) ++ if (query->ibp_client == client) { ++ ib_sa_cancel_query(query->id, query->query); ++ list_del(&query->list); ++ kfree(query); ++ } ++ list_for_each_entry_safe(sa, next_sa, &sa_entry_list, list) ++ if (sa->client == client) { ++ ib_sa_unregister_client(&sa->ib_client); ++ list_del(&sa->list); ++ kfree(sa); ++ } ++ up_write(&list_rwsem); ++ ++ destroy_workqueue(client->workqueue); ++ ++ 
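++	/* Release the rx/tx message pages allocated in ibp_create_client(). */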
free_page((uintptr_t)client->tx_buf); ++ free_page((uintptr_t)client->rx_buf); ++ ++ if (client->done) ++ complete(client->done); ++ else ++ scif_close(client->ep); ++ ++ kfree(client); ++} ++ ++static int ++(*ibp_msg_table[])(struct ibp_client *c, struct ibp_msg_header *h) = { ++ [IBP_SA_PATH_REC_GET] = ibp_cmd_sa_path_rec_get, ++ [IBP_SA_REGISTER_CLIENT] = ibp_cmd_sa_register_client, ++ [IBP_SA_UNREGISTER_CLIENT] = ibp_cmd_sa_unregister_client, ++ [IBP_SA_CANCEL_QUERY] = ibp_cmd_sa_cancel_query, ++ [IBP_INIT_AH_FROM_PATH] = ibp_cmd_init_ah_from_path, ++ [IBP_INIT_AH_FROM_MCMEMBER] = ibp_cmd_init_ah_from_mcmember, ++ [IBP_SA_JOIN_MCAST] = ibp_cmd_sa_join_multicast, ++ [IBP_SA_FREE_MCAST] = ibp_cmd_sa_free_multicast, ++ [IBP_SA_GET_MCMEMBER_REC] = ibp_cmd_sa_get_mcmember_rec, ++}; ++ ++int ibp_process_recvs(void *p) ++{ ++ struct ibp_client *client; ++ struct ibp_msg_header *hdr; ++ int ret; ++ ++ client = (struct ibp_client *) p; ++ hdr = (struct ibp_msg_header *) client->rx_buf; ++ ++ for (;;) { ++ ret = ibp_recv(client->ep, hdr, sizeof(*hdr)); ++ if (ret) ++ break; ++ ++ if (hdr->length > MAX_MSG_SIZE) { ++ print_err("message too large, len %u max %lu\n", ++ hdr->length, MAX_MSG_SIZE); ++ ret = -EMSGSIZE; ++ break; ++ } ++ ++ if (hdr->length > sizeof(*hdr)) { ++ ret = ibp_recv(client->ep, hdr->data, ++ hdr->length - sizeof(*hdr)); ++ if (ret) ++ break; ++ } ++ ++ if ((hdr->opcode >= ARRAY_SIZE(ibp_msg_table)) || ++ !ibp_msg_table[hdr->opcode]) { ++ ibp_cmd_bad_request(client, hdr); ++ continue; ++ } ++ ++ ret = ibp_msg_table[hdr->opcode](client, hdr); ++ if (ret) ++ break; ++ } ++ ++ ibp_sa_destroy_client(client); ++ ++ return ret; ++} diff --git a/tech-preview/xeon-phi/0008-Add-ibscif-to-the-Infiniband-HW-directory.patch b/tech-preview/xeon-phi/0008-Add-ibscif-to-the-Infiniband-HW-directory.patch new file mode 100644 index 0000000..b34ff02 --- /dev/null +++ b/tech-preview/xeon-phi/0008-Add-ibscif-to-the-Infiniband-HW-directory.patch @@ -0,0 +1,9132 @@ +From 674c5e41008346a8d68f534d408e240b152dec5e Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Wed, 28 May 2014 15:53:58 -0700 +Subject: [PATCH 08/13] Add ibscif to the Infiniband HW directory + +Signed-off-by: Phil Cayton +--- +diff -urN a7/drivers/infiniband/hw/scif/ibscif_ah.c a8/drivers/infiniband/hw/scif/ibscif_ah.c +--- a7/drivers/infiniband/hw/scif/ibscif_ah.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_ah.c 2015-02-23 10:14:37.482809663 -0800 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++struct ib_ah *ibscif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) ++{ ++ struct ibscif_ah *ah; ++ ++ ah = kzalloc(sizeof *ah, GFP_KERNEL); ++ if (!ah) ++ return ERR_PTR(-ENOMEM); ++ ++ ah->dlid = cpu_to_be16(attr->dlid); ++ ++ return &ah->ibah; ++} ++ ++int ibscif_destroy_ah(struct ib_ah *ibah) ++{ ++ kfree(to_ah(ibah)); ++ return 0; ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_cm.c a8/drivers/infiniband/hw/scif/ibscif_cm.c +--- a7/drivers/infiniband/hw/scif/ibscif_cm.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_cm.c 2015-02-23 10:14:37.482809663 -0800 +@@ -0,0 +1,515 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "ibscif_driver.h" ++ ++static LIST_HEAD(listen_list); ++DEFINE_SPINLOCK(listen_list_lock); ++ ++static int sockaddr_in_to_node_id( struct sockaddr_in addr ) ++{ ++ u8 *p = (u8 *)&addr.sin_addr.s_addr; ++ ++ if (p[0]==192 && p[1]==0 && p[2]==2 && p[3]>=100 && p[3]<100+IBSCIF_MAX_DEVICES) ++ return (int)(p[3]-100); ++ ++ else ++ return -EINVAL; ++} ++ ++static struct sockaddr_in node_id_to_sockaddr_in( int node_id ) ++{ ++ struct sockaddr_in addr; ++ u8 *p = (u8 *)&addr.sin_addr.s_addr; ++ ++ addr.sin_family = AF_INET; ++ addr.sin_addr.s_addr = 0; ++ addr.sin_port = 0; ++ ++ p[0] = 192; ++ p[1] = 0; ++ p[2] = 2; ++ p[3] = 100 + node_id; ++ ++ return addr; ++} ++ ++void free_cm(struct kref *kref) ++{ ++ struct ibscif_cm *cm_ctx; ++ cm_ctx = container_of(kref, struct ibscif_cm, kref); ++ if (cm_ctx->conn) ++ ibscif_put_conn(cm_ctx->conn); ++ kfree(cm_ctx); ++} ++ ++static inline void get_cm(struct ibscif_cm *cm_ctx) ++{ ++ kref_get(&cm_ctx->kref); ++} ++ ++static inline void put_cm(struct ibscif_cm *cm_ctx) ++{ ++ kref_put(&cm_ctx->kref, free_cm); ++} ++ ++void free_listen(struct kref *kref) ++{ ++ struct ibscif_listen *listen; ++ listen = container_of(kref, struct ibscif_listen, kref); ++ kfree(listen); ++} ++ ++static inline void get_listen(struct ibscif_listen *listen) ++{ ++ kref_get(&listen->kref); ++} ++ ++static inline void put_listen(struct ibscif_listen *listen) ++{ ++ kref_put(&listen->kref, free_listen); ++} ++ ++static int connect_qp(struct ibscif_cm *cm_ctx) ++{ ++ struct ibscif_qp *qp; ++ struct ib_qp_attr qp_attr; ++ int qp_attr_mask; ++ int err; ++ ++ qp = ibscif_get_qp(cm_ctx->qpn); ++ if (IS_ERR(qp)) { ++ printk(KERN_ERR PFX "%s: invalid QP number: %d\n", __func__, cm_ctx->qpn); ++ return -EINVAL; ++ } ++ ++ qp_attr_mask = IB_QP_STATE | ++ IB_QP_AV | ++ IB_QP_DEST_QPN | ++ IB_QP_ACCESS_FLAGS | ++ IB_QP_MAX_QP_RD_ATOMIC | ++ IB_QP_MAX_DEST_RD_ATOMIC; ++ ++ qp_attr.ah_attr.ah_flags = 0; ++ qp_attr.ah_attr.dlid = IBSCIF_NODE_ID_TO_LID(cm_ctx->remote_node_id); ++ qp_attr.dest_qp_num = cm_ctx->remote_qpn; ++ qp_attr.qp_state = IB_QPS_RTS; ++ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE | ++ IB_ACCESS_REMOTE_WRITE | ++ IB_ACCESS_REMOTE_READ | ++ IB_ACCESS_REMOTE_ATOMIC; ++ qp_attr.max_rd_atomic = 16; /* 8-bit value, don't use MAX_OR */ ++ qp_attr.max_dest_rd_atomic = 16;/* 8-bit value, don't use MAX_IR */ ++ ++ err = ib_modify_qp(&qp->ibqp, &qp_attr, qp_attr_mask); ++ ++ if (!err) { ++ qp->cm_context = cm_ctx; ++ get_cm(cm_ctx); ++ } ++ ++ ibscif_put_qp(qp); ++ ++ return err; ++} ++ ++static void event_connection_close(struct ibscif_cm *cm_ctx) ++{ ++ struct iw_cm_event event; ++ ++ memset(&event, 0, sizeof(event)); ++ event.event = IW_CM_EVENT_CLOSE; ++ event.status = -ECONNRESET; ++ if (cm_ctx->cm_id) { ++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event); ++ cm_ctx->cm_id->rem_ref(cm_ctx->cm_id); ++ cm_ctx->cm_id = NULL; ++ } ++} ++ ++static void event_connection_reply(struct ibscif_cm *cm_ctx, int status) ++{ ++ struct iw_cm_event event; ++ ++ memset(&event, 0, sizeof(event)); ++ event.event = IW_CM_EVENT_CONNECT_REPLY; ++ event.status = status; ++ event.local_addr = *(struct sockaddr_storage *) &cm_ctx->local_addr; ++ event.remote_addr = *(struct sockaddr_storage *) &cm_ctx->remote_addr; ++ ++ if ((status == 0) || (status == -ECONNREFUSED)) { ++ event.private_data_len = cm_ctx->plen; ++ event.private_data = cm_ctx->pdata; ++ } ++ if (cm_ctx->cm_id) { ++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event); ++ if (status == -ECONNREFUSED) { ++ 
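++			/* The peer refused the connection; drop the cm_id reference taken in ibscif_cm_connect(). */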
cm_ctx->cm_id->rem_ref(cm_ctx->cm_id); ++ cm_ctx->cm_id = NULL; ++ } ++ } ++} ++ ++static void event_connection_request(struct ibscif_cm *cm_ctx) ++{ ++ struct iw_cm_event event; ++ ++ memset(&event, 0, sizeof(event)); ++ event.event = IW_CM_EVENT_CONNECT_REQUEST; ++ event.local_addr = *(struct sockaddr_storage *) &cm_ctx->local_addr; ++ event.remote_addr = *(struct sockaddr_storage *) &cm_ctx->remote_addr; ++ event.private_data_len = cm_ctx->plen; ++ event.private_data = cm_ctx->pdata; ++ event.provider_data = cm_ctx; ++ event.ird = 16; ++ event.ord = 16; ++ ++ if (cm_ctx->listen) { ++ cm_ctx->listen->cm_id->event_handler( cm_ctx->listen->cm_id, &event); ++ put_listen(cm_ctx->listen); ++ cm_ctx->listen = NULL; ++ } ++} ++ ++static void event_connection_established( struct ibscif_cm *cm_ctx ) ++{ ++ struct iw_cm_event event; ++ ++ memset(&event, 0, sizeof(event)); ++ event.event = IW_CM_EVENT_ESTABLISHED; ++ event.ird = 16; ++ event.ord = 16; ++ if (cm_ctx->cm_id) { ++ cm_ctx->cm_id->event_handler(cm_ctx->cm_id, &event); ++ } ++} ++ ++void ibscif_cm_async_callback(void *cm_context) ++{ ++ struct ibscif_cm *cm_ctx = cm_context; ++ ++ if (cm_ctx) { ++ event_connection_close(cm_ctx); ++ put_cm(cm_ctx); ++ } ++} ++ ++int ibscif_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ++{ ++ struct ibscif_cm *cm_ctx; ++ struct sockaddr_in *local_addr = (struct sockaddr_in *) &cm_id->local_addr; ++ struct sockaddr_in *remote_addr = (struct sockaddr_in *) &cm_id->remote_addr; ++ int node_id; ++ int remote_node_id; ++ int err = 0; ++ ++ cm_ctx = kzalloc(sizeof *cm_ctx, GFP_KERNEL); ++ if (!cm_ctx) { ++ printk(KERN_ALERT PFX "%s: cannot allocate cm_ctx\n", __func__); ++ return -ENOMEM; ++ } ++ ++ kref_init(&cm_ctx->kref); /* refcnt <- 1 */ ++ spin_lock_init(&cm_ctx->lock); ++ ++ node_id = sockaddr_in_to_node_id(*local_addr); ++ remote_node_id = sockaddr_in_to_node_id(*remote_addr); ++ if (node_id<0 || remote_node_id<0) { ++ printk(KERN_ALERT PFX "%s: invalid address, local_addr=%8x, remote_addr=%8x, node_id=%d, remote_node_id=%d\n", ++ __func__, local_addr->sin_addr.s_addr, remote_addr->sin_addr.s_addr, ++ node_id, remote_node_id); ++ err = -EINVAL; ++ goto out_free; ++ } ++ ++ cm_ctx->conn = ibscif_get_conn( node_id, remote_node_id, 0 ); ++ if (!cm_ctx->conn) { ++ printk(KERN_ALERT PFX "%s: failed to get connection %d-->%d\n", __func__, node_id, remote_node_id); ++ err = -EINVAL; ++ goto out_free; ++ } ++ ++ cm_id->add_ref(cm_id); ++ cm_id->provider_data = cm_ctx; ++ ++ cm_ctx->cm_id = cm_id; ++ cm_ctx->node_id = node_id; ++ cm_ctx->remote_node_id = remote_node_id; ++ cm_ctx->local_addr = *local_addr; ++ cm_ctx->remote_addr = *remote_addr; ++ cm_ctx->qpn = conn_param->qpn; ++ cm_ctx->plen = conn_param->private_data_len; ++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) { ++ printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n", ++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE); ++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE; ++ } ++ if (cm_ctx->plen) ++ memcpy(cm_ctx->pdata, conn_param->private_data, cm_ctx->plen); ++ ++ err = ibscif_send_cm_req( cm_ctx ); ++ ++ return err; ++ ++out_free: ++ kfree(cm_ctx); ++ return err; ++} ++ ++int ibscif_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ++{ ++ struct ibscif_cm *cm_ctx = cm_id->provider_data; ++ int err = 0; ++ ++ cm_id->add_ref(cm_id); ++ cm_ctx->cm_id = cm_id; ++ cm_ctx->qpn = conn_param->qpn; ++ cm_ctx->plen = conn_param->private_data_len; ++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) { ++ 
printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n", ++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE); ++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE; ++ } ++ if (cm_ctx->plen) ++ memcpy(cm_ctx->pdata, conn_param->private_data, cm_ctx->plen); ++ ++ err = connect_qp( cm_ctx ); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: failed to modify QP into connected state\n", __func__); ++ goto err_out; ++ } ++ ++ err = ibscif_send_cm_rep( cm_ctx ); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: failed to send REP\n", __func__); ++ goto err_out; ++ } ++ ++ return 0; ++ ++err_out: ++ cm_id->rem_ref(cm_id); ++ cm_ctx->cm_id = NULL; ++ put_cm(cm_ctx); ++ return err; ++} ++ ++int ibscif_cm_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) ++{ ++ struct ibscif_cm *cm_ctx = cm_id->provider_data; ++ int err = 0; ++ ++ err = ibscif_send_cm_rej( cm_ctx, pdata, pdata_len ); ++ ++ put_cm(cm_ctx); ++ return err; ++} ++ ++int ibscif_cm_create_listen(struct iw_cm_id *cm_id, int backlog) ++{ ++ struct ibscif_listen *listen; ++ struct sockaddr_in *local_addr = (struct sockaddr_in *) &cm_id->local_addr; ++ ++ listen = kzalloc(sizeof *listen, GFP_KERNEL); ++ if (!listen) { ++ printk(KERN_ALERT PFX "%s: cannot allocate listen object\n", __func__); ++ return -ENOMEM; ++ } ++ ++ kref_init(&listen->kref); /* refcnt <- 1 */ ++ ++ listen->cm_id = cm_id; ++ listen->port = local_addr->sin_port; ++ cm_id->provider_data = listen; ++ cm_id->add_ref(cm_id); ++ ++ spin_lock_bh(&listen_list_lock); ++ list_add(&listen->entry, &listen_list); ++ spin_unlock_bh(&listen_list_lock); ++ ++ return 0; ++} ++ ++int ibscif_cm_destroy_listen(struct iw_cm_id *cm_id) ++{ ++ struct ibscif_listen *listen = cm_id->provider_data; ++ ++ spin_lock_bh(&listen_list_lock); ++ list_del(&listen->entry); ++ spin_unlock_bh(&listen_list_lock); ++ cm_id->rem_ref(cm_id); ++ put_listen(listen); ++ ++ return 0; ++} ++ ++/* similar to ibscif_get_qp(), but differs in: ++ * (1) use the "irqsave" version of the lock functions to avoid the ++ * kernel warnings about "local_bh_enable_ip"; ++ * (2) don't hold the reference on success; ++ * (3) return NULL instead of error code on failure. ++ */ ++struct ib_qp *ibscif_cm_get_qp(struct ib_device *ibdev, int qpn) ++{ ++ struct ibscif_qp *qp; ++ unsigned long flags; ++ ++ read_lock_irqsave(&wiremap_lock, flags); ++ qp = idr_find(&wiremap, qpn); ++ if (likely(qp) && unlikely(qp->magic != QP_MAGIC)) ++ qp = NULL; ++ read_unlock_irqrestore(&wiremap_lock,flags); ++ ++ return qp ? 
&qp->ibqp : NULL; ++} ++ ++void ibscif_cm_add_ref(struct ib_qp *ibqp) ++{ ++ struct ibscif_qp *qp; ++ ++ if (likely(ibqp)) { ++ qp = to_qp(ibqp); ++ kref_get(&qp->ref); ++ } ++} ++ ++void ibscif_cm_rem_ref(struct ib_qp *ibqp) ++{ ++ struct ibscif_qp *qp; ++ ++ if (likely(ibqp)) { ++ qp = to_qp(ibqp); ++ ibscif_put_qp(qp); ++ } ++} ++ ++int ibscif_process_cm_skb(struct sk_buff *skb, struct ibscif_conn *conn) ++{ ++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data; ++ struct ibscif_cm *cm_ctx; ++ struct ibscif_listen *listen; ++ int cmd, qpn, status, plen, err, port; ++ u64 req_ctx, rep_ctx; ++ ++ req_ctx = __be64_to_cpu(pdu->cm.req_ctx); ++ rep_ctx = __be64_to_cpu(pdu->cm.rep_ctx); ++ cmd = __be32_to_cpu(pdu->cm.cmd); ++ port = __be32_to_cpu(pdu->cm.port); ++ qpn = __be32_to_cpu(pdu->cm.qpn); ++ status = __be32_to_cpu(pdu->cm.status); ++ plen = __be32_to_cpu(pdu->cm.plen); ++ ++ switch (cmd) { ++ case IBSCIF_CM_REQ: ++ cm_ctx = kzalloc(sizeof *cm_ctx, GFP_KERNEL); ++ if (!cm_ctx) { ++ printk(KERN_ALERT PFX "%s: cannot allocate cm_ctx\n", __func__); ++ return -ENOMEM; ++ } ++ kref_init(&cm_ctx->kref); /* refcnt <- 1 */ ++ spin_lock_init(&cm_ctx->lock); ++ ++ spin_lock_bh(&listen_list_lock); ++ list_for_each_entry(listen, &listen_list, entry) { ++ if (listen->port == port) { ++ cm_ctx->listen = listen; ++ get_listen(listen); ++ } ++ } ++ spin_unlock_bh(&listen_list_lock); ++ ++ if (!cm_ctx->listen) { ++ printk(KERN_ALERT PFX "%s: no matching listener for connection request, port=%d\n", __func__, port); ++ put_cm(cm_ctx); ++ /* fix me: send CM_REJ */ ++ return -EINVAL; ++ } ++ ++ cm_ctx->cm_id = NULL; ++ cm_ctx->node_id = conn->dev->node_id; ++ cm_ctx->remote_node_id = conn->remote_node_id; ++ cm_ctx->local_addr = node_id_to_sockaddr_in(cm_ctx->node_id); ++ if (cm_ctx->listen) ++ cm_ctx->local_addr.sin_port = cm_ctx->listen->port; ++ cm_ctx->remote_addr = node_id_to_sockaddr_in(cm_ctx->remote_node_id); ++ cm_ctx->remote_qpn = qpn; ++ cm_ctx->plen = plen; ++ if (cm_ctx->plen > IBSCIF_MAX_PDATA_SIZE) { ++ printk(KERN_ALERT PFX "%s: plen (%d) exceeds the limit (%d), truncated.\n", ++ __func__, cm_ctx->plen, IBSCIF_MAX_PDATA_SIZE); ++ cm_ctx->plen = IBSCIF_MAX_PDATA_SIZE; ++ } ++ if (cm_ctx->plen) ++ memcpy(cm_ctx->pdata, pdu->cm.pdata, cm_ctx->plen); ++ ++ cm_ctx->peer_context = req_ctx; ++ cm_ctx->conn = conn; ++ atomic_inc(&conn->refcnt); ++ ++ event_connection_request(cm_ctx); ++ break; ++ ++ case IBSCIF_CM_REP: ++ cm_ctx = (struct ibscif_cm *)req_ctx; ++ cm_ctx->plen = plen; ++ memcpy(cm_ctx->pdata, pdu->cm.pdata, plen); ++ cm_ctx->remote_qpn = qpn; ++ cm_ctx->peer_context = rep_ctx; ++ err = connect_qp( cm_ctx ); ++ if (!err) ++ err = ibscif_send_cm_rtu(cm_ctx); ++ if (err) ++ printk(KERN_ALERT PFX "%s: failed to modify QP into connected state\n", __func__); ++ event_connection_reply(cm_ctx, err); ++ put_cm(cm_ctx); ++ break; ++ ++ case IBSCIF_CM_REJ: ++ cm_ctx = (struct ibscif_cm *)req_ctx; ++ cm_ctx->plen = plen; ++ memcpy(cm_ctx->pdata, pdu->cm.pdata, plen); ++ event_connection_reply(cm_ctx, status); ++ put_cm(cm_ctx); ++ break; ++ ++ case IBSCIF_CM_RTU: ++ cm_ctx = (struct ibscif_cm *)rep_ctx; ++ event_connection_established( cm_ctx ); ++ put_cm(cm_ctx); ++ break; ++ ++ default: ++ printk(KERN_ALERT PFX "%s: invalid CM cmd: %d\n", __func__, pdu->cm.cmd); ++ break; ++ } ++ ++ return 0; ++} ++ +diff -urN a7/drivers/infiniband/hw/scif/ibscif_cq.c a8/drivers/infiniband/hw/scif/ibscif_cq.c +--- a7/drivers/infiniband/hw/scif/ibscif_cq.c 1969-12-31 16:00:00.000000000 -0800 ++++ 
a8/drivers/infiniband/hw/scif/ibscif_cq.c 2015-02-23 10:14:37.483809663 -0800 +@@ -0,0 +1,313 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++static void ibscif_cq_tasklet(unsigned long cq_ptr) ++{ ++ struct ibscif_cq *cq = (struct ibscif_cq *)cq_ptr; ++ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); ++} ++ ++#ifdef MOFED ++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, ++ struct ib_ucontext *context, struct ib_udata *udata) ++#else ++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, int entries, int comp_vector, ++ struct ib_ucontext *context, struct ib_udata *udata) ++#endif ++{ ++ struct ibscif_dev *dev = to_dev(ibdev); ++ struct ibscif_cq *cq; ++ int nbytes, npages; ++ int err; ++#ifdef MOFED ++ int entries = attr->cqe; ++#endif ++ ++ if (entries < 1 || entries > MAX_CQ_SIZE) ++ return ERR_PTR(-EINVAL); ++ ++ if (!atomic_add_unless(&dev->cq_cnt, 1, MAX_CQS)) ++ return ERR_PTR(-EAGAIN); ++ ++ cq = kzalloc(sizeof *cq, GFP_KERNEL); ++ if (!cq) { ++ atomic_dec(&dev->cq_cnt); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ spin_lock_init(&cq->lock); ++ tasklet_init(&cq->tasklet, ibscif_cq_tasklet, (unsigned long)cq); ++ cq->state = CQ_READY; ++ ++ nbytes = PAGE_ALIGN(entries * sizeof *cq->wc); ++ npages = nbytes >> PAGE_SHIFT; ++ ++ err = ibscif_reserve_quota(&npages); ++ if (err) ++ goto out; ++ ++ cq->wc = vzalloc(nbytes); /* Consider using vmalloc_user */ ++ if (!cq->wc) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ cq->ibcq.cqe = nbytes / sizeof *cq->wc; ++ ++ return &cq->ibcq; ++out: ++ ibscif_destroy_cq(&cq->ibcq); ++ return ERR_PTR(err); ++} ++ ++int ibscif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) ++{ ++ struct ibscif_cq *cq = to_cq(ibcq); ++ struct ibscif_wc *old_wc, *new_wc; ++ int nbytes, old_npages, new_npages, i, err; ++ ++ if (cqe < 1 || cqe > MAX_CQ_SIZE) ++ return -EINVAL; ++ ++ nbytes = PAGE_ALIGN(cqe * sizeof *cq->wc); ++ new_npages = nbytes >> PAGE_SHIFT; ++ old_npages = PAGE_ALIGN(ibcq->cqe * sizeof *cq->wc) >> PAGE_SHIFT; ++ new_npages -= old_npages; ++ ++ if (new_npages == 0) ++ return 0; ++ ++ if (new_npages > 0) 
{ ++ err = ibscif_reserve_quota(&new_npages); ++ if (err) ++ return err; ++ } ++ ++ new_wc = vzalloc(nbytes); /* Consider using vmalloc_user */ ++ if (!new_wc) { ++ err = -ENOMEM; ++ goto out1; ++ } ++ cqe = nbytes / sizeof *cq->wc; ++ old_wc = cq->wc; ++ ++ spin_lock_bh(&cq->lock); ++ ++ if (cqe < cq->depth) { ++ err = -EBUSY; ++ goto out2; ++ } ++ ++ for (i = 0; i < cq->depth; i++) { ++ new_wc[i] = old_wc[cq->head]; ++ cq->head = (cq->head + 1) % ibcq->cqe; ++ } ++ ++ cq->wc = new_wc; ++ cq->head = 0; ++ cq->tail = cq->depth; ++ ibcq->cqe = cqe; ++ ++ spin_unlock_bh(&cq->lock); ++ ++ if (old_wc) ++ vfree(old_wc); ++ if (new_npages < 0) ++ ibscif_release_quota(-new_npages); ++ ++ return 0; ++out2: ++ spin_unlock_bh(&cq->lock); ++ vfree(new_wc); ++out1: ++ if (new_npages > 0) ++ ibscif_release_quota(new_npages); ++ return err; ++} ++ ++int ibscif_destroy_cq(struct ib_cq *ibcq) ++{ ++ struct ibscif_dev *dev = to_dev(ibcq->device); ++ struct ibscif_cq *cq = to_cq(ibcq); ++ ++ tasklet_kill(&cq->tasklet); ++ ++ if (cq->wc) ++ vfree(cq->wc); ++ ++ ibscif_release_quota(PAGE_ALIGN(ibcq->cqe * sizeof *cq->wc) >> PAGE_SHIFT); ++ ++ atomic_dec(&dev->cq_cnt); ++ ++ kfree(cq); ++ return 0; ++} ++ ++int ibscif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) ++{ ++ struct ibscif_cq *cq = to_cq(ibcq); ++ struct ibscif_wq *wq; ++ int i, reap; ++ ++ /* ++ * The protocol layer holds WQ lock while processing a packet and acquires ++ * the CQ lock to append a work completion. To avoid a deadly embrace, do ++ * not hold the CQ lock when adjusting the WQ reap count. ++ */ ++ for (i = 0; (i < num_entries) && cq->depth; i++) { ++ ++ spin_lock_bh(&cq->lock); ++ entry[i] = cq->wc[cq->head].ibwc; ++ reap = cq->wc[cq->head].reap; ++ cq->depth--; ++ wq = cq->wc[cq->head].wq; ++ cq->head = (cq->head + 1) % ibcq->cqe; ++ spin_unlock_bh(&cq->lock); ++ ++ /* WQ may no longer exist or has been flushed. */ ++ if (wq) { ++ spin_lock_bh(&wq->lock); ++ wq->head = (wq->head + reap) % wq->size; ++ wq->depth -= reap; ++ wq->completions -= reap; ++ spin_unlock_bh(&wq->lock); ++ } ++ } ++ ++ return i; ++} ++ ++int ibscif_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify) ++{ ++ struct ibscif_cq *cq = to_cq(ibcq); ++ int ret; ++ ++ spin_lock_bh(&cq->lock); ++ ++ cq->arm |= notify & IB_CQ_SOLICITED_MASK; ++ ++ if (notify & IB_CQ_SOLICITED) ++ cq->solicited = 0; ++ ++ ret = (notify & IB_CQ_REPORT_MISSED_EVENTS) && cq->depth; ++ ++ spin_unlock_bh(&cq->lock); ++ ++ return ret; ++} ++ ++void ibscif_notify_cq(struct ibscif_cq *cq) ++{ ++ if (!cq->arm || !cq->depth) ++ return; ++ ++ spin_lock_bh(&cq->lock); ++ if ((cq->arm & IB_CQ_NEXT_COMP) || ((cq->arm & IB_CQ_SOLICITED) && cq->solicited)) { ++ cq->arm = 0; /* Disarm the CQ */ ++ spin_unlock_bh(&cq->lock); ++ tasklet_hi_schedule(&cq->tasklet); ++ } else ++ spin_unlock_bh(&cq->lock); ++} ++ ++void ibscif_clear_cqes(struct ibscif_cq *cq, struct ibscif_wq *wq) ++{ ++ struct ibscif_wc *wc; ++ int i, j; ++ ++ if (!cq) ++ return; ++ ++ /* ++ * Walk the CQ work completions and clear pointers to the ++ * given WQ to prevent retiring WQEs when CQEs are polled. ++ */ ++ spin_lock_bh(&cq->lock); ++ j = cq->head; ++ for (i = 0; i < cq->depth; i++) { ++ wc = &cq->wc[j]; ++ if (wc->wq == wq) ++ wc->wq = NULL; ++ j = (j + 1) % cq->ibcq.cqe; ++ } ++ spin_unlock_bh(&cq->lock); ++} ++ ++/* ++ * Acquire lock and reserve a completion queue entry. ++ * Note that cq->lock is held upon successful completion of this call. 
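++ * The matching unlock happens in ibscif_append_cqe(), which appends the reserved entry and releases cq->lock.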
++ * On error, WQs affiliated with this CQ should generate an event and ++ * transition to the error state; refer to IB Spec r1.2 C11-39 and C11-40. ++ */ ++int ibscif_reserve_cqe(struct ibscif_cq *cq, struct ibscif_wc **wc) ++{ ++ spin_lock_bh(&cq->lock); ++ ++ if (cq->state != CQ_READY) { ++ spin_unlock_bh(&cq->lock); ++ return -EIO; ++ } ++ if (!cq->ibcq.cqe) { ++ spin_unlock_bh(&cq->lock); ++ return -ENOSPC; ++ } ++ if (cq->depth == cq->ibcq.cqe) { ++ cq->state = CQ_ERROR; ++ spin_unlock_bh(&cq->lock); ++ ++ if (cq->ibcq.event_handler) { ++ struct ib_event record; ++ record.event = IB_EVENT_CQ_ERR; ++ record.device = cq->ibcq.device; ++ record.element.cq = &cq->ibcq; ++ cq->ibcq.event_handler(&record, cq->ibcq.cq_context); ++ } ++ return -ENOBUFS; ++ } ++ ++ *wc = &cq->wc[cq->tail]; ++ ++ return 0; ++} ++ ++/* ++ * Append a completion queue entry and release lock. ++ * Note that this function assumes that the cq->lock is currently held. ++ */ ++void ibscif_append_cqe(struct ibscif_cq *cq, struct ibscif_wc *wc, int solicited) ++{ ++ cq->solicited = !!(solicited || (wc->ibwc.status != IB_WC_SUCCESS)); ++ cq->tail = (cq->tail + 1) % cq->ibcq.cqe; ++ cq->depth++; ++ ++ spin_unlock_bh(&cq->lock); ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_driver.h a8/drivers/infiniband/hw/scif/ibscif_driver.h +--- a7/drivers/infiniband/hw/scif/ibscif_driver.h 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_driver.h 2015-02-23 10:14:37.483809663 -0800 +@@ -0,0 +1,787 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef IBSCIF_DRIVER_H ++#define IBSCIF_DRIVER_H ++ ++#include ++#include /* for idr routines */ ++#include /* for kthread routines */ ++#include /* for kmap_atomic */ ++#include /* for TC_PRIO_CONTROL */ ++#include /* for ARPHRD_ETHER */ ++#include /* for totalram_pages */ ++#include /* for proc_mkdir */ ++#include /* for LINUX_VERSION_CODE */ ++#include ++#include ++#include ++ ++/* these macros are defined in "linux/semaphore.h". ++ * however, they may be missing on older systems. 
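++ * Both fall back to a semaphore initialized to a count of 1, i.e. a binary mutex.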
++ */ ++#ifndef DECLARE_MUTEX ++#define DECLARE_MUTEX(name) \ ++ struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1) ++#endif ++ ++#ifndef init_MUTEX ++#define init_MUTEX(sem) sema_init(sem, 1) ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,0) ++ #include ++ ++ #define KMAP_ATOMIC(x,y) kmap_atomic(x) ++ #define KUNMAP_ATOMIC(x,y) kunmap_atomic(x) ++#else ++ #define KMAP_ATOMIC(x,y) kmap_atomic(x, y) ++ #define KUNMAP_ATOMIC(x,y) kunmap_atomic(x, y) ++#endif ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include "ibscif_protocol.h" ++ ++#define IBSCIF_MTU 4096 ++ ++#define IBSCIF_EP_TYPE_LISTEN 0 ++#define IBSCIF_EP_TYPE_COMM 1 ++ ++#define DRV_NAME "ibscif" ++#define PFX DRV_NAME ": " ++#define IBDEV_PFX DRV_NAME "" ++#define DRV_DESC "OpenFabrics IBSCIF Driver" ++#define DRV_VERSION "0.1" ++#define DRV_SIGNON DRV_DESC " v" DRV_VERSION ++#define DRV_BUILD " built " __DATE__ " " __TIME__ ++ ++#define UVERBS_ABI_VER 6 ++#define VENDOR_ID 0x8086 /* Intel Corporation */ ++#define DEVICE_ID 0 ++#define HW_REV 1 ++#define FW_REV IBSCIF_PROTOCOL_VER ++ ++/* ++ * Attribute limits. ++ * These limits are imposed on client requests, however, the actual values ++ * returned may be larger than these limits on some objects due to rounding. ++ * The definitions are intended to show the thinking behind the values. ++ * E.g., MAX_PDS defined as MAX_QPS is intended to allow each QP to be ++ * on a separate PD, although that is not a usage requirement. ++ */ ++#define MAX_QPS (64 * 1024) ++#define MAX_QP_SIZE (16 * 1024) ++#define MAX_CQS (MAX_QPS * 2) /* x2:send queues + recv queues */ ++#define MAX_CQ_SIZE (MAX_QP_SIZE * 4) /* or combined */ ++#define MAX_PDS MAX_QPS /* 1 per QP */ ++#if 0 ++#define MAX_MRS (MAX_QPS * 4) /* x4:local/remote,read/write */ ++#else ++#define MAX_MRS 16383 /* limited by IBSCIF_MR_MAX_KEY */ ++#endif ++#define MAX_MR_SIZE (2U * 1024 * 1024 * 1024) ++#define MAX_SGES (PAGE_SIZE / sizeof(struct ib_sge)) ++#define MAX_OR (MAX_QP_SIZE / 2) /* half outbound reqs */ ++#define MAX_IR MAX_OR /* balance inbound with outbound */ ++ ++extern int window_size; ++#define MIN_WINDOW_SIZE 4 /* Ack every window_size/MIN_WINDOW_SIZE packets */ ++ ++extern int rma_threshold; ++extern int fast_rdma; ++extern int blocking_send; ++extern int blocking_recv; ++extern int scif_loopback; ++extern int host_proxy; ++extern int new_ib_type; ++extern int verbose; ++extern int check_grh; ++ ++extern struct list_head devlist; ++extern struct semaphore devlist_mutex; ++ ++extern struct idr wiremap; ++extern rwlock_t wiremap_lock; ++ ++extern struct ib_dma_mapping_ops ibscif_dma_mapping_ops; ++ ++/* Match IB opcodes for copy in post_send; append driver specific values. 
*/ ++enum ibscif_wr_opcode { ++ WR_SEND = IB_WR_SEND, ++ WR_SEND_WITH_IMM = IB_WR_SEND_WITH_IMM, ++ WR_RDMA_WRITE = IB_WR_RDMA_WRITE, ++ WR_RDMA_WRITE_WITH_IMM = IB_WR_RDMA_WRITE_WITH_IMM, ++ WR_RDMA_READ = IB_WR_RDMA_READ, ++ WR_ATOMIC_CMP_AND_SWP = IB_WR_ATOMIC_CMP_AND_SWP, ++ WR_ATOMIC_FETCH_AND_ADD = IB_WR_ATOMIC_FETCH_AND_ADD, ++ WR_RDMA_READ_RSP, ++ WR_ATOMIC_RSP, ++ WR_RMA_RSP, ++ WR_UD, ++ NR_WR_OPCODES /* Must be last (for stats) */ ++}; ++ ++struct ibscif_stats { ++ unsigned long packets_sent; ++ unsigned long packets_rcvd; ++ unsigned long bytes_sent; ++ unsigned long bytes_rcvd; ++ unsigned long duplicates; ++ unsigned long tx_errors; ++ unsigned long sched_exhaust; ++ unsigned long unavailable; ++ unsigned long loopback; ++ unsigned long recv; ++ unsigned long recv_imm; ++ unsigned long wr_opcode[NR_WR_OPCODES]; ++ unsigned long fast_rdma_write; ++ unsigned long fast_rdma_read; ++ unsigned long fast_rdma_unavailable; ++ unsigned long fast_rdma_fallback; ++ unsigned long fast_rdma_force_ack; ++ unsigned long fast_rdma_tail_write; ++}; ++ ++#define DEV_STAT(dev, counter) dev->stats.counter ++ ++#define IBSCIF_MAX_DEVICES 16 ++#define IBSCIF_NAME_SIZE 12 ++ ++#define IBSCIF_NODE_ID_TO_LID(node_id) (node_id+1000) ++#define IBSCIF_LID_TO_NODE_ID(lid) (lid-1000) ++ ++struct ibscif_conn { ++ struct list_head entry; ++ atomic_t refcnt; ++ scif_epd_t ep; ++ unsigned short remote_node_id; ++ union ib_gid remote_gid; ++ struct ibscif_dev *dev; ++ int local_close; ++ int remote_close; ++}; ++ ++struct ibscif_listen { ++ struct iw_cm_id *cm_id; ++ struct list_head entry; ++ struct kref kref; ++ int port; ++}; ++ ++#define IBSCIF_MAX_PDATA_SIZE 256 ++struct ibscif_cm { ++ struct iw_cm_id *cm_id; ++ struct ibscif_conn *conn; ++ struct ibscif_listen *listen; ++ struct kref kref; ++ spinlock_t lock; ++ struct sockaddr_in local_addr; ++ struct sockaddr_in remote_addr; ++ unsigned short node_id; ++ unsigned short remote_node_id; ++ u32 qpn; ++ u32 remote_qpn; ++ int plen; ++ u8 pdata[IBSCIF_MAX_PDATA_SIZE]; ++ u64 peer_context; ++}; ++ ++struct ibscif_dev { ++ struct ib_device ibdev; ++ struct net_device *netdev; /* for RDMA CM support */ ++ struct list_head entry; ++ ++ char name[IBSCIF_NAME_SIZE]; ++ union ib_gid gid; ++ unsigned short node_id; ++ atomic_t refcnt; ++ scif_epd_t listen_ep; ++ struct list_head conn_list; ++ struct list_head mr_list; ++ struct semaphore mr_list_mutex; ++ ++ struct proc_dir_entry *procfs; ++ struct ibscif_stats stats; ++ ++ atomic_t pd_cnt; ++ atomic_t cq_cnt; ++ atomic_t qp_cnt; ++ atomic_t mr_cnt; ++ ++ atomic_t available; ++ atomic_t was_new; ++ ++ spinlock_t atomic_op; ++ ++ struct semaphore mutex; ++ struct list_head wq_list; /* List of WQ's on this device */ ++}; ++ ++struct ibscif_pd { ++ struct ib_pd ibpd; ++}; ++ ++struct ibscif_ah { ++ struct ib_ah ibah; ++ __be16 dlid; ++}; ++ ++struct ibscif_wc { ++ struct ib_wc ibwc; ++ int reap; ++ struct ibscif_wq *wq; ++}; ++ ++enum ibscif_cq_state { ++ CQ_READY, ++ CQ_ERROR ++}; ++ ++struct ibscif_cq { ++ struct ib_cq ibcq; ++ spinlock_t lock; ++ struct tasklet_struct tasklet; ++ enum ibscif_cq_state state; ++ enum ib_cq_notify_flags arm; ++ int solicited; ++ int head; ++ int tail; ++ int depth; ++ struct ibscif_wc *wc; ++}; ++ ++struct ibscif_ds { ++ struct ibscif_mr *mr; ++ u32 offset; ++ u32 length; ++ u32 lkey; ++ u32 in_use; ++ struct ibscif_mreg_info *current_mreg; ++}; ++ ++struct ibscif_segmentation { ++ struct ibscif_ds *current_ds; ++ u32 current_page_index; ++ u32 current_page_offset; ++ u32 
wr_length_remaining; ++ u32 ds_length_remaining; ++ u32 starting_seq; ++ u32 next_seq; ++ u32 ending_seq; ++}; ++ ++struct ibscif_reassembly { ++ struct ibscif_ds *current_ds; ++ u32 current_ds_offset; ++ u32 last_packet_seq; ++ u32 last_seen_seq; ++ __be32 immediate_data; ++ int final_length; ++ u16 opcode; ++}; ++ ++struct ibscif_sar { ++ struct ibscif_segmentation seg; ++ struct ibscif_reassembly rea; ++}; ++ ++enum ibscif_wr_state { ++ WR_WAITING, ++ WR_STARTED, ++ WR_WAITING_FOR_ACK, ++ WR_WAITING_FOR_RSP, ++ WR_LAST_SEEN, ++ WR_COMPLETED ++}; ++ ++struct ibscif_wr { ++ u64 id; ++ enum ibscif_wr_opcode opcode; ++ int length; ++ enum ib_send_flags flags; ++ ++ u32 msg_id; ++ enum ibscif_wr_state state; ++ struct ibscif_sar sar; ++ u32 use_rma; ++ u32 rma_id; ++ ++ union { ++ struct ibscif_send { ++ u32 immediate_data; ++ } send; ++ ++ struct ibscif_ud { ++ u16 remote_node_id; ++ u32 remote_qpn; ++ } ud; ++ ++ struct ibscif_read { ++ u64 remote_address; ++ int remote_length; ++ u32 rkey; ++ } read; ++ ++ struct ibscif_write { ++ u64 remote_address; ++ u32 rkey; ++ u32 immediate_data; ++ } write; ++ ++ struct ibscif_cmp_swp { ++ u64 cmp_operand; ++ u64 swp_operand; ++ u64 remote_address; ++ u32 rkey; ++ } cmp_swp; ++ ++ struct ibscif_fetch_add { ++ u64 add_operand; ++ u64 remote_address; ++ u32 rkey; ++ } fetch_add; ++ ++ struct ibscif_atomic_rsp { ++ u64 orig_data; ++ u16 opcode; ++ } atomic_rsp; ++ ++ struct ibscif_rma_rsp { ++ u32 xfer_length; ++ u32 error; ++ } rma_rsp; ++ }; ++ ++ u32 num_ds; ++ struct ibscif_ds ds_list[0]; /* Must be last */ ++}; ++ ++struct ibscif_tx_state { ++ u32 next_seq; ++ u32 last_ack_seq_recvd; ++ u32 next_msg_id; ++}; ++ ++struct ibscif_rx_state { ++ u32 last_in_seq; ++ u32 last_seq_acked; ++ int defer_in_process; ++}; ++ ++struct ibscif_wirestate { ++ struct ibscif_tx_state tx; ++ struct ibscif_rx_state rx; ++}; ++ ++struct ibscif_wire { ++ struct ibscif_wirestate sq; ++ struct ibscif_wirestate iq; ++}; ++ ++struct ibscif_wq { ++ struct list_head entry; ++ struct ibscif_qp *qp; ++ spinlock_t lock; ++ struct ibscif_wr *wr; ++ int head; ++ int tail; ++ int depth; ++ int size; ++ int max_sge; ++ int wr_size; ++ int completions; ++ int reap; ++ int next_wr; ++ int next_msg_id; ++ struct ibscif_wirestate *wirestate; ++ int fast_rdma_completions; ++ int ud_msg_id; ++}; ++ ++enum ibscif_qp_state { ++ QP_IDLE, ++ QP_CONNECTED, ++ QP_DISCONNECT, ++ QP_ERROR, ++ QP_RESET, ++ QP_IGNORE, ++ NR_QP_STATES /* Must be last */ ++}; ++ ++enum ibscif_schedule { ++ SCHEDULE_RESUME = 1 << 0, ++ SCHEDULE_RETRY = 1 << 1, ++ SCHEDULE_TIMEOUT = 1 << 2, ++ SCHEDULE_SQ = 1 << 6, ++ SCHEDULE_IQ = 1 << 7 ++}; ++ ++struct ibscif_qp { ++ int magic; /* Must be first */ ++# define QP_MAGIC 0x5b51505d /* "[QP]" */ ++ struct kref ref; ++ struct completion done; ++ struct ib_qp ibqp; ++ struct ibscif_dev *dev; ++ enum ib_access_flags access; ++ enum ib_sig_type sq_policy; ++ enum ibscif_schedule schedule; ++ struct ibscif_wire wire; ++ int mtu; ++ ++ int max_or; ++ atomic_t or_depth; ++ atomic_t or_posted; ++ ++ struct semaphore modify_mutex; ++ spinlock_t lock; ++ enum ibscif_qp_state state; ++ u16 local_node_id; ++ u16 remote_node_id; ++ struct ibscif_conn *conn; ++ u32 remote_qpn; ++ int loopback; ++ struct ibscif_wq sq; ++ struct ibscif_wq rq; ++ struct ibscif_wq iq; ++ int in_scheduler; ++ ++ struct ibscif_conn *ud_conn[IBSCIF_MAX_DEVICES]; ++ struct ibscif_cm *cm_context; ++}; ++ ++#define is_sq(wq) (wq == &wq->qp->sq) ++#define is_rq(wq) (wq == &wq->qp->rq) ++#define is_iq(wq) (wq == 
&wq->qp->iq) ++ ++/* Info about MR registered via SCIF API */ ++struct ibscif_mreg_info { ++ struct list_head entry; ++ struct ibscif_conn *conn; ++ u64 offset; ++ u64 aligned_offset; ++ u32 aligned_length; ++}; ++ ++struct ibscif_mr { ++ int magic; /* Must be first */ ++# define MR_MAGIC 0x5b4d525d /* "[MR]" */ ++ struct list_head entry; ++ struct kref ref; ++ struct completion done; ++ struct ib_mr ibmr; ++ struct ib_umem *umem; ++ enum ib_access_flags access; ++ u64 addr; ++ u32 length; ++ int npages; ++ struct page **page; ++ scif_pinned_pages_t pinned_pages; ++ struct list_head mreg_list; ++}; ++ ++/* Canonical virtual address on X86_64 falls in the range 0x0000000000000000-0x00007fffffffffff ++ * and 0xffff800000000000-0xffffffffffffffff. The range 0x0000800000000000-0xffff7fffffffffff ++ * are unused. This basically means only 48 bits are used and the highest 16 bits are just sign ++ * extensions. We can put rkey into these 16 bits and use the result as the "offset" of SCIF's ++ * registered address space. By doing this, the SCIF_MAP_FIXED flag can be used so that the offset ++ * can be calculated directly from rkey and virtual address w/o using the "remote registration cache" ++ * mechanism. ++ * ++ * SCIF reserve the top 2 bits of the offset for internal uses, leaving 14 bits for rkey. ++ */ ++#define IBSCIF_MR_MAX_KEY (0x3FFF) ++#define IBSCIF_MR_VADDR_MASK (0x0000FFFFFFFFFFFFUL) ++#define IBSCIF_MR_SIGN_MASK (0x0000800000000000UL) ++#define IBSCIF_MR_SIGN_EXT (0xFFFF000000000000UL) ++#define IBSCIF_MR_RKEY_MASK (0x3FFF000000000000UL) ++ ++#define IBSCIF_MR_VADDR_TO_OFFSET(rkey, vaddr) ((((unsigned long)rkey) << 48) | \ ++ (vaddr & IBSCIF_MR_VADDR_MASK)) ++ ++#define IBSCIF_MR_OFFSET_TO_VADDR(offset) ((offset & IBSCIF_MR_SIGN_MASK) ? \ ++ (offset | IBSCIF_MR_SIGN_EXT) : \ ++ (offset & IBSCIF_MR_VADDR_MASK)) ++ ++#define IBSCIF_MR_OFFSET_TO_RKEY(offset) ((offset & IBSCIF_MR_RKEY_MASK) >> 48) ++ ++#define TO_OBJ(name, src, dst, field) \ ++static inline struct dst *name(struct src *field) \ ++{ \ ++ return container_of(field, struct dst, field); \ ++} ++TO_OBJ(to_dev, ib_device, ibscif_dev, ibdev) ++TO_OBJ(to_pd, ib_pd, ibscif_pd, ibpd) ++TO_OBJ(to_cq, ib_cq, ibscif_cq, ibcq) ++TO_OBJ(to_qp, ib_qp, ibscif_qp, ibqp) ++TO_OBJ(to_mr, ib_mr, ibscif_mr, ibmr) ++TO_OBJ(to_ah, ib_ah, ibscif_ah, ibah) ++ ++#define OBJ_GET(obj, type) \ ++static inline struct ibscif_##obj *ibscif_get_##obj(int id) \ ++{ \ ++ struct ibscif_##obj *obj; \ ++ read_lock_bh(&wiremap_lock); \ ++ obj = idr_find(&wiremap, id); \ ++ if (likely(obj)) { \ ++ if (likely(obj->magic == type)) \ ++ kref_get(&obj->ref); \ ++ else \ ++ obj = ERR_PTR(-ENXIO); \ ++ } else \ ++ obj = ERR_PTR(-ENOENT); \ ++ read_unlock_bh(&wiremap_lock); \ ++ return obj; \ ++} ++OBJ_GET(mr, MR_MAGIC) ++OBJ_GET(qp, QP_MAGIC) ++ ++void ibscif_complete_mr(struct kref *kref); ++void ibscif_complete_qp(struct kref *kref); ++ ++#define OBJ_PUT(obj) \ ++static inline void ibscif_put_##obj(struct ibscif_##obj *obj) \ ++{ \ ++ if (likely(obj)) \ ++ kref_put(&obj->ref, ibscif_complete_##obj); \ ++} ++OBJ_PUT(mr) ++OBJ_PUT(qp) ++ ++#define RHEL61_AND_ABOVE 0 ++#if defined(RHEL_MAJOR) && defined(RHEL_MINOR) ++#if (RHEL_MAJOR==6) && (RHEL_MINOR>0) ++#undef RHEL61_AND_ABOVE ++#define RHEL61_AND_ABOVE 1 ++#endif ++#endif ++ ++#if (LINUX_VERSION_CODEwr + (wq->wr_size * index)); ++} ++ ++/* This function assumes the WQ is protected by a lock. 
*/ ++static inline void ibscif_append_wq(struct ibscif_wq *wq) ++{ ++ wq->tail = (wq->tail + 1) % wq->size; ++ wq->depth++; ++ wq->next_msg_id++; ++} ++ ++static inline void ibscif_clear_ds_ref(struct ibscif_ds *ds) ++{ ++ if (ds->in_use) { ++ ds->in_use = 0; ++ ibscif_put_mr(ds->mr); ++ } ++} ++ ++static inline void ibscif_clear_ds_refs(struct ibscif_ds *ds, int num_ds) ++{ ++ while(num_ds--) ++ ibscif_clear_ds_ref(ds++); ++} ++ ++static inline enum ib_wc_opcode to_ib_wc_opcode(enum ib_wr_opcode opcode) ++{ ++ /* SQ only - RQ is either IB_WC_RECV or IB_WC_RECV_RDMA_WITH_IMM. */ ++ switch (opcode) { ++ case IB_WR_RDMA_WRITE: return IB_WC_RDMA_WRITE; ++ case IB_WR_RDMA_WRITE_WITH_IMM: return IB_WC_RDMA_WRITE; ++ case IB_WR_SEND: return IB_WC_SEND; ++ case IB_WR_SEND_WITH_IMM: return IB_WC_SEND; ++ case IB_WR_RDMA_READ: return IB_WC_RDMA_READ; ++ case IB_WR_ATOMIC_CMP_AND_SWP: return IB_WC_COMP_SWAP; ++ case IB_WR_ATOMIC_FETCH_AND_ADD: return IB_WC_FETCH_ADD; ++ default: return -1; ++ } ++} ++ ++static inline void *ibscif_map_src(struct page *page) ++{ ++ return KMAP_ATOMIC(page, KM_SOFTIRQ0); ++} ++ ++static inline void *ibscif_map_dst(struct page *page) ++{ ++ return KMAP_ATOMIC(page, KM_SOFTIRQ1); ++} ++ ++static inline void ibscif_unmap_src(struct page *page, void *addr) ++{ ++ if (likely(addr)) ++ KUNMAP_ATOMIC(addr, KM_SOFTIRQ0); ++} ++ ++static inline void ibscif_unmap_dst(struct page *page, void *addr) ++{ ++ if (likely(addr)) ++ KUNMAP_ATOMIC(addr, KM_SOFTIRQ1); ++ if (likely(page)) { ++ flush_dcache_page(page); ++ if (!PageReserved(page)) ++ set_page_dirty(page); ++ } ++} ++ ++#ifdef IBSCIF_PERF_TEST ++#define IBSCIF_PERF_SAMPLE(counter,next) ibscif_perf_sample(counter,next) ++#else ++#define IBSCIF_PERF_SAMPLE(counter,next) ++#endif ++ ++int ibscif_atomic_copy(void *dst_addr, void *src_addr, u32 copy_len, int head_copied); ++ ++int ibscif_wiremap_add(void *obj, int *id); ++void ibscif_wiremap_del(int id); ++ ++int ibscif_dev_init(void); ++void ibscif_protocol_init_pre(void); ++void ibscif_protocol_init_post(void); ++ ++void ibscif_dev_cleanup(void); ++void ibscif_protocol_cleanup(void); ++ ++int ibscif_procfs_add_dev(struct ibscif_dev *dev); ++void ibscif_procfs_remove_dev(struct ibscif_dev *dev); ++ ++int ibscif_reserve_quota(int *npages); ++void ibscif_release_quota(int npages); ++ ++void ibscif_scheduler_add_qp(struct ibscif_qp *qp); ++void ibscif_scheduler_remove_qp(struct ibscif_qp *qp); ++void ibscif_schedule(struct ibscif_wq *wq); ++ ++struct ib_ah *ibscif_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr); ++int ibscif_destroy_ah(struct ib_ah *ibah); ++ ++struct ib_pd *ibscif_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata); ++int ibscif_dealloc_pd(struct ib_pd *ibpd); ++ ++struct ib_qp *ibscif_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata); ++int ibscif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); ++int ibscif_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); ++int ibscif_destroy_qp(struct ib_qp *ibqp); ++void ibscif_qp_internal_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason); ++void ibscif_qp_remote_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason); ++void ibscif_qp_add_ud_conn(struct ibscif_qp *qp, struct ibscif_conn *conn); ++ ++#ifdef MOFED ++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, ++ struct ib_ucontext *context, struct 
ib_udata *udata); ++#else ++struct ib_cq *ibscif_create_cq(struct ib_device *ibdev, int entries, int comp_vector, ++ struct ib_ucontext *context, struct ib_udata *udata); ++#endif ++int ibscif_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); ++int ibscif_destroy_cq(struct ib_cq *ibcq); ++int ibscif_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); ++int ibscif_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify); ++void ibscif_notify_cq(struct ibscif_cq *cq); ++void ibscif_clear_cqes(struct ibscif_cq *cq, struct ibscif_wq *wq); ++int ibscif_reserve_cqe(struct ibscif_cq *cq, struct ibscif_wc **wc); ++void ibscif_append_cqe(struct ibscif_cq *cq, struct ibscif_wc *wc, int solicited); ++ ++struct ib_mr *ibscif_get_dma_mr(struct ib_pd *ibpd, int access); ++struct ib_mr *ibscif_reg_phys_mr(struct ib_pd *ibpd, struct ib_phys_buf *phys_buf_array, ++ int num_phys_buf, int access, u64 *iova_start); ++#ifdef MOFED ++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, ++ u64 virt_addr, int access, struct ib_udata *udata, int mr_id); ++#else ++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, ++ u64 virt_addr, int access, struct ib_udata *udata); ++#endif ++int ibscif_dereg_mr(struct ib_mr *ibmr); ++struct ibscif_mr *ibscif_validate_mr(u32 key, u64 addr, int length, ++ struct ib_pd *ibpd, enum ib_access_flags access); ++struct ibscif_mreg_info *ibscif_mr_get_mreg(struct ibscif_mr *mr, struct ibscif_conn *conn); ++void ibscif_refresh_mreg( struct ibscif_conn *conn ); ++ ++int ibscif_post_send(struct ib_qp *ibqp, struct ib_send_wr *ibwr, struct ib_send_wr **bad_wr); ++int ibscif_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ibwr, struct ib_recv_wr **bad_wr); ++ ++void ibscif_send_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason); ++void ibscif_send_close(struct ibscif_conn *conn); ++void ibscif_send_reopen(struct ibscif_conn *conn); ++ ++void ibscif_loopback_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason); ++void ibscif_loopback(struct ibscif_wq *sq); ++ ++int ibscif_xmit_wr(struct ibscif_wq *wq, struct ibscif_wr *wr, int tx_limit, int retransmit, ++ u32 from_seq, u32 *posted); ++int ibscif_process_sq_completions(struct ibscif_qp *qp); ++ ++struct ibscif_conn *ibscif_get_conn( int node_id, int remote_node_id, int find_local_peer ); ++void ibscif_put_conn( struct ibscif_conn *conn ); ++void ibscif_do_accept(struct ibscif_dev *dev); ++void ibscif_get_pollep_list(struct scif_pollepd *polleps, struct ibscif_dev **devs, ++ int *types, struct ibscif_conn **conns, int *count); ++void ibscif_refresh_pollep_list(void); ++void ibscif_get_ep_list(scif_epd_t *eps, int *count); ++void ibscif_remove_ep(struct ibscif_dev *dev, scif_epd_t ep); ++void ibscif_free_conn(struct ibscif_conn *conn); ++int ibscif_cleanup_idle_conn( void ); ++void ibscif_perf_sample(int counter, int next); ++ ++int ibscif_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); ++int ibscif_cm_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); ++int ibscif_cm_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); ++int ibscif_cm_create_listen(struct iw_cm_id *cm_id, int backlog); ++int ibscif_cm_destroy_listen(struct iw_cm_id *cm_id); ++struct ib_qp *ibscif_cm_get_qp(struct ib_device *ibdev, int qpn); ++void ibscif_cm_add_ref(struct ib_qp *ibqp); ++void ibscif_cm_rem_ref(struct ib_qp *ibqp); ++void ibscif_cm_async_callback(void *cm_context); ++int ibscif_process_cm_skb(struct sk_buff 
*skb, struct ibscif_conn *conn); ++int ibscif_send_cm_req(struct ibscif_cm *cm_ctx); ++int ibscif_send_cm_rep(struct ibscif_cm *cm_ctx); ++int ibscif_send_cm_rej(struct ibscif_cm *cm_ctx, const void *pdata, u8 plen); ++int ibscif_send_cm_rtu(struct ibscif_cm *cm_ctx); ++ ++#endif /* IBSCIF_DRIVER_H */ +diff -urN a7/drivers/infiniband/hw/scif/ibscif_loopback.c a8/drivers/infiniband/hw/scif/ibscif_loopback.c +--- a7/drivers/infiniband/hw/scif/ibscif_loopback.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_loopback.c 2015-02-23 10:14:37.484809663 -0800 +@@ -0,0 +1,582 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "ibscif_driver.h" ++ ++struct ibscif_seg { ++ enum ib_access_flags access; ++ struct ibscif_ds *ds; ++ struct ibscif_mr *mr; ++ struct page **page; ++ void *addr; ++ u32 offset; ++ u32 ds_len; ++ u32 pg_len; ++ void *(*map)(struct page *page); ++ void (*unmap)(struct page *page, void *addr); ++}; ++ ++static void ibscif_seg_init(struct ibscif_seg *seg, struct ibscif_ds *ds, ++ void *(*map)(struct page *page), void (*unmap)(struct page *page, void *addr), ++ enum ib_access_flags access) ++{ ++ memset(seg, 0, sizeof *seg); ++ seg->ds = ds; ++ seg->map = map; ++ seg->unmap = unmap; ++ seg->access = access; ++} ++ ++static void ibscif_seg_fini(struct ibscif_seg *seg) ++{ ++ seg->unmap(*seg->page, seg->addr); ++ if (likely(seg->mr)) ++ ibscif_put_mr(seg->mr); ++} ++ ++static int ibscif_seg_set(struct ibscif_seg *seg, u32 length, u32 copy_len) ++{ ++ struct page **prev_page; ++ ++ if (!seg->ds_len) { ++ ++ if (seg->mr) ++ ibscif_put_mr(seg->mr); ++ ++ seg->mr = ibscif_get_mr(seg->ds->lkey); ++ if (unlikely(IS_ERR(seg->mr))) ++ return PTR_ERR(seg->mr); ++ ++ if (unlikely(seg->access && !(seg->mr->access & seg->access))) ++ return -EACCES; ++ ++ prev_page = seg->page; ++ seg->offset = seg->ds->offset + (seg->mr->addr & ~PAGE_MASK); ++ seg->page = &seg->mr->page[seg->offset >> PAGE_SHIFT]; ++ seg->offset &= ~PAGE_MASK; ++ seg->ds_len = seg->ds->length; ++ seg->pg_len = min(seg->ds_len, (u32)PAGE_SIZE - seg->offset); ++ seg->pg_len = min(seg->pg_len, length); ++ ++ if (seg->page != prev_page) ++ seg->addr = seg->map(*seg->page) + seg->offset; ++ ++ seg->ds++; ++ ++ } else if (!seg->pg_len) { ++ ++ seg->unmap(*seg->page, seg->addr); ++ ++ seg->page++; ++ seg->addr = seg->map(*seg->page); ++ seg->pg_len = min(seg->ds_len, (u32)PAGE_SIZE); ++ seg->pg_len = min(seg->pg_len, length); ++ } else ++ seg->addr += copy_len; ++ ++ return 0; ++} ++ ++static inline int ibscif_seg_copy(struct ibscif_seg *dst, struct ibscif_seg *src, u32 length, int head_copied) ++{ ++ src->ds_len -= length; ++ src->pg_len -= length; ++ ++ dst->ds_len -= length; ++ dst->pg_len -= length; ++ ++ return ibscif_atomic_copy(dst->addr, src->addr, length, head_copied); ++} ++ ++/* ++ * Copy data from the source to the destination data segment list. ++ * This is a bit complicated since we must map and copy each page ++ * individually and because each data segment can be split across ++ * multiple pages within the memory region as illustrated below: ++ * ++ * +---page---+ +---page---+ +---page---+ ++ * | .~~mr~~~|~~~|~~~~~~~~~~|~~~|~~~~~~. | ++ * | | | | [==ds===|===|====] | | ++ * | '~~~~~~~|~~~|~~~~~~~~~~|~~~|~~~~~~' | ++ * +----------+ +----------+ +----------+ ++ * ++ * For example, due to different buffer page offsets, copying data ++ * between the following buffers will result in five separate copy ++ * operations as shown by the numeric labels below: ++ * ++ * +----------+ +----------+ ++ * | | | | ++ * |1111111111| | | ++ * |2222222222| |1111111111| ++ * +----------+ +----------+ ++ * ++ * +----------+ +----------+ ++ * |3333333333| |2222222222| ++ * |3333333333| |3333333333| ++ * |4444444444| |3333333333| ++ * +----------+ +----------+ ++ * ++ * +----------+ +----------+ ++ * |5555555555| |4444444444| ++ * | | |5555555555| ++ * | | | | ++ * +----------+ +----------+ ++ * ++ * The source and destination data segment list lengths are ++ * assumed to have been validated outside of this function. 
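++ * (ibscif_wr_ds() checks each posted sg_list entry against its MR when the
++ * ds_list is built, and the loopback RDMA paths build their single remote
++ * segment through ibscif_validate_mr() before calling here.)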
++ */ ++static int ibscif_dscopy(struct ibscif_ds *dst_ds, struct ibscif_ds *src_ds, u32 length) ++{ ++ struct ibscif_seg src, dst; ++ int head_copied; ++ u32 copy_len; ++ int err = 0; ++ ++ ibscif_seg_init(&src, src_ds, ibscif_map_src, ibscif_unmap_src, 0); ++ ibscif_seg_init(&dst, dst_ds, ibscif_map_dst, ibscif_unmap_dst, IB_ACCESS_LOCAL_WRITE); ++ ++ head_copied = 0; ++ for (copy_len = 0; length; length -= copy_len) { ++ ++ err = ibscif_seg_set(&src, length, copy_len); ++ if (unlikely(err)) ++ break; ++ err = ibscif_seg_set(&dst, length, copy_len); ++ if (unlikely(err)) ++ break; ++ ++ copy_len = min(src.pg_len, dst.pg_len); ++ head_copied = ibscif_seg_copy(&dst, &src, copy_len, head_copied); ++ } ++ ++ ibscif_seg_fini(&src); ++ ibscif_seg_fini(&dst); ++ ++ return err; ++} ++ ++/* Hold sq->lock during this call for synchronization. */ ++static int ibscif_complete_sq_wr(struct ibscif_wq *sq, struct ibscif_wr *send_wr, enum ib_wc_status status) ++{ ++ struct ibscif_qp *qp = sq->qp; ++ struct ibscif_wc *wc; ++ int err; ++ ++ ibscif_clear_ds_refs(send_wr->ds_list, send_wr->num_ds); ++ sq->completions++; ++ sq->reap++; ++ ++ if (send_wr->flags & IB_SEND_SIGNALED) { ++ struct ibscif_cq *cq = to_cq(qp->ibqp.send_cq); ++ ++ err = ibscif_reserve_cqe(cq, &wc); ++ if (unlikely(err)) ++ return err; ++ ++ wc->ibwc.qp = &qp->ibqp; ++ wc->ibwc.src_qp = qp->remote_qpn; ++ wc->ibwc.wr_id = send_wr->id; ++ wc->ibwc.opcode = to_ib_wc_opcode(send_wr->opcode); ++ wc->ibwc.status = status; ++ wc->ibwc.ex.imm_data = 0; ++ wc->ibwc.port_num = 1; ++ ++ if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_RDMA_READ) ++ wc->ibwc.byte_len = send_wr->read.remote_length; ++ else if (((enum ib_wr_opcode)send_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) || ++ ((enum ib_wr_opcode)send_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) ++ wc->ibwc.byte_len = sizeof send_wr->atomic_rsp.orig_data; ++ else ++ wc->ibwc.byte_len = send_wr->length; ++ ++ wc->wq = sq; ++ wc->reap = sq->reap; ++ sq->reap = 0; ++ ++ ibscif_append_cqe(cq, wc, 0); ++ } ++ ++ return 0; ++} ++ ++/* Hold rq->lock during this call for synchronization. */ ++static int ibscif_complete_rq_wr(struct ibscif_wq *rq, struct ibscif_wr *recv_wr, ++ struct ibscif_wr *send_wr, enum ib_wc_status status) ++{ ++ struct ibscif_qp *qp = rq->qp; ++ struct ibscif_cq *cq = to_cq(qp->ibqp.recv_cq); ++ struct ibscif_wc *wc; ++ int err; ++ ++ ibscif_clear_ds_refs(recv_wr->ds_list, recv_wr->num_ds); ++ ++ err = ibscif_reserve_cqe(cq, &wc); ++ if (unlikely(err)) ++ return err; ++ ++ wc->ibwc.qp = &qp->ibqp; ++ wc->ibwc.src_qp = qp->remote_qpn; ++ wc->ibwc.wr_id = recv_wr->id; ++ wc->ibwc.status = status; ++ wc->ibwc.byte_len = send_wr->length; ++ wc->ibwc.port_num = 1; ++ ++ if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_SEND_WITH_IMM) { ++ DEV_STAT(qp->dev, recv_imm++); ++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM; ++ wc->ibwc.ex.imm_data = cpu_to_be32(send_wr->send.immediate_data); ++ } else if ((enum ib_wr_opcode)send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { ++ DEV_STAT(qp->dev, recv_imm++); ++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM; ++ wc->ibwc.ex.imm_data = cpu_to_be32(send_wr->write.immediate_data); ++ } else { ++ DEV_STAT(qp->dev, recv++); ++ wc->ibwc.opcode = IB_WC_RECV; ++ wc->ibwc.ex.imm_data = 0; ++ } ++ ++ wc->wq = rq; ++ wc->reap = 1; ++ rq->completions++; ++ ++ ibscif_append_cqe(cq, wc, !!(send_wr->flags & IB_SEND_SOLICITED)); ++ ++ return 0; ++} ++ ++/* Hold wq lock during this call for synchronization. 
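++ * Returns -ENOTCONN unless the QP is connected and -EACCES if a required
++ * access flag is missing; when 'wr' is non-NULL, the next uncompleted entry
++ * at (head + completions) % size is peeked, with -ENOSPC/-ENOBUFS reported
++ * for a zero-sized or exhausted queue.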
*/ ++static int ibscif_validate_wq(struct ibscif_wq *wq, struct ibscif_wr **wr, enum ib_access_flags access) ++{ ++ if (unlikely(wq->qp->state != QP_CONNECTED)) ++ return -ENOTCONN; ++ ++ if (unlikely(access && !(wq->qp->access & access))) ++ return -EACCES; ++ ++ if (wr) { ++ int next; ++ ++ if (unlikely(!wq->size)) ++ return -ENOSPC; ++ ++ next = (wq->head + wq->completions) % wq->size; ++ ++ if (unlikely(next == wq->tail)) ++ return -ENOBUFS; ++ ++ *wr = ibscif_get_wr(wq, next); ++ } ++ ++ return 0; ++} ++ ++static int ibscif_loopback_send(struct ibscif_wq *sq, struct ibscif_wq *rq, struct ibscif_wr *send_wr) ++{ ++ struct ibscif_wr *recv_wr; ++ int err; ++ ++ spin_lock_bh(&rq->lock); ++ ++ err = ibscif_validate_wq(rq, &recv_wr, 0); ++ if (unlikely(err)) ++ goto out; ++ ++ if (likely(send_wr->length)) { ++ if (unlikely(send_wr->length > recv_wr->length)) { ++ err = -EMSGSIZE; ++ goto out; ++ } ++ ++ err = ibscif_dscopy(recv_wr->ds_list, send_wr->ds_list, send_wr->length); ++ if (unlikely(err)) ++ goto out; ++ } ++ ++ err = ibscif_complete_rq_wr(rq, recv_wr, send_wr, IB_WC_SUCCESS); ++out: ++ spin_unlock_bh(&rq->lock); ++ ++ return err; ++} ++ ++static int ibscif_loopback_write(struct ibscif_wq *sq, struct ibscif_wq *rq, struct ibscif_wr *write_wr) ++{ ++ struct ibscif_wr *recv_wr = NULL; ++ struct ibscif_mr *dst_mr = ERR_PTR(-ENOENT); ++ int err; ++ ++ spin_lock_bh(&rq->lock); ++ ++ err = ibscif_validate_wq(rq, ((enum ib_wr_opcode)write_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) ? ++ &recv_wr : NULL, IB_ACCESS_REMOTE_WRITE); ++ if (unlikely(err)) ++ goto out; ++ ++ if (likely(write_wr->length)) { ++ struct ibscif_ds dst_ds; ++ ++ dst_mr = ibscif_validate_mr(write_wr->write.rkey, write_wr->write.remote_address, ++ write_wr->length, rq->qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE); ++ if (unlikely(IS_ERR(dst_mr))) { ++ err = PTR_ERR(dst_mr); ++ goto out; ++ } ++ ++ dst_ds.mr = dst_mr; ++ dst_ds.offset = write_wr->write.remote_address - dst_mr->addr; ++ dst_ds.length = write_wr->length; ++ dst_ds.lkey = dst_mr->ibmr.lkey; ++ ++ err = ibscif_dscopy(&dst_ds, write_wr->ds_list, dst_ds.length); ++ if (unlikely(err)) ++ goto out; ++ } else ++ err = 0; ++ ++ if (recv_wr) ++ err = ibscif_complete_rq_wr(rq, recv_wr, write_wr, IB_WC_SUCCESS); ++out: ++ if (likely(!IS_ERR(dst_mr))) ++ ibscif_put_mr(dst_mr); ++ ++ spin_unlock_bh(&rq->lock); ++ ++ return err; ++} ++ ++static int ibscif_loopback_read(struct ibscif_wq *sq, struct ibscif_wq *iq, struct ibscif_wr *read_wr) ++{ ++ struct ibscif_mr *src_mr = ERR_PTR(-ENOENT); ++ int err; ++ ++ spin_lock_bh(&iq->lock); ++ ++ err = ibscif_validate_wq(iq, NULL, IB_ACCESS_REMOTE_READ); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!iq->size) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ ++ if (likely(read_wr->read.remote_length)) { ++ struct ibscif_ds src_ds; ++ ++ src_mr = ibscif_validate_mr(read_wr->read.rkey, read_wr->read.remote_address, ++ read_wr->read.remote_length, iq->qp->ibqp.pd, ++ IB_ACCESS_REMOTE_READ); ++ if (unlikely(IS_ERR(src_mr))) { ++ err = PTR_ERR(src_mr); ++ goto out; ++ } ++ ++ src_ds.mr = src_mr; ++ src_ds.offset = read_wr->read.remote_address - src_mr->addr; ++ src_ds.length = read_wr->read.remote_length; ++ src_ds.lkey = src_mr->ibmr.lkey; ++ ++ err = ibscif_dscopy(read_wr->ds_list, &src_ds, src_ds.length); ++ } else ++ err = 0; ++out: ++ if (likely(!IS_ERR(src_mr))) ++ ibscif_put_mr(src_mr); ++ ++ spin_unlock_bh(&iq->lock); ++ ++ atomic_dec(&sq->qp->or_posted); ++ ++ return err; ++} ++ ++static int ibscif_loopback_atomic(struct ibscif_wq *sq, struct 
ibscif_wq *iq, struct ibscif_wr *atomic_wr) ++{ ++ struct ibscif_mr *src_mr = ERR_PTR(-ENOENT); ++ struct ibscif_ds src_ds; ++ struct page *src_page; ++ u64 *src_addr, addr; ++ u32 src_offset, rkey; ++ int err; ++ ++ if ((enum ib_wr_opcode)atomic_wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { ++ addr = atomic_wr->cmp_swp.remote_address; ++ rkey = atomic_wr->cmp_swp.rkey; ++ } else { ++ addr = atomic_wr->fetch_add.remote_address; ++ rkey = atomic_wr->fetch_add.rkey; ++ } ++ ++ spin_lock_bh(&iq->lock); ++ ++ err = ibscif_validate_wq(iq, NULL, IB_ACCESS_REMOTE_ATOMIC); ++ if (unlikely(err)) ++ goto out; ++ ++ if (!iq->size) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ ++ src_mr = ibscif_validate_mr(rkey, addr, sizeof atomic_wr->atomic_rsp.orig_data, ++ iq->qp->ibqp.pd, IB_ACCESS_REMOTE_ATOMIC); ++ if (unlikely(IS_ERR(src_mr))) { ++ err = PTR_ERR(src_mr); ++ goto out; ++ } ++ ++ /* Build a source data segment to copy the original data. */ ++ src_ds.mr = src_mr; ++ src_ds.offset = addr - src_mr->addr; ++ src_ds.length = sizeof atomic_wr->atomic_rsp.orig_data; ++ src_ds.lkey = src_mr->ibmr.lkey; ++ ++ /* Determine which page to map. */ ++ src_offset = src_ds.offset + (src_mr->addr & ~PAGE_MASK); ++ src_page = src_mr->page[src_offset >> PAGE_SHIFT]; ++ src_offset &= ~PAGE_MASK; ++ ++ /* Lock to perform the atomic operation atomically. */ ++ spin_lock_bh(&iq->qp->dev->atomic_op); ++ ++ /* Copy the original data; this handles any ds_list crossing. */ ++ err = ibscif_dscopy(atomic_wr->ds_list, &src_ds, sizeof atomic_wr->atomic_rsp.orig_data); ++ if (likely(!err)) { ++ src_addr = ibscif_map_src(src_page) + src_offset; ++ if ((enum ib_wr_opcode)atomic_wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ++ *src_addr += atomic_wr->fetch_add.add_operand; ++ else if (*src_addr == atomic_wr->cmp_swp.cmp_operand) ++ *src_addr = atomic_wr->cmp_swp.swp_operand; ++ ibscif_unmap_src(src_page, src_addr); ++ } ++ ++ /* Atomic operation is complete. */ ++ spin_unlock_bh(&iq->qp->dev->atomic_op); ++out: ++ if (likely(!IS_ERR(src_mr))) ++ ibscif_put_mr(src_mr); ++ ++ spin_unlock_bh(&iq->lock); ++ ++ atomic_dec(&sq->qp->or_posted); ++ ++ return err; ++} ++ ++void ibscif_loopback_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ struct ibscif_qp *remote_qp; ++ ++ remote_qp = ibscif_get_qp(qp->remote_qpn); ++ if (unlikely(IS_ERR(remote_qp))) ++ return; ++ ++ /* Don't bother if the SQ is connected to the RQ on the same QP. */ ++ if (remote_qp != qp) ++ ibscif_qp_remote_disconnect(remote_qp, reason); ++ ++ ibscif_put_qp(remote_qp); ++} ++ ++/* ++ * Loopback QPs connected through the same MAC address. ++ * This includes an SQ connected to the RQ on the same QP. 
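++ * The SQ is drained in order and each work request is executed synchronously
++ * via ibscif_dscopy() against the remote QP's RQ or IQ; -ENOBUFS triggers a
++ * schedule() and retry, while any other failure disconnects the QP.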
++ */ ++void ibscif_loopback(struct ibscif_wq *sq) ++{ ++ struct ibscif_wq *rq, *iq; ++ struct ibscif_qp *remote_qp; ++ struct ibscif_wr *wr; ++ int status = 0, err = 0; ++ ++ BUG_ON(!is_sq(sq)); ++ ++again: ++ remote_qp = ibscif_get_qp(sq->qp->remote_qpn); ++ if (unlikely(IS_ERR(remote_qp))) { ++ ibscif_qp_remote_disconnect(sq->qp, IBSCIF_REASON_INVALID_QP); ++ return; ++ } ++ rq = &remote_qp->rq; ++ iq = &remote_qp->iq; ++ ++ DEV_STAT(sq->qp->dev, loopback++); ++ ++ spin_lock_bh(&sq->lock); ++ for (wr = ibscif_get_wr(sq, sq->next_wr); ++ (sq->next_wr != sq->tail) && !err; ++ sq->next_wr = (sq->next_wr + 1) % sq->size) { ++ ++ switch (wr->opcode) { ++ ++ case WR_SEND: ++ case WR_SEND_WITH_IMM: ++ status = ibscif_loopback_send(sq, rq, wr); ++ break; ++ case WR_RDMA_WRITE: ++ case WR_RDMA_WRITE_WITH_IMM: ++ status = ibscif_loopback_write(sq, rq, wr); ++ break; ++ case WR_RDMA_READ: ++ status = ibscif_loopback_read(sq, iq, wr); ++ break; ++ case WR_ATOMIC_CMP_AND_SWP: ++ case WR_ATOMIC_FETCH_AND_ADD: ++ status = ibscif_loopback_atomic(sq, iq, wr); ++ break; ++ default: ++ status = -ENOSYS; ++ break; ++ } ++ ++ if (likely(!status)) { ++ err = ibscif_complete_sq_wr(sq, wr, IB_WC_SUCCESS); ++ ++ spin_unlock_bh(&sq->lock); ++ ibscif_notify_cq(to_cq(sq->qp->ibqp.send_cq)); ++ ibscif_notify_cq(to_cq(remote_qp->ibqp.recv_cq)); ++ spin_lock_bh(&sq->lock); ++ } else ++ break; ++ } ++ spin_unlock_bh(&sq->lock); ++ ++ if (unlikely(status) && status != -ENOBUFS) ++ ibscif_qp_remote_disconnect(sq->qp, IBSCIF_REASON_QP_FATAL); ++ else if (unlikely(err)) ++ ibscif_qp_internal_disconnect(sq->qp, IBSCIF_REASON_QP_FATAL); ++ ++ ibscif_put_qp(remote_qp); ++ ++ if (status == -ENOBUFS) { ++ schedule(); ++ goto again; ++ } ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_main.c a8/drivers/infiniband/hw/scif/ibscif_main.c +--- a7/drivers/infiniband/hw/scif/ibscif_main.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_main.c 2015-02-23 10:14:37.484809663 -0800 +@@ -0,0 +1,357 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "ibscif_driver.h" ++ ++static const char ibscif_signon[] = DRV_SIGNON DRV_BUILD; ++ ++MODULE_AUTHOR("Intel Corporation"); ++MODULE_LICENSE("Dual BSD/GPL"); ++MODULE_DESCRIPTION(DRV_DESC); ++MODULE_VERSION(DRV_VERSION); ++ ++#define MODULE_PARAM(type, name, value, desc) \ ++ type name = value; \ ++ module_param(name, type, 0664); \ ++ MODULE_PARM_DESC(name, desc) ++ ++#define MODULE_ARRAY(name, size, value, desc) \ ++ unsigned int name##_argc; \ ++ char *name[size] = { [0 ... size-1] = value }; \ ++ module_param_array(name, charp, &name##_argc, 0644); \ ++ MODULE_PARM_DESC(name, desc) ++ ++#define DEFAULT_MAX_PINNED 50 ++MODULE_PARAM(int, max_pinned, DEFAULT_MAX_PINNED, ++ "Maximum percent of physical memory that may be pinned"); ++ ++#define DEFAULT_WINDOW_SIZE 40 ++MODULE_PARAM(int, window_size, DEFAULT_WINDOW_SIZE, ++ "Maximum number of outstanding unacknowledged packets"); ++ ++#define DEFAULT_RMA_THRESHOLD 1024 ++MODULE_PARAM(int, rma_threshold, DEFAULT_RMA_THRESHOLD, ++ "Maximum message size sent through scif_send()"); ++ ++MODULE_PARAM(int, fast_rdma, 1, ++ "Use scif_writeto()/scif_readfrom() directly for RDMA write/read"); ++ ++MODULE_PARAM(int, blocking_send, 0, ++ "Use blocking version of scif_send()"); ++ ++MODULE_PARAM(int, blocking_recv, 1, ++ "Use blocking version of scif_recv()"); ++ ++MODULE_PARAM(int, scif_loopback, 1, ++ "Use SCIF lookback instead of kernel copy based loopback"); ++ ++MODULE_PARAM(int, host_proxy, 0, ++ "Proxy card side RDMA operations to host"); ++ ++#if ((LINUX_VERSION_CODE>=KERNEL_VERSION(3,5,0)) || CONFIG_MK1OM || CONFIG_ML1OM) ++#define USE_NEW_IB_TYPE 1 ++#else ++#define USE_NEW_IB_TYPE 0 ++#endif ++MODULE_PARAM(int, new_ib_type, USE_NEW_IB_TYPE, ++ "Use new transport type dedicated to IBSCIF"); ++ ++MODULE_PARAM(int, verbose, 0, ++ "Produce more log info for debugging purpose"); ++ ++MODULE_PARAM(int, check_grh, 1, ++ "Detect outside-box connection by checking the global routing header"); ++ ++static atomic_t avail_pages; /* Calculated from max_pinned and totalram_pages */ ++ ++LIST_HEAD(devlist); ++DECLARE_MUTEX(devlist_mutex); ++ ++DEFINE_IDR(wiremap); ++DEFINE_RWLOCK(wiremap_lock); ++static u32 reserved_0 = 0; ++ ++void ibscif_dump(char *str, unsigned char* buf, int len) ++{ ++ unsigned char *p, tmp[(16*3)+1]; ++ int i; ++ return; ++ len = len > 64 ? 64 : len; ++ while (len) { ++ p = tmp; ++ for (i = len > 16 ? 16 : len; i; i--, len--) ++ p += sprintf(p, "%2x ", *buf++); ++ printk("(%d)%s: %s\n", smp_processor_id(), str, tmp); ++ } ++} ++ ++int ibscif_reserve_quota(int *npages) ++{ ++ int c, old, err; ++ ++ if (!*npages) ++ return 0; ++ ++ err = 0; ++ c = atomic_read(&avail_pages); ++ for (;;) { ++ if (unlikely(c < *npages)) ++ break; ++ old = atomic_cmpxchg(&avail_pages, c, c - *npages); ++ if (likely(old == c)) ++ break; ++ c = old; ++ } ++ ++ if (c < *npages) { ++ *npages = 0; ++ err = -EDQUOT; ++ } ++ ++ return err; ++} ++ ++void ibscif_release_quota(int npages) ++{ ++ if (npages) ++ atomic_add(npages, &avail_pages); ++} ++ ++/* ++ * To work around MPI's assumptions that data is written atomically in their ++ * header structures, write the first 16 integers of a transfer atomically. ++ * ++ * Update: the assumption of MPI's ofa module is different in that the last ++ * four bytes needs to be written last and atomically. The buffers used in ++ * this case is always aligned. 
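++ * The bulk of the buffer is therefore copied with memcpy(), followed by an
++ * smp_wmb() and a single store of the trailing aligned word (or the trailing
++ * byte when the tail is unaligned). For example, a 25-byte transfer with an
++ * aligned head and unaligned tail becomes four int stores, an 8-byte
++ * memcpy(), and one final byte store after the barrier.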
++ */ ++int ibscif_atomic_copy(void *dst_addr, void *src_addr, u32 copy_len, int head_copied) ++{ ++ volatile int *src_x = (int *)src_addr; ++ volatile int *dst_x = (int *)dst_addr; ++ volatile u8 *src_c, *dst_c; ++ int head_aligned, tail_aligned; ++ ++ if (unlikely(!copy_len)) ++ return head_copied; ++ ++ head_aligned = !((unsigned long)src_addr & (sizeof(int)-1)) && ++ !((unsigned long)dst_addr & (sizeof(int)-1)); ++ ++ ++ tail_aligned = !((unsigned long)(src_addr+copy_len) & (sizeof(int)-1)) && ++ !((unsigned long)(dst_addr+copy_len) & (sizeof(int)-1)); ++ ++ if (!head_copied && head_aligned) { ++ ++ switch (copy_len) { ++ case sizeof(int): ++ *dst_x = *src_x; ++ goto done; ++ case sizeof(int)*2: ++ *dst_x++ = *src_x++; ++ *dst_x = *src_x; ++ goto done; ++ case sizeof(int)*3: ++ *dst_x++ = *src_x++; ++ *dst_x++ = *src_x++; ++ *dst_x = *src_x; ++ goto done; ++ default: ++ if (copy_len >= (sizeof(int)*4)) { ++ /* We have at least a whole header to copy. */ ++ head_copied = 1; ++ copy_len -= sizeof(int)*4; ++ ++ *dst_x++ = *src_x++; ++ *dst_x++ = *src_x++; ++ *dst_x++ = *src_x++; ++ ++ if (copy_len == 0) { ++ *dst_x = *src_x; ++ goto done; ++ } ++ *dst_x++ = *src_x++; ++ } ++ break; ++ } ++ } ++ ++ /* The last integer is aligned. Copy all but the last int, then the last int */ ++ if (tail_aligned && copy_len >= sizeof(int)) { ++ copy_len -= sizeof(int); ++ if (copy_len) ++ memcpy((void *)dst_x, (void *)src_x, copy_len); ++ smp_wmb(); ++ src_x = (volatile int *)((char *)src_x + copy_len); ++ dst_x = (volatile int *)((char *)dst_x + copy_len); ++ *dst_x = *src_x; ++ goto done; ++ } ++ ++ /* Bad alignment. Copy all but the last byte, then the last byte */ ++ if (--copy_len) ++ memcpy((void *)dst_x, (void *)src_x, copy_len); ++ ++ src_c = ((volatile u8 *)src_x) + copy_len; ++ dst_c = ((volatile u8 *)dst_x) + copy_len; ++ smp_wmb(); ++ *dst_c = *src_c; ++done: ++ return head_copied; ++} ++ ++/* ++ * Because idr_pre_get acquires the same internal spinlock used by idr_pre_get/idr_remove ++ * calls under a write_lock_bh, we need to call idr_pre_get with bottom half disabled. ++ * We cannot simply take the write_lock_bh(&wiremap_lock) because idr_pre_get does a ++ * blocking memory allocation call. Since bh is disabled, mask must be GFP_ATOMIC. ++ */ ++static inline int ibscif_wiremap_pre_get(void) ++{ ++ int ret; ++ ++ local_bh_disable(); ++ ret = idr_pre_get(&wiremap, GFP_ATOMIC); ++ local_bh_enable(); ++ ++ return ret; ++} ++ ++int ibscif_wiremap_add(void *obj, int *id) ++{ ++ int ret; ++ ++ do { ++ if (!ibscif_wiremap_pre_get()) ++ return -ENOMEM; ++ ++ write_lock_bh(&wiremap_lock); ++ ret = idr_get_new(&wiremap, obj, id); ++ write_unlock_bh(&wiremap_lock); ++ } while (ret == -EAGAIN); ++ ++ return ret; ++} ++ ++void ibscif_wiremap_del(int id) ++{ ++ write_lock_bh(&wiremap_lock); ++ idr_remove(&wiremap, id); ++ write_unlock_bh(&wiremap_lock); ++} ++ ++static int ibscif_init_wiremap(void) ++{ ++ /* ++ * Instead of treating them as opaque, some applications assert that returned key ++ * values are non-zero. As a work-around, reserve the first key from the wiremap. 
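++ * The entry allocated here is held by the driver itself, so id 0 is never
++ * handed out as an lkey/rkey; the BUG_ON below asserts that the idr really
++ * returned 0 for this first allocation.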
++ */ ++ int ret = ibscif_wiremap_add(&reserved_0, &reserved_0); ++ BUG_ON(reserved_0 != 0); ++ return ret; ++} ++ ++static void ibscif_free_wiremap(void) ++{ ++ write_lock_bh(&wiremap_lock); ++ idr_remove_all(&wiremap); ++ idr_destroy(&wiremap); ++ write_unlock_bh(&wiremap_lock); ++} ++ ++static void ibscif_init_params(void) ++{ ++ if ((max_pinned <= 0) || (max_pinned > 100)) { ++ max_pinned = DEFAULT_MAX_PINNED; ++ printk(KERN_WARNING PFX "Corrected max_pinned module parameter to %d.\n", ++ max_pinned); ++ } ++ if (window_size < MIN_WINDOW_SIZE) { ++ window_size = MIN_WINDOW_SIZE; ++ printk(KERN_WARNING PFX "Corrected window_size module parameter to %d.\n", ++ window_size); ++ } ++ if (rma_threshold < 0) { ++ rma_threshold = 0x7FFFFFFF; ++ printk(KERN_WARNING PFX "Corrected rma_threshold module parameter to %d.\n", ++ rma_threshold); ++ } ++ ++ /* ++ * Hardware RDMA devices have built-in limits on the number of registered pages. ++ * The avail_pages variable provides a limit for this software device. ++ */ ++ atomic_set(&avail_pages, max_pinned * (totalram_pages / 100)); ++} ++ ++static int __init ibscif_init(void) ++{ ++ int err; ++ ++ printk(KERN_INFO PFX "%s\n", ibscif_signon); ++ printk(KERN_INFO PFX "max_pinned=%d, window_size=%d, " ++ "blocking_send=%d, blocking_recv=%d, " ++ "fast_rdma=%d, " ++ "host_proxy=%d, " ++ "rma_threshold=%d, scif_loopback=%d, " ++ "new_ib_type=%d, verbose=%d, " ++ "check_grh=%d\n", ++ max_pinned, window_size, ++ blocking_send, blocking_recv, ++ fast_rdma, ++ host_proxy, ++ rma_threshold, scif_loopback, ++ new_ib_type, verbose, ++ check_grh); ++ ++ ibscif_init_params(); ++ ++ err = ibscif_init_wiremap(); ++ if (err) ++ return err; ++ ++ err = ibscif_dev_init(); ++ if (!err) ++ return 0; ++ ++ ibscif_free_wiremap(); ++ return err; ++} ++ ++static void __exit ibscif_exit(void) ++{ ++ ibscif_dev_cleanup(); ++ ibscif_free_wiremap(); ++ printk(KERN_INFO PFX "unloaded\n"); ++} ++ ++module_init(ibscif_init); ++module_exit(ibscif_exit); +diff -urN a7/drivers/infiniband/hw/scif/ibscif_mr.c a8/drivers/infiniband/hw/scif/ibscif_mr.c +--- a7/drivers/infiniband/hw/scif/ibscif_mr.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_mr.c 2015-02-23 10:14:37.484809663 -0800 +@@ -0,0 +1,559 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++static int ibscif_mr_init_mreg(struct ibscif_mr *mr); ++ ++struct ib_mr *ibscif_get_dma_mr(struct ib_pd *ibpd, int access) ++{ ++ struct ibscif_dev *dev = to_dev(ibpd->device); ++ struct ibscif_mr *mr; ++ int err; ++ ++ if (!atomic_add_unless(&dev->mr_cnt, 1, MAX_MRS)) ++ return ERR_PTR(-EAGAIN); ++ ++ mr = kzalloc(sizeof *mr, GFP_KERNEL); ++ if (!mr) { ++ err = -ENOMEM; ++ printk(KERN_ALERT PFX "%s: unable to allocate mr.\n", __func__); ++ goto out1; ++ } ++ ++ kref_init(&mr->ref); ++ init_completion(&mr->done); ++ ++ err = ibscif_wiremap_add(mr, &mr->ibmr.lkey); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: unable to allocate lkey.\n", __func__); ++ goto out2; ++ } ++ ++ if (mr->ibmr.lkey > IBSCIF_MR_MAX_KEY) { ++ err = -ENOSPC; ++ printk(KERN_ALERT PFX "%s: lkey (%x) out of range.\n", __func__, mr->ibmr.lkey); ++ goto out3; ++ } ++ ++ mr->ibmr.device = ibpd->device; /* For ibscif_dereg_mr() calls below. */ ++ mr->ibmr.rkey = mr->ibmr.lkey; ++ mr->access = access; ++ mr->magic = MR_MAGIC; ++ INIT_LIST_HEAD(&mr->mreg_list); ++ ++ return &mr->ibmr; ++ ++out3: ++ ibscif_wiremap_del(mr->ibmr.lkey); ++out2: ++ kfree(mr); ++out1: ++ atomic_dec(&dev->mr_cnt); ++ return ERR_PTR(err); ++} ++ ++struct ib_mr *ibscif_reg_phys_mr(struct ib_pd *ibpd, struct ib_phys_buf *phys_buf_array, ++ int num_phys_buf, int access, u64 *iova_start) ++{ ++ struct ibscif_mr *mr; ++ struct ib_mr *ibmr; ++ int i, j, k, err; ++ u64 mask; ++ ++ ibmr = ibscif_get_dma_mr(ibpd, access); ++ if (IS_ERR(ibmr)) ++ return ibmr; ++ ++ mr = to_mr(ibmr); ++ mr->addr = *iova_start; ++ ++ mask = 0; ++ for (i = 0; i < num_phys_buf; i++) { ++ if (i != 0) ++ mask |= phys_buf_array[i].addr; /* All but 1st are aligned */ ++ if (i != num_phys_buf - 1) ++ mask |= phys_buf_array[i].addr + phys_buf_array[i].size; /* Middle bufs are full pages */ ++ ++ mr->length += phys_buf_array[i].size; ++ } ++ if ((mask & ~PAGE_MASK) || (mr->length > MAX_MR_SIZE)) { ++ err = -EINVAL; ++ goto out; ++ } ++ if (mr->length && ((mr->addr + mr->length - 1) < mr->addr)) { ++ err = -EOVERFLOW; ++ goto out; ++ } ++ ++ phys_buf_array[0].size += phys_buf_array[0].addr & ~PAGE_MASK; /* Adjust 1st buf size by page offset */ ++ phys_buf_array[0].addr &= PAGE_MASK; /* Truncate 1st buf to start of page */ ++ ++ for (i = 0; i < num_phys_buf; i++) ++ mr->npages += PAGE_ALIGN(phys_buf_array[i].size) >> PAGE_SHIFT; ++ ++ if (!mr->npages) ++ return &mr->ibmr; ++ ++ err = ibscif_reserve_quota(&mr->npages); ++ if (err) ++ goto out; ++ ++ mr->page = vzalloc(mr->npages * sizeof *mr->page); ++ if (!mr->page) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ k = 0; ++ for (i = 0; i < num_phys_buf; i++) ++ for (j = 0; j < PAGE_ALIGN(phys_buf_array[i].size) >> PAGE_SHIFT; j++) ++ mr->page[k++] = pfn_to_page((phys_buf_array[i].addr >> PAGE_SHIFT) + j); ++ ++ return &mr->ibmr; ++out: ++ ibscif_dereg_mr(ibmr); ++ return ERR_PTR(err); ++} ++ ++#ifdef MOFED ++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, ++ u64 virt_addr, int access, struct ib_udata *udata, int mr_id) ++#else ++struct ib_mr *ibscif_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, ++ u64 virt_addr, int access, struct ib_udata *udata) ++#endif ++{ ++ struct ib_mr *ibmr; ++ struct 
ibscif_mr *mr; ++ struct scatterlist *sg; ++ struct ibscif_dev *dev; ++ int i, k, err; ++ ++ if (length && ((start + length - 1) < start)) ++ return ERR_PTR(-EOVERFLOW); ++ ++ ibmr = ibscif_get_dma_mr(ibpd, access); ++ if (IS_ERR(ibmr)) ++ return ibmr; ++ ++ mr = to_mr(ibmr); ++ mr->addr = start; ++ ++ mr->umem = ib_umem_get(ibpd->uobject->context, start, length, access, 0/*dma_sync*/); ++ if (IS_ERR(mr->umem)) { ++ err = PTR_ERR(mr->umem); ++ printk(KERN_ALERT PFX "%s: ib_umem_get returns %d.\n", __func__, err); ++ goto out; ++ } ++ ++ mr->npages = ib_umem_page_count(mr->umem); ++ if (!mr->npages) ++ return &mr->ibmr; ++ ++ mr->length = mr->umem->length; ++ ++ err = ibscif_reserve_quota(&mr->npages); ++ if (err) ++ goto out; ++ ++ mr->page = vzalloc(mr->npages * sizeof *mr->page); ++ if (!mr->page) { ++ err = -ENOMEM; ++ printk(KERN_ALERT PFX "%s: unable to allocate mr->page.\n", __func__); ++ goto out; ++ } ++ ++ k = 0; ++ for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, i) ++ mr->page[k++] = sg_page(sg); ++ ++ err = ibscif_mr_init_mreg(mr); ++ if (err) ++ goto out; ++ ++ dev = to_dev(mr->ibmr.device); ++ down(&dev->mr_list_mutex); ++ list_add_tail(&mr->entry, &dev->mr_list); ++ up(&dev->mr_list_mutex); ++ ++ return &mr->ibmr; ++out: ++ ibscif_dereg_mr(ibmr); ++ return ERR_PTR(err); ++} ++ ++void ibscif_complete_mr(struct kref *ref) ++{ ++ struct ibscif_mr *mr = container_of(ref, struct ibscif_mr, ref); ++ complete(&mr->done); ++} ++ ++int ibscif_dereg_mr(struct ib_mr *ibmr) ++{ ++ struct ibscif_dev *dev = to_dev(ibmr->device); ++ struct ibscif_mr *mr = to_mr(ibmr); ++ struct ibscif_mreg_info *mreg, *next; ++ struct ibscif_mr *mr0, *next0; ++ int ret; ++ ++ ibscif_put_mr(mr); ++ wait_for_completion(&mr->done); ++ ++ list_for_each_entry_safe(mreg, next, &mr->mreg_list, entry) { ++ do { ++ ret = scif_unregister(mreg->conn->ep, mreg->aligned_offset, mreg->aligned_length); ++ } ++ while (ret == -ERESTARTSYS); ++ ++ if (ret && ret != -ENOTCONN) ++ printk(KERN_ALERT PFX "%s: scif_unregister returns %d. ep=%p, offset=%llx, length=%x\n", ++ __func__, ret, mreg->conn->ep, mreg->aligned_offset, mreg->aligned_length); ++ ++ ibscif_put_conn(mreg->conn); ++ list_del(&mreg->entry); ++ kfree(mreg); ++ } ++ ++ down(&dev->mr_list_mutex); ++ list_for_each_entry_safe(mr0, next0, &dev->mr_list, entry) { ++ if (mr0 == mr) { ++ list_del(&mr0->entry); ++ break; ++ } ++ } ++ up(&dev->mr_list_mutex); ++ ++ if (mr->pinned_pages) ++ scif_unpin_pages(mr->pinned_pages); ++ ++ if (mr->umem && !IS_ERR(mr->umem)) ++ ib_umem_release(mr->umem); ++ if (mr->page) ++ vfree(mr->page); ++ ++ ibscif_release_quota(mr->npages); ++ atomic_dec(&dev->mr_cnt); ++ ++ ibscif_wiremap_del(mr->ibmr.lkey); ++ ++ kfree(mr); ++ return 0; ++} ++ ++/* ++ * Lookup and validate the given memory region access. A reference is held on success. 
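++ * The caller must drop that reference with ibscif_put_mr(). A PD mismatch,
++ * missing access rights, or an out-of-range [addr, addr + length) window is
++ * reported as an ERR_PTR instead.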
++ */ ++struct ibscif_mr *ibscif_validate_mr(u32 key, u64 addr, int length, ++ struct ib_pd *ibpd, enum ib_access_flags access) ++{ ++ struct ibscif_mr *mr; ++ int err; ++ ++ mr = ibscif_get_mr(key); ++ if (unlikely(IS_ERR(mr))) ++ return mr; ++ ++ if (unlikely(mr->ibmr.pd != ibpd)) { ++ err = -EPERM; ++ goto out; ++ } ++ if (unlikely(access && !(mr->access & access))) { ++ err = -EACCES; ++ goto out; ++ } ++ if (unlikely((addr < mr->addr) || ((addr + length) > (mr->addr + mr->length)))) { ++ err = -ERANGE; ++ goto out; ++ } ++ ++ return mr; ++out: ++ ibscif_put_mr(mr); ++ return ERR_PTR(err); ++} ++ ++static void ibscif_dma_nop(struct ib_device *ibdev, u64 addr, size_t size, enum dma_data_direction direction) ++{ ++} ++ ++static int ibscif_mapping_error(struct ib_device *ibdev, u64 dma_addr) ++{ ++ return !dma_addr; ++} ++ ++static u64 ibscif_dma_map_single(struct ib_device *ibdev, void *cpu_addr, size_t size, ++ enum dma_data_direction direction) ++{ ++ return (u64)cpu_addr; ++} ++ ++static u64 ibscif_dma_map_page(struct ib_device *ibdev, struct page *page, unsigned long offset, size_t size, ++ enum dma_data_direction direction) ++{ ++ u64 addr; ++ ++ if (offset + size > PAGE_SIZE) ++ return 0; ++ ++ addr = (u64)page_address(page); ++ if (addr) ++ addr += offset; ++ ++ return addr; ++} ++ ++static int ibscif_map_sg(struct ib_device *ibdev, struct scatterlist *sg, int nents, ++ enum dma_data_direction direction) ++{ ++ u64 addr; ++ int i, ret = nents; ++ ++ for (i = 0; i < nents; i++, sg++) { ++ addr = (u64)page_address(sg_page(sg)); ++ if (!addr) { ++ ret = 0; ++ break; ++ } ++ ++ sg->dma_address = sg->offset + addr; ++ sg->dma_length = sg->length; ++ } ++ return ret; ++} ++ ++static void ibscif_unmap_sg(struct ib_device *ibdev, struct scatterlist *sg, int nents, ++ enum dma_data_direction direction) ++{ ++} ++ ++static u64 ibscif_sg_dma_address(struct ib_device *ibdev, struct scatterlist *sg) ++{ ++ return (u64)sg->dma_address; ++} ++ ++static unsigned int ibscif_sg_dma_len(struct ib_device *ibdev, struct scatterlist *sg) ++{ ++ return sg->dma_length; ++} ++ ++static void *ibscif_dma_alloc_coherent(struct ib_device *ibdev, size_t size, u64 *dma_handle, gfp_t flag) ++{ ++ struct page *p = alloc_pages(flag, get_order(size)); ++ void *addr = p ? 
page_address(p) : NULL; ++ ++ if (dma_handle) ++ *dma_handle = (u64)addr; ++ ++ return addr; ++} ++ ++static void ibscif_dma_free_coherent(struct ib_device *ibdev, size_t size, void *cpu_addr, u64 dma_handle) ++{ ++ free_pages((unsigned long)cpu_addr, get_order(size)); ++} ++ ++struct ib_dma_mapping_ops ibscif_dma_mapping_ops = { ++ ibscif_mapping_error, ++ ibscif_dma_map_single, ++ ibscif_dma_nop, ++ ibscif_dma_map_page, ++ ibscif_dma_nop, ++ ibscif_map_sg, ++ ibscif_unmap_sg, ++ ibscif_sg_dma_address, ++ ibscif_sg_dma_len, ++ ibscif_dma_nop, ++ ibscif_dma_nop, ++ ibscif_dma_alloc_coherent, ++ ibscif_dma_free_coherent ++}; ++ ++static void ibscif_dump_mr_list( struct ibscif_dev *dev ) ++{ ++ struct ibscif_mr *mr; ++ ++ list_for_each_entry(mr, &dev->mr_list, entry){ ++ printk(KERN_ALERT PFX "%s: mr=%p [%llx, %x, %x]\n", __func__, mr, mr->addr, mr->length, mr->ibmr.rkey); ++ } ++} ++ ++static int ibscif_mr_reg_with_conn(struct ibscif_mr *mr, struct ibscif_conn *conn, struct ibscif_mreg_info **new_mreg) ++{ ++ struct ibscif_mreg_info *mreg; ++ off_t offset, aligned_offset; ++ u64 aligned_addr; ++ int aligned_length; ++ int offset_in_page; ++ int err; ++ ++ aligned_addr = mr->addr & PAGE_MASK; ++ offset_in_page = (int)(mr->addr & ~PAGE_MASK); ++ aligned_length = (mr->length + offset_in_page + PAGE_SIZE - 1) & PAGE_MASK; ++ aligned_offset = IBSCIF_MR_VADDR_TO_OFFSET(mr->ibmr.rkey, aligned_addr); ++ ++ offset = scif_register_pinned_pages(conn->ep, mr->pinned_pages, aligned_offset, SCIF_MAP_FIXED); ++ ++ if (IS_ERR_VALUE(offset)) { ++ printk(KERN_ALERT PFX "%s: scif_register_pinned_pages returns %d\n", __func__, (int)offset); ++ printk(KERN_ALERT PFX "%s: conn=%p, ep=%p, mr=%p, addr=%llx, length=%x, rkey=%x, " ++ "aligned_addr=%llx, aligned_length=%x, aligned_offset=%llx\n", ++ __func__, conn, conn->ep, mr, mr->addr, mr->length, mr->ibmr.rkey, ++ aligned_addr, aligned_length, (uint64_t)aligned_offset); ++ ibscif_dump_mr_list(conn->dev); ++ return (int)offset; ++ } ++ ++ BUG_ON(offset != aligned_offset); ++ ++ offset += offset_in_page; ++ ++ mreg = kzalloc(sizeof(struct ibscif_mreg_info), GFP_KERNEL); ++ if (!mreg) { ++ do { ++ err = scif_unregister(conn->ep, aligned_offset, aligned_length); ++ } ++ while (err == -ERESTARTSYS); ++ ++ if (err && err != -ENOTCONN) ++ printk(KERN_ALERT PFX "%s: scif_unregister returns %d. ep=%p, offset=%llx, length=%x\n", ++ __func__, err, conn->ep, (uint64_t)aligned_offset, aligned_length); ++ ++ return -ENOMEM; ++ } ++ mreg->conn = conn; ++ mreg->offset = (u64)offset; ++ mreg->aligned_offset = aligned_offset; ++ mreg->aligned_length = aligned_length; ++ list_add_tail(&mreg->entry, &mr->mreg_list); ++ ++ atomic_inc(&conn->refcnt); ++ if (conn->local_close) { ++ conn->local_close = 0; ++ ibscif_send_reopen(conn); ++ } ++ ++ if (new_mreg) ++ *new_mreg = mreg; ++ ++ return 0; ++} ++ ++struct ibscif_mreg_info *ibscif_mr_get_mreg(struct ibscif_mr *mr, struct ibscif_conn *conn) ++{ ++ struct ibscif_mreg_info *mreg; ++ int err; ++ int i; ++ ++ if (unlikely(!conn)) { ++ printk(KERN_ALERT PFX "%s: conn==NULL\n", __func__); ++ return NULL; ++ } ++ ++ list_for_each_entry(mreg, &mr->mreg_list, entry){ ++ if (mreg->conn == conn) ++ return mreg; ++ } ++ ++ mreg = NULL; ++ err = ibscif_mr_reg_with_conn(mr, conn, &mreg); ++ if (err != -EADDRINUSE) ++ return mreg; ++ ++ /* another thread is performing the registration */ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: mr is being registered by another thread. 
mr=%p, conn=%p.\n", __func__, mr, conn); ++ for (i=0; i<10000; i++) { ++ list_for_each_entry(mreg, &mr->mreg_list, entry){ ++ if (mreg->conn == conn) { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: got mreg after %d retries.\n", __func__, i+1); ++ return mreg; ++ } ++ } ++ schedule(); ++ } ++ if (verbose) ++ printk(KERN_INFO PFX "%s: failed to get mreg after %d retries.\n", __func__, i); ++ return NULL; ++} ++ ++static int ibscif_mr_init_mreg(struct ibscif_mr *mr) ++{ ++ struct ibscif_dev *dev = to_dev(mr->ibmr.device); ++ struct ibscif_conn *conn; ++ int prot; ++ u64 aligned_addr; ++ int aligned_length; ++ int offset_in_page; ++ int err; ++ ++ aligned_addr = mr->addr & PAGE_MASK; ++ offset_in_page = (int)(mr->addr & ~PAGE_MASK); ++ aligned_length = (mr->length + offset_in_page + PAGE_SIZE - 1) & PAGE_MASK; ++ ++#if 0 ++ prot = ((mr->access & IB_ACCESS_REMOTE_READ)?SCIF_PROT_READ:0) | ++ ((mr->access & IB_ACCESS_REMOTE_WRITE)?SCIF_PROT_WRITE:0); ++#else ++ // In IB, the same buffer can be registered multiple times with different access rights. ++ // SCIF doesn't have mechanism to support that. So we just turn on all the access rights. ++ // Otherwise we may end up with protection error. ++ prot = SCIF_PROT_READ | SCIF_PROT_WRITE; ++#endif ++ ++ err = scif_pin_pages((void *)aligned_addr, aligned_length, prot, 0/*user addr*/, &mr->pinned_pages); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: scif_pin_pages returns %d\n", __func__, err); ++ return err; ++ } ++ ++ down(&dev->mutex); ++ list_for_each_entry(conn, &dev->conn_list, entry) { ++ err = ibscif_mr_reg_with_conn(mr, conn, NULL); ++ if (err) ++ break; ++ } ++ up(&dev->mutex); ++ ++ return err; ++} ++ ++void ibscif_refresh_mreg( struct ibscif_conn *conn ) ++{ ++ struct ibscif_mr *mr; ++ ++ down(&conn->dev->mr_list_mutex); ++ list_for_each_entry(mr, &conn->dev->mr_list, entry){ ++ ibscif_mr_get_mreg(mr, conn); ++ } ++ up(&conn->dev->mr_list_mutex); ++} ++ +diff -urN a7/drivers/infiniband/hw/scif/ibscif_pd.c a8/drivers/infiniband/hw/scif/ibscif_pd.c +--- a7/drivers/infiniband/hw/scif/ibscif_pd.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_pd.c 2015-02-23 10:14:37.484809663 -0800 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++struct ib_pd *ibscif_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) ++{ ++ struct ibscif_dev *dev = to_dev(ibdev); ++ struct ibscif_pd *pd; ++ ++ if (!atomic_add_unless(&dev->pd_cnt, 1, MAX_PDS)) ++ return ERR_PTR(-EAGAIN); ++ ++ pd = kzalloc(sizeof *pd, GFP_KERNEL); ++ if (!pd) { ++ atomic_dec(&dev->pd_cnt); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ return &pd->ibpd; ++} ++ ++int ibscif_dealloc_pd(struct ib_pd *ibpd) ++{ ++ struct ibscif_dev *dev = to_dev(ibpd->device); ++ atomic_dec(&dev->pd_cnt); ++ kfree(to_pd(ibpd)); ++ return 0; ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_post.c a8/drivers/infiniband/hw/scif/ibscif_post.c +--- a7/drivers/infiniband/hw/scif/ibscif_post.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_post.c 2015-02-23 10:14:37.485809663 -0800 +@@ -0,0 +1,306 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++void ibscif_dump_sg(char *str, struct ib_sge *sge, int num) ++{ ++ extern void ibscif_dump(char*, void*, int); ++ if (!sge) ++ return; ++ while (num--) { ++ ibscif_dump(str, (void*)sge->addr, sge->length); ++ sge++; ++ } ++} ++ ++/* ++ * Build and validate the wr->ds_list from the given sg_list. ++ * If successful, a reference is held on each mr in the wr->ds_list. 
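++ * On failure the references taken so far are released again through
++ * ibscif_clear_ds_refs(), and overflow of the accumulated *total_length is
++ * rejected with -EOVERFLOW.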
++ */ ++static int ibscif_wr_ds(struct ib_pd *ibpd, struct ib_sge *sg_list, int num_sge, ++ struct ibscif_wr *wr, int *total_length, enum ib_access_flags access) ++{ ++ struct ibscif_ds *ds_list = wr->ds_list; ++ int err; ++ ++ *total_length = 0; ++ for (wr->num_ds = 0; wr->num_ds < num_sge; sg_list++, ds_list++) { ++ ++ ds_list->mr = ibscif_validate_mr(sg_list->lkey, sg_list->addr, sg_list->length, ibpd, access); ++ if (unlikely(IS_ERR(ds_list->mr))) { ++ err = PTR_ERR(ds_list->mr); ++ goto out; ++ } ++ ++ ds_list->in_use = 1; ++ wr->num_ds++; ++ ++ if (unlikely((*total_length + sg_list->length) < *total_length)) { ++ err = -EOVERFLOW; ++ goto out; ++ } ++ ++ ds_list->offset = sg_list->addr - ds_list->mr->addr; ++ ds_list->length = sg_list->length; ++ ds_list->lkey = sg_list->lkey; ++ ds_list->current_mreg = NULL; ++ ++ *total_length += ds_list->length; ++ } ++ ++ return 0; ++out: ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ return err; ++} ++ ++int ibscif_post_send(struct ib_qp *ibqp, struct ib_send_wr *ibwr, struct ib_send_wr **bad_wr) ++{ ++ struct ibscif_qp *qp = to_qp(ibqp); ++ struct ibscif_wq *sq = &qp->sq; ++ struct ibscif_wr *wr; ++ int nreq = 0, err; ++ ++ IBSCIF_PERF_SAMPLE(0, 0); ++ ++ spin_lock_bh(&sq->lock); ++ ++ if (unlikely(ibqp->qp_type != IB_QPT_UD && qp->state != QP_CONNECTED)) { ++ err = -ENOTCONN; ++ goto out; ++ } ++ if (unlikely(!sq->size)) { ++ err = -ENOSPC; ++ goto out; ++ } ++ ++ for (err = 0; ibwr; ibwr = ibwr->next, nreq++) { ++ ++ if (unlikely(sq->depth == sq->size)) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ if (unlikely(ibwr->num_sge > sq->max_sge)) { ++ err = -E2BIG; ++ goto out; ++ } ++ ++ wr = ibscif_get_wr(sq, sq->tail); ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->id = ibwr->wr_id; ++ wr->opcode = ibwr->opcode; ++ wr->flags = ibwr->send_flags | ((qp->sq_policy == IB_SIGNAL_ALL_WR) ? 
IB_SEND_SIGNALED : 0); ++ wr->state = WR_WAITING; ++ wr->use_rma = 0; ++ wr->rma_id = 0; ++ ++ if (ibqp->qp_type == IB_QPT_UD) { ++ wr->opcode = WR_UD; ++ wr->ud.remote_node_id = IBSCIF_LID_TO_NODE_ID(be16_to_cpu(to_ah(ibwr->wr.ud.ah)->dlid)); ++ wr->ud.remote_qpn = ibwr->wr.ud.remote_qpn; ++ ++ /* the remainings are the same as IB_WR_SEND */ ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0); ++ if (unlikely(err)) ++ goto out; ++ wr->msg_id = sq->wirestate->tx.next_msg_id++; ++ } ++ ++ else switch (ibwr->opcode) { ++ ++ case IB_WR_SEND_WITH_IMM: ++ wr->send.immediate_data = ibwr->ex.imm_data; ++ case IB_WR_SEND: ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0); ++ if (unlikely(err)) ++ goto out; ++ wr->msg_id = sq->wirestate->tx.next_msg_id++; ++ if (wr->length > rma_threshold) { ++ wr->use_rma = 1; ++ wr->rma_id = sq->next_msg_id; ++ } ++ break; ++ ++ case IB_WR_RDMA_WRITE_WITH_IMM: ++ wr->msg_id = sq->wirestate->tx.next_msg_id++; ++ wr->write.immediate_data = ibwr->ex.imm_data; ++ case IB_WR_RDMA_WRITE: ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, 0); ++ if (unlikely(err)) ++ goto out; ++ if (wr->length && ++ ((ibwr->wr.rdma.remote_addr + wr->length - 1) < ibwr->wr.rdma.remote_addr)) { ++ err = -EOVERFLOW; ++ goto out; ++ } ++ wr->write.remote_address = ibwr->wr.rdma.remote_addr; ++ wr->write.rkey = ibwr->wr.rdma.rkey; ++ if (ibwr->opcode == IB_WR_RDMA_WRITE) ++ wr->msg_id = 0; ++ if (wr->length > rma_threshold) { ++ wr->use_rma = 1; ++ wr->rma_id = sq->next_msg_id; ++ } ++ break; ++ ++ case IB_WR_RDMA_READ: ++ if (unlikely(!qp->max_or)) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE); ++ if (unlikely(err)) ++ goto out; ++ if (wr->length && ++ ((ibwr->wr.rdma.remote_addr + wr->length - 1) < ibwr->wr.rdma.remote_addr)) { ++ err = -EOVERFLOW; ++ goto out; ++ } ++ wr->read.remote_address = ibwr->wr.rdma.remote_addr; ++ wr->read.remote_length = wr->length; ++ wr->read.rkey = ibwr->wr.rdma.rkey; ++ wr->length = 0; /* No tx data with this opcode */ ++ wr->msg_id = sq->next_msg_id; ++ atomic_inc(&qp->or_posted); ++ if (wr->read.remote_length > rma_threshold) { ++ wr->use_rma = 1; ++ wr->rma_id = wr->msg_id; ++ } ++ break; ++ ++ case IB_WR_ATOMIC_CMP_AND_SWP: ++ case IB_WR_ATOMIC_FETCH_AND_ADD: ++ if (unlikely(!qp->max_or)) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ if (unlikely(ibwr->wr.atomic.remote_addr & (sizeof wr->atomic_rsp.orig_data - 1))) { ++ err = -EADDRNOTAVAIL; ++ goto out; ++ } ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE); ++ if (unlikely(err)) ++ goto out; ++ if (unlikely(wr->length < sizeof wr->atomic_rsp.orig_data)) { ++ err = -EINVAL; ++ goto out; ++ } ++ if (ibwr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { ++ wr->cmp_swp.cmp_operand = ibwr->wr.atomic.compare_add; ++ wr->cmp_swp.swp_operand = ibwr->wr.atomic.swap; ++ wr->cmp_swp.remote_address = ibwr->wr.atomic.remote_addr; ++ wr->cmp_swp.rkey = ibwr->wr.atomic.rkey; ++ } else { ++ wr->fetch_add.add_operand = ibwr->wr.atomic.compare_add; ++ wr->fetch_add.remote_address = ibwr->wr.atomic.remote_addr; ++ wr->fetch_add.rkey = ibwr->wr.atomic.rkey; ++ } ++ wr->length = 0; /* No tx data with these opcodes */ ++ wr->msg_id = sq->next_msg_id; ++ atomic_inc(&qp->or_posted); ++ break; ++ ++ default: ++ err = -ENOMSG; ++ goto out; ++ } ++ ++ DEV_STAT(qp->dev, wr_opcode[wr->opcode]++); ++ 
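/* The WR is fully built: advance the SQ tail while sq->lock is held; ibscif_schedule() picks it up once the lock is dropped below. */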
ibscif_append_wq(sq); ++ } ++out: ++ spin_unlock_bh(&sq->lock); ++ ++ IBSCIF_PERF_SAMPLE(1, 0); ++ ++ if (err) ++ *bad_wr = ibwr; ++ if (nreq) ++ ibscif_schedule(sq); ++ ++ IBSCIF_PERF_SAMPLE(9, 1); ++ ++ return err; ++} ++ ++int ibscif_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ibwr, struct ib_recv_wr **bad_wr) ++{ ++ struct ibscif_qp *qp = to_qp(ibqp); ++ struct ibscif_wq *rq = &qp->rq; ++ struct ibscif_wr *wr; ++ int err; ++ ++ spin_lock_bh(&rq->lock); ++ ++ if ((qp->state != QP_IDLE) && (qp->state != QP_CONNECTED)) { ++ err = -ENOTCONN; ++ goto out; ++ } ++ if (unlikely(!rq->size)) { ++ err = -ENOSPC; ++ goto out; ++ } ++ ++ for (err = 0; ibwr; ibwr = ibwr->next) { ++ ++ if (unlikely(rq->depth == rq->size)) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ if (unlikely(ibwr->num_sge > rq->max_sge)) { ++ err = -E2BIG; ++ goto out; ++ } ++ ++ wr = ibscif_get_wr(rq, rq->tail); ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->id = ibwr->wr_id; ++ wr->msg_id = rq->next_msg_id; ++ wr->state = WR_WAITING; ++ ++ err = ibscif_wr_ds(ibqp->pd, ibwr->sg_list, ibwr->num_sge, wr, &wr->length, IB_ACCESS_LOCAL_WRITE); ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ if (unlikely(err)) ++ goto out; ++ ++ ibscif_append_wq(rq); ++ } ++out: ++ spin_unlock_bh(&rq->lock); ++ if (err) ++ *bad_wr = ibwr; ++ ++ return err; ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_procfs.c a8/drivers/infiniband/hw/scif/ibscif_procfs.c +--- a7/drivers/infiniband/hw/scif/ibscif_procfs.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_procfs.c 2015-02-23 10:14:37.485809663 -0800 +@@ -0,0 +1,180 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "ibscif_driver.h" ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) ++static int ibscif_stats_show(struct seq_file *m, void *v) ++#else ++static int ibscif_stats_read(char *page, char **start, off_t offset, ++ int count, int *eof, void *data) ++#endif ++{ ++ int l = 0; ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) ++ struct ibscif_dev *dev = m->private; ++#else ++ struct ibscif_dev *dev = data; ++ char *m = page; ++ ++ if (offset) ++ return 0; ++ ++ *eof = 1; ++#endif ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) ++ seq_printf ++#else ++ l += sprintf ++#endif ++ (m, ++ "%s statistics:\n" ++ " tx_bytes %lu rx_bytes %lu\n" ++ " tx_pkts %lu rx_pkts %lu loopback_pkts %lu\n" ++ " sched_exhaust %lu unavailable %lu\n" ++ " tx_errors %lu duplicates %lu\n" ++ " total wr %lu :\n" ++ " send %lu send_imm %lu write %lu write_imm %lu\n" ++ " recv %lu recv_imm %lu read %lu comp %lu fetch %lu\n" ++ " read_rsp %lu atomic_rsp %lu ud %lu\n" ++ " fast_rdma :\n" ++ " write %lu read %lu unavailable %lu fallback %lu force_ack %lu tail_write %lu\n", ++ dev->ibdev.name, ++ DEV_STAT(dev, bytes_sent), ++ DEV_STAT(dev, bytes_rcvd), ++ DEV_STAT(dev, packets_sent), ++ DEV_STAT(dev, packets_rcvd), ++ DEV_STAT(dev, loopback), ++ DEV_STAT(dev, sched_exhaust), ++ DEV_STAT(dev, unavailable), ++ DEV_STAT(dev, tx_errors), ++ DEV_STAT(dev, duplicates), ++ DEV_STAT(dev, wr_opcode[WR_SEND]) + ++ DEV_STAT(dev, wr_opcode[WR_SEND_WITH_IMM]) + ++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE]) + ++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE_WITH_IMM]) + ++ DEV_STAT(dev, recv) + ++ DEV_STAT(dev, recv_imm) + ++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ]) + ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_CMP_AND_SWP]) + ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_FETCH_AND_ADD]) + ++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ_RSP]) + ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_RSP]), ++ DEV_STAT(dev, wr_opcode[WR_SEND]), ++ DEV_STAT(dev, wr_opcode[WR_SEND_WITH_IMM]), ++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE]), ++ DEV_STAT(dev, wr_opcode[WR_RDMA_WRITE_WITH_IMM]), ++ DEV_STAT(dev, recv), ++ DEV_STAT(dev, recv_imm), ++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ]), ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_CMP_AND_SWP]), ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_FETCH_AND_ADD]), ++ DEV_STAT(dev, wr_opcode[WR_RDMA_READ_RSP]), ++ DEV_STAT(dev, wr_opcode[WR_ATOMIC_RSP]), ++ DEV_STAT(dev, wr_opcode[WR_UD]), ++ DEV_STAT(dev, fast_rdma_write), ++ DEV_STAT(dev, fast_rdma_read), ++ DEV_STAT(dev, fast_rdma_unavailable), ++ DEV_STAT(dev, fast_rdma_fallback), ++ DEV_STAT(dev, fast_rdma_force_ack), ++ DEV_STAT(dev, fast_rdma_tail_write) ++ ); ++ ++ return l; ++} ++ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)) ++static ssize_t ibscif_stats_write(struct file *file, const char __user *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct ibscif_dev *dev = PDE_DATA(file_inode(file)); ++ memset(&dev->stats, 0, sizeof dev->stats); ++ return count; ++} ++ ++static int ibscif_stats_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, ibscif_stats_show, PDE_DATA(inode)); ++} ++ ++struct file_operations ibscif_fops = { ++ .owner = THIS_MODULE, ++ .open = ibscif_stats_open, ++ .read = seq_read, ++ .write = ibscif_stats_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++int ibscif_procfs_add_dev(struct ibscif_dev *dev) ++{ ++ dev->procfs = proc_mkdir(dev->ibdev.name, init_net.proc_net); ++ if (!dev->procfs) ++ return -ENOENT; ++ ++ if (proc_create_data("stats", S_IRUGO | S_IWUGO, dev->procfs, ++ &ibscif_fops ,dev)) ++ return -ENOENT; 
++ ++ return 0; ++} ++#else /* (LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)) */ ++static int ibscif_stats_write(struct file *file, const char __user *buffer, unsigned long count, void *data) ++{ ++ struct ibscif_dev *dev = data; ++ memset(&dev->stats, 0, sizeof dev->stats); ++ return count; ++} ++ ++int ibscif_procfs_add_dev(struct ibscif_dev *dev) ++{ ++ struct proc_dir_entry *entry; ++ ++ dev->procfs = proc_mkdir(dev->ibdev.name, init_net.proc_net); ++ if (!dev->procfs) ++ return -ENOENT; ++ ++ entry = create_proc_read_entry("stats", S_IRUGO | S_IWUGO, dev->procfs, ibscif_stats_read, dev); ++ if (!entry) ++ return -ENOENT; ++ entry->write_proc = ibscif_stats_write; ++ ++ return 0; ++} ++#endif ++ ++void ibscif_procfs_remove_dev(struct ibscif_dev *dev) ++{ ++ if (dev->procfs) ++ remove_proc_entry("stats", dev->procfs); ++ remove_proc_entry(dev->ibdev.name, init_net.proc_net); ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_protocol.c a8/drivers/infiniband/hw/scif/ibscif_protocol.c +--- a7/drivers/infiniband/hw/scif/ibscif_protocol.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_protocol.c 2015-02-23 10:14:37.487809663 -0800 +@@ -0,0 +1,2816 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */
++
++#include "ibscif_driver.h"
++
++#include
++/* dev/wr/qp backpointers overlayed in skb cb[] */
++struct ibscif_skb_cb {
++ struct ibscif_dev *dev;
++ struct ibscif_wr *wr;
++ scif_epd_t scif_ep;
++ struct ibscif_qp *qp; /* for UD only */
++};
++
++#define SET_SKB_DEV(skb,dev0) ((struct ibscif_skb_cb *)&skb->cb)->dev = dev0
++#define SET_SKB_WR(skb,wr0) ((struct ibscif_skb_cb *)&skb->cb)->wr = wr0
++#define SET_SKB_EP(skb,ep0) ((struct ibscif_skb_cb *)&skb->cb)->scif_ep = ep0
++#define SET_SKB_QP(skb,qp0) ((struct ibscif_skb_cb *)&skb->cb)->qp = qp0
++
++#define GET_SKB_DEV(skb) ((struct ibscif_skb_cb *)&skb->cb)->dev
++#define GET_SKB_WR(skb) ((struct ibscif_skb_cb *)&skb->cb)->wr
++#define GET_SKB_EP(skb) ((struct ibscif_skb_cb *)&skb->cb)->scif_ep
++#define GET_SKB_QP(skb) ((struct ibscif_skb_cb *)&skb->cb)->qp
++
++#define hw_addr_equal(h1, h2) (!memcmp(h1, h2, ETH_ALEN))
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0)
++ #define KMAP(x) kmap(x->page)
++ #define KUNMAP(x) kunmap(x->page)
++ #define SET_PAGE(x,y) x->page = y
++ #define GET_PAGE(x) get_page(x->page)
++#else
++ #define KMAP(x) kmap(skb_frag_page(x))
++ #define KUNMAP(x) kunmap(skb_frag_page(x))
++ #define SET_PAGE(x,y) __skb_frag_set_page(x, y)
++ #define GET_PAGE(x) __skb_frag_ref(x)
++#endif
++
++void ibscif_skb_destructor(struct sk_buff *skb)
++{
++ struct ibscif_dev *dev = GET_SKB_DEV(skb);
++
++ /* A sk_buff is now available. */
++ if (atomic_inc_return(&dev->available) == 1)
++ ; /* Could invoke the scheduler here. */
++
++ /* Release the module reference held for this sk_buff. */
++ module_put(THIS_MODULE);
++}
++
++static struct sk_buff *ibscif_alloc_tx_skb(struct ibscif_dev *dev, int hdr_size, int payload_size)
++{
++ struct sk_buff *skb;
++
++ skb = dev_alloc_skb(hdr_size);
++ if (unlikely(!skb))
++ return NULL;
++
++ skb_reset_mac_header(skb);
++ skb_reset_network_header(skb);
++
++ skb->protocol = IBSCIF_PACKET_TYPE;
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ skb->priority = TC_PRIO_CONTROL; /* highest defined priority */
++ skb->dev = (void *) dev;
++ skb->len = hdr_size + payload_size;
++ skb->data_len = payload_size;
++ skb->tail += hdr_size;
++
++ return skb;
++}
++
++static struct sk_buff_head xmit_queue;
++static void ibscif_xmit_work_handler( struct work_struct *context );
++static DECLARE_WORK(ibscif_xmit_work, ibscif_xmit_work_handler);
++static atomic_t xmit_busy = ATOMIC_INIT(0);
++
++static void ibscif_xmit_work_handler( struct work_struct *context )
++{
++ struct sk_buff *skb;
++ scif_epd_t scif_ep;
++ int num_frags;
++ skb_frag_t *frag;
++ void *vaddr;
++ int ret;
++ int hdr_size;
++ int i;
++ struct ibscif_qp *qp;
++
++again:
++ while ((skb = skb_dequeue(&xmit_queue))) {
++ scif_ep = GET_SKB_EP(skb);
++ if (!scif_ep) {
++ printk(KERN_ALERT PFX "%s: NULL scif_ep, skb=%p\n", __func__, skb);
++ goto next;
++ }
++
++ hdr_size = skb->len - skb->data_len;
++ for (i=0; i<hdr_size; ) {
++ ret = scif_send(scif_ep, skb->data+i, hdr_size-i,
++ blocking_send ? SCIF_SEND_BLOCK : 0);
++ if (ret < 0) {
++ printk(KERN_ALERT PFX "%s: fail to send header, hdr_size=%d, ret=%d\n", __func__, hdr_size, ret);
++ goto next;
++ }
++ i += ret;
++ }
++
++ num_frags = skb_shinfo(skb)->nr_frags;
++ frag = skb_shinfo(skb)->frags;
++ while (num_frags--) {
++ vaddr = KMAP(frag); /* because scif_send() may cause scheduling */
++ for (i=0; i<frag->size; ) {
++ ret = scif_send(scif_ep, vaddr + frag->page_offset + i,
++ frag->size - i,
++ blocking_send ?
SCIF_SEND_BLOCK : 0); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_send returns %d, frag_size=%d\n", __func__, ret, frag->size); ++ break; ++ } ++ i += ret; ++ } ++ KUNMAP(frag); ++ frag++; ++ } ++next: ++ qp = GET_SKB_QP(skb); ++ if (qp && qp->ibqp.qp_type == IB_QPT_UD) { ++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data; ++ u16 opcode = __be16_to_cpu(pdu->ibscif.hdr.opcode); ++ if (ibscif_pdu_is_last(opcode)) { ++ struct ibscif_wr *wr = GET_SKB_WR(skb); ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ wr->state = WR_COMPLETED; ++ ibscif_process_sq_completions(GET_SKB_QP(skb)); ++ } ++ /* Release the reference held on UD QPs */ ++ ibscif_put_qp(qp); ++ } ++ kfree_skb(skb); ++ } ++ ++ if (!skb_queue_empty(&xmit_queue)) ++ goto again; ++ ++ atomic_set(&xmit_busy, 0); ++} ++ ++static void ibscif_dev_queue_xmit(struct sk_buff *skb) ++{ ++ struct ibscif_dev *dev=NULL; ++ int len = 0; ++ ++ if (skb) { ++ dev = GET_SKB_DEV(skb); ++ len = skb->len; ++ skb_queue_tail(&xmit_queue, skb); ++ } ++ ++ /* only one instance can be enqueued, otherwise there is race condition between scif_send() calls. */ ++ /* notice that the current running worker may miss the newly added item, but it will be picked up in the poll_thread */ ++ if (!atomic_xchg(&xmit_busy, 1)) ++ schedule_work(&ibscif_xmit_work); ++ ++ if (likely(dev)) { ++ DEV_STAT(dev, packets_sent++); ++ DEV_STAT(dev, bytes_sent += len); ++ } ++} ++ ++static int ibscif_create_hdr(struct ibscif_qp *qp, struct ibscif_wr *wr, struct sk_buff *skb, ++ u32 seq_num, u32 wr_len_remaining, int force) ++{ ++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data; ++ u32 sq_seq, iq_seq; ++ u16 opcode; ++ int i; ++ ++ sq_seq = qp->wire.sq.rx.last_in_seq; ++ iq_seq = qp->wire.iq.rx.last_in_seq; ++ qp->wire.sq.rx.last_seq_acked = sq_seq; ++ qp->wire.iq.rx.last_seq_acked = iq_seq; ++ ++ pdu->ibscif.hdr.length = __cpu_to_be16(skb->data_len); ++ if (qp->ibqp.qp_type == IB_QPT_UD) { ++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(wr->ud.remote_qpn); ++ } ++ else { ++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(qp->remote_qpn); ++ } ++ pdu->ibscif.hdr.src_qp = __cpu_to_be32(qp->ibqp.qp_num); ++ pdu->ibscif.hdr.seq_num = __cpu_to_be32(seq_num); ++ pdu->ibscif.hdr.sq_ack_num = __cpu_to_be32(sq_seq); ++ pdu->ibscif.hdr.iq_ack_num = __cpu_to_be32(iq_seq); ++ ++ switch (wr->opcode) { ++ case WR_UD: ++ opcode = ibscif_op_ud; ++ if (skb->data_len == wr_len_remaining) { ++ opcode = ibscif_pdu_set_last(opcode); ++ if (wr->flags & IB_SEND_SIGNALED) ++ force = 1; ++ if (wr->flags & IB_SEND_SOLICITED) ++ opcode = ibscif_pdu_set_se(opcode); ++ } ++ pdu->ibscif.ud.msg_length = __cpu_to_be32(wr->length); ++ pdu->ibscif.ud.msg_offset = __cpu_to_be32(wr->length - wr_len_remaining); ++ memset(&pdu->ibscif.ud.grh, 0, 40); ++ break; ++ ++ case WR_SEND: ++ case WR_SEND_WITH_IMM: ++ opcode = ibscif_op_send; ++ if (skb->data_len == wr_len_remaining || opcode == ibscif_op_send_rma) { ++ opcode = ibscif_pdu_set_last(opcode); ++ if (wr->flags & IB_SEND_SIGNALED) ++ force = 1; ++ if (wr->opcode == WR_SEND_WITH_IMM) { ++ opcode = ibscif_pdu_set_immed(opcode); ++ pdu->ibscif.send.immed_data = __cpu_to_be32(wr->send.immediate_data); ++ } else pdu->ibscif.send.immed_data = 0; ++ if (wr->flags & IB_SEND_SOLICITED) ++ opcode = ibscif_pdu_set_se(opcode); ++ } ++ pdu->ibscif.send.msg_id = __cpu_to_be32(wr->msg_id); ++ pdu->ibscif.send.msg_length = __cpu_to_be32(wr->length); ++ pdu->ibscif.send.msg_offset = __cpu_to_be32(wr->length - wr_len_remaining); ++ if 
(wr->use_rma) {
++ opcode = ibscif_op_send_rma;
++ pdu->ibscif.send.rma_id = __cpu_to_be32(wr->rma_id);
++ pdu->ibscif.send.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.send.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.send.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_RDMA_READ:
++ opcode = ibscif_op_read;
++ pdu->ibscif.read_req.rdma_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.read_req.rdma_key = __cpu_to_be32(wr->read.rkey);
++ pdu->ibscif.read_req.rdma_length= __cpu_to_be32(wr->read.remote_length);
++ pdu->ibscif.read_req.rdma_address = __cpu_to_be64(wr->read.remote_address);
++ if (wr->use_rma) {
++ opcode = ibscif_op_read_rma;
++ pdu->ibscif.read_req.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.read_req.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.read_req.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ opcode = ibscif_op_write;
++ if ((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
++ opcode = ibscif_pdu_set_immed(opcode);
++ pdu->ibscif.write.immed_data = __cpu_to_be32(wr->write.immediate_data);
++ if (wr->flags & IB_SEND_SOLICITED)
++ opcode = ibscif_pdu_set_se(opcode);
++ } else pdu->ibscif.write.immed_data = 0;
++ if (skb->data_len == wr_len_remaining || opcode == ibscif_op_write_rma) {
++ opcode = ibscif_pdu_set_last(opcode);
++ if (wr->flags & IB_SEND_SIGNALED)
++ force = 1;
++ }
++ pdu->ibscif.write.msg_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.write.rdma_key = __cpu_to_be32(wr->write.rkey);
++ pdu->ibscif.write.rdma_address = __cpu_to_be64(wr->write.remote_address +
++ (wr->length - wr_len_remaining));
++ if (wr->use_rma) {
++ opcode = ibscif_op_write_rma;
++ if (wr->opcode == WR_RDMA_WRITE_WITH_IMM)
++ opcode = ibscif_pdu_set_immed(opcode);
++ pdu->ibscif.write.rma_id = __cpu_to_be32(wr->rma_id);
++ pdu->ibscif.write.rma_length = __cpu_to_be32(wr->length);
++ pdu->ibscif.write.num_rma_addrs = __cpu_to_be32(wr->num_ds);
++ for (i=0; i<wr->num_ds; i++) {
++ pdu->ibscif.write.rma_addrs[i].offset = __cpu_to_be64(wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset);
++ pdu->ibscif.write.rma_addrs[i].length = __cpu_to_be32(wr->ds_list[i].length);
++ }
++ }
++ break;
++
++ case WR_ATOMIC_CMP_AND_SWP:
++ opcode = ibscif_pdu_set_last(ibscif_op_comp_swap);
++ pdu->ibscif.comp_swap.atomic_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.comp_swap.atomic_key = __cpu_to_be32(wr->cmp_swp.rkey);
++ pdu->ibscif.comp_swap.comp_data = __cpu_to_be64(wr->cmp_swp.cmp_operand);
++ pdu->ibscif.comp_swap.swap_data = __cpu_to_be64(wr->cmp_swp.swp_operand);
++ pdu->ibscif.comp_swap.atomic_address = __cpu_to_be64(wr->cmp_swp.remote_address);
++ break;
++
++ case WR_ATOMIC_FETCH_AND_ADD:
++ opcode = ibscif_pdu_set_last(ibscif_op_fetch_add);
++ pdu->ibscif.fetch_add.atomic_id = __cpu_to_be32(wr->msg_id);
++ pdu->ibscif.fetch_add.atomic_key = __cpu_to_be32(wr->fetch_add.rkey);
++ pdu->ibscif.fetch_add.add_data = __cpu_to_be64(wr->fetch_add.add_operand);
++ pdu->ibscif.fetch_add.atomic_address = __cpu_to_be64(wr->fetch_add.remote_address);
++ break;
++
++ case WR_RDMA_READ_RSP:
++ opcode = ibscif_op_read_rsp;
++ if (skb->data_len == wr_len_remaining)
++ opcode = ibscif_pdu_set_last(opcode);
++ pdu->ibscif.read_rsp.rdma_id =
__cpu_to_be32(wr->msg_id); ++ pdu->ibscif.read_rsp.rdma_offset = __cpu_to_be32(wr->length - wr_len_remaining); ++ break; ++ ++ case WR_ATOMIC_RSP: ++ opcode = ibscif_pdu_set_last(wr->atomic_rsp.opcode); ++ pdu->ibscif.atomic_rsp.atomic_id = __cpu_to_be32(wr->msg_id); ++ pdu->ibscif.atomic_rsp.orig_data = __cpu_to_be64(wr->atomic_rsp.orig_data); ++ break; ++ ++ case WR_RMA_RSP: ++ opcode = ibscif_op_rma_rsp; ++ pdu->ibscif.rma_rsp.rma_id = __cpu_to_be32(wr->msg_id); ++ pdu->ibscif.rma_rsp.xfer_length = __cpu_to_be32(wr->rma_rsp.xfer_length); ++ pdu->ibscif.rma_rsp.error = __cpu_to_be32(wr->rma_rsp.error); ++ break; ++ default: ++ printk(KERN_ERR PFX "%s() invalid opcode %d\n", __func__, wr->opcode); ++ return 1; ++ } ++ ++ if (force) ++ opcode = ibscif_pdu_set_force_ack(opcode); ++ ++ pdu->ibscif.hdr.opcode = __cpu_to_be16(opcode); ++ ++ return 0; ++} ++ ++static struct sk_buff* ibscif_alloc_pdu(struct ibscif_dev *dev, struct ibscif_qp *qp, struct ibscif_wr *wr, ++ int hdr_size, u32 seq_num, u32 payload_size, u32 len_remaining, int force) ++{ ++ struct sk_buff *skb; ++ struct ibscif_full_frame *pdu; ++ ++ if (unlikely(!qp->conn && qp->ibqp.qp_type != IB_QPT_UD)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return NULL; ++ } ++ ++ if (!atomic_add_unless(&dev->available, -1, 0)) { ++ printk(KERN_NOTICE PFX "%s throttled by available tx buffer limit\n", dev->ibdev.name); ++ DEV_STAT(dev, unavailable++); ++ return NULL; ++ } ++ ++ /* Get an skb for this protocol packet. */ ++ skb = ibscif_alloc_tx_skb(dev, hdr_size, payload_size); ++ if (unlikely(!skb)) ++ goto bail; ++ ++ /* Hold a reference on the module until skb->destructor is called. */ ++ __module_get(THIS_MODULE); ++ skb->destructor = ibscif_skb_destructor; ++ ++ SET_SKB_DEV(skb, dev); ++ SET_SKB_WR(skb, wr); ++ ++ if (qp->ibqp.qp_type == IB_QPT_UD) { ++ struct ibscif_conn *conn; ++ int flag = qp->ibqp.qp_num > wr->ud.remote_qpn; ++ conn = ibscif_get_conn(qp->local_node_id, wr->ud.remote_node_id, flag); ++ if (unlikely(!conn)) { ++ kfree_skb(skb); ++ goto bail; ++ } ++ ++ ibscif_qp_add_ud_conn(qp, conn); ++ ibscif_put_conn(conn); ++ SET_SKB_EP(skb, conn->ep); ++ SET_SKB_QP(skb, qp); ++ ++ /* Reference UD QPs until the wr is transmitted by ibscif_xmit_work_handler */ ++ kref_get(&qp->ref); ++ } ++ else { ++ SET_SKB_EP(skb, qp->conn->ep); ++ } ++ ++ /* Construct the header and copy it to the skb. */ ++ if (unlikely(ibscif_create_hdr(qp, wr, skb, seq_num, len_remaining, force))) { ++ kfree_skb(skb); ++ goto bail; ++ } ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(hdr_size); ++ ++ return skb; ++bail: ++ atomic_inc(&dev->available); ++ return NULL; ++} ++ ++static int ibscif_send_null_pdu(struct ibscif_dev *dev, struct ibscif_qp *qp, struct ibscif_wr *wr, u32 hdr_size) ++{ ++ struct sk_buff *skb; ++ ++ /* Allocate an initialized skb with a PDU header. 
*/
++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, wr->sar.seg.starting_seq, 0, 0, 0);
++ if (unlikely(!skb))
++ return 0;
++
++ ibscif_dev_queue_xmit(skb);
++ return 1;
++}
++
++static int get_hdr_size_from_wr(struct ibscif_wr *wr)
++{
++ switch (wr->opcode) {
++ case WR_UD: return sizeof(struct ud_hdr);
++ case WR_SEND:
++ case WR_SEND_WITH_IMM: return sizeof(struct send_hdr);
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM: return sizeof(struct write_hdr);
++ case WR_RDMA_READ: return sizeof(struct read_req_hdr);
++ case WR_ATOMIC_CMP_AND_SWP: return sizeof(struct comp_swap_hdr);
++ case WR_ATOMIC_FETCH_AND_ADD: return sizeof(struct fetch_add_hdr);
++ case WR_RDMA_READ_RSP: return sizeof(struct read_rsp_hdr);
++ case WR_ATOMIC_RSP: return sizeof(struct atomic_rsp_hdr);
++ case WR_RMA_RSP: return sizeof(struct rma_rsp_hdr);
++ default: return 0;
++ }
++}
++
++static int get_rma_addr_size_from_wr(struct ibscif_wr *wr)
++{
++ switch (wr->opcode) {
++ case WR_UD: return 0;
++ case WR_SEND:
++ case WR_SEND_WITH_IMM:
++ case WR_RDMA_WRITE:
++ case WR_RDMA_WRITE_WITH_IMM:
++ case WR_RDMA_READ: return wr->num_ds * sizeof(struct rma_addr);
++ case WR_ATOMIC_CMP_AND_SWP: return 0;
++ case WR_ATOMIC_FETCH_AND_ADD: return 0;
++ case WR_RDMA_READ_RSP: return 0;
++ case WR_ATOMIC_RSP: return 0;
++ case WR_RMA_RSP: return 0;
++ default: return 0;
++ }
++}
++
++static int setup_rma_addrs(struct ibscif_wq *wq, struct ibscif_wr *wr)
++{
++ struct ibscif_ds *ds;
++ int i;
++
++ if (!wr->num_ds)
++ return 1;
++
++ for (i=0; i<wr->num_ds; i++) {
++ ds = &wr->ds_list[i];
++ if (!ds->current_mreg)
++ ds->current_mreg = ibscif_mr_get_mreg(ds->mr, wq->qp->conn);
++
++ if (!ds->current_mreg)
++ return 0;
++ }
++
++ return 1;
++}
++
++/* when necessary SCIF will allocate temp buffer to align up cache line offset.
++ * * so we only need to use roffset to calculate the dma size.
++ * */
++static inline int ibscif_dma_size(u32 len, u64 roffset)
++{
++ u32 head, tail;
++
++ tail = (roffset + len) % 64;
++ head = (64 - roffset % 64) % 64;
++ if (len >= head + tail)
++ return (len - head - tail);
++ else
++ return 0;
++}
++
++static void ibscif_send_ack(struct ibscif_qp *qp); /* defined later in this file */
++
++static int ibscif_try_fast_rdma(struct ibscif_wq *wq, struct ibscif_wr *wr)
++{
++ struct ibscif_qp *qp;
++ int i, err;
++ u64 loffset, roffset;
++ u32 total_length, rdma_length, xfer_len;
++ u64 raddress;
++ u32 rkey;
++ enum ib_access_flags access;
++ u32 dma_size = 0;
++ int rma_flag = 0;
++
++ IBSCIF_PERF_SAMPLE(2, 0);
++
++ switch (wr->opcode) {
++ case WR_RDMA_WRITE:
++ raddress = wr->write.remote_address;
++ rkey = wr->write.rkey;
++ total_length = rdma_length = wr->length;
++ access = IB_ACCESS_REMOTE_WRITE;
++ break;
++
++ case WR_RDMA_READ:
++ raddress = wr->read.remote_address;
++ rkey = wr->read.rkey;
++ total_length = rdma_length = wr->read.remote_length; /* wr->length is 0 */
++ access = IB_ACCESS_REMOTE_READ;
++ break;
++
++ default:
++ return 0;
++ }
++
++ qp = wq->qp;
++
++ if (unlikely(!qp->conn)) {
++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__);
++ return 0;
++ }
++
++ if (!setup_rma_addrs(wq, wr)) {
++ DEV_STAT(qp->dev, fast_rdma_fallback++);
++ return 0;
++ }
++
++ roffset = IBSCIF_MR_VADDR_TO_OFFSET( rkey, raddress );
++
++ for (i=0; i<wr->num_ds; i++) {
++ if (rdma_length == 0)
++ break;
++
++ loffset = wr->ds_list[i].current_mreg->offset + wr->ds_list[i].offset;
++ xfer_len = min(wr->ds_list[i].length, rdma_length);
++ if (xfer_len == 0)
++ continue;
++
++ IBSCIF_PERF_SAMPLE(3, 0);
++
++ dma_size = ibscif_dma_size(xfer_len, roffset);
++
++ if (i==wr->num_ds-1)
++ rma_flag = dma_size ? SCIF_RMA_SYNC : 0;
++
++ if (wr->opcode == WR_RDMA_WRITE) {
++ err = scif_writeto(wq->qp->conn->ep, loffset, xfer_len, roffset, rma_flag|SCIF_RMA_ORDERED);
++ if (err)
++ printk(KERN_INFO PFX "%s(): error writing ordered message, size=%d, err=%d.\n", __func__, xfer_len, err);
++ }
++ else {
++ err = scif_readfrom(wq->qp->conn->ep, loffset, xfer_len, roffset, rma_flag);
++ if (err)
++ printk(KERN_INFO PFX "%s(): error reading the message, size=%d, err=%d.\n", __func__, xfer_len, err);
++ }
++
++ IBSCIF_PERF_SAMPLE(4, 0);
++
++ if (err){
++ DEV_STAT(qp->dev, fast_rdma_fallback++);
++ return 0;
++ }
++
++ roffset += xfer_len;
++ rdma_length -= xfer_len;
++ }
++
++ if (rdma_length)
++ printk(KERN_INFO PFX "%s(): remaining rdma_length=%d.\n", __func__, rdma_length);
++
++ IBSCIF_PERF_SAMPLE(5, 0);
++
++ /* complete the wr */
++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds);
++ wr->state = WR_COMPLETED;
++ wr->sar.rea.final_length = total_length - rdma_length;
++
++ /* we can't call ibscif_process_sq_completions here because we are holding the sq lock.
++ * set the flag and let the upper level make the call */
++ wq->fast_rdma_completions = 1;
++
++ if (wr->opcode == WR_RDMA_WRITE)
++ DEV_STAT(qp->dev, fast_rdma_write++);
++ else
++ DEV_STAT(qp->dev, fast_rdma_read++);
++
++ /* the fast rdma protocol doesn't send any packet, and thus can not piggyback any ack
++ * for the peer. send separate ack packet when necessary. */
++ if (qp->wire.sq.rx.last_seq_acked < qp->wire.sq.rx.last_in_seq ||
++ qp->wire.iq.rx.last_seq_acked < qp->wire.iq.rx.last_in_seq) {
++ ibscif_send_ack(qp);
++ DEV_STAT(qp->dev, fast_rdma_force_ack++);
++ }
++
++ IBSCIF_PERF_SAMPLE(8, 0);
++
++ return 1;
++}
++
++/*
++ * Setup for a fresh data descriptor.
++ */ ++#define DS_SETUP(ds, mr, page_offset, page_index, ds_len_left) \ ++do { \ ++ mr = ds->mr; \ ++ ds_len_left = ds->length; \ ++ page_offset = ds->offset + (mr->addr & ~PAGE_MASK); \ ++ page_index = page_offset >> PAGE_SHIFT; \ ++ page_offset &= ~PAGE_MASK; \ ++} while(0) ++ ++/* ++ * Setup for page crossing within a data descriptor. ++ */ ++#define NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left) \ ++do { \ ++ if (!ds_len_left) { \ ++ ds++; \ ++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left); \ ++ } else { \ ++ page_index++; \ ++ BUG_ON(!(mr->npages > page_index)); \ ++ page_offset = 0; \ ++ } \ ++} while(0) ++ ++/* ++ * Setup the data descriptor, page, and offset for specified sequence number ++ */ ++#define SETUP_BY_SEQ(wr, ds, mr, from_seq, wr_length, page_offset, page_index, \ ++ ds_len_left, max_payload) \ ++do { \ ++ u32 i, frag_len_max; \ ++ \ ++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left); \ ++ for (i = wr->sar.seg.starting_seq; seq_before(i, from_seq); i++) { \ ++ num_frags = 0; \ ++ payload_left = max_payload; \ ++ while (payload_left && (num_frags < MAX_SKB_FRAGS)) { \ ++ frag_len_max = min(ds_len_left, (u32)(PAGE_SIZE - page_offset));\ ++ if (wr_length > payload_left) { \ ++ if (payload_left > frag_len_max) { \ ++ ds_len_left -= frag_len_max; \ ++ NEXT_PAGE(ds, mr, page_offset, \ ++ page_index, ds_len_left); \ ++ } else { \ ++ frag_len_max = payload_left; /* frag->size */ \ ++ ds_len_left -= payload_left; \ ++ page_offset += payload_left; \ ++ } \ ++ } else { \ ++ if (wr_length > frag_len_max) { \ ++ ds_len_left -= frag_len_max; \ ++ NEXT_PAGE(ds, mr, page_offset, \ ++ page_index, ds_len_left); \ ++ } else { \ ++ printk(KERN_ERR PFX \ ++ "from_seq (%d) botch wr %p opcode %d length %d\n", \ ++ from_seq, wr, wr->opcode, wr_length); \ ++ return 0; \ ++ } \ ++ } \ ++ wr_length -= frag_len_max; \ ++ payload_left -= frag_len_max; \ ++ num_frags++; \ ++ } \ ++ } \ ++} while(0) ++ ++int ibscif_xmit_wr(struct ibscif_wq *wq, struct ibscif_wr *wr, int tx_limit, int retransmit, u32 from_seq, u32 *posted) ++{ ++ struct ibscif_dev *dev; ++ struct ibscif_qp *qp; ++ struct ibscif_ds *ds; ++ struct ibscif_mr *mr; ++ int hdr_size, page_index, num_frags, num_xmited; ++ u32 max_payload, wr_length, page_offset, ds_len_left, payload_left; ++ ++ /* Try to process RDMA read/write directly with SCIF functions. ++ * The usual reason for failure is that the remote memory has not yet been ++ * registered with SCIF. The normal packet based path should handle that. ++ */ ++ if (host_proxy && wq->qp->local_node_id>0 && wq->qp->remote_node_id==0) { ++ /* don't try fast rdma becasue we want to let the host do the data transfer */ ++ } ++ else if (fast_rdma) { ++ num_xmited = 0; ++ if (ibscif_try_fast_rdma(wq, wr)) ++ goto finish2; ++ } ++ ++ if (!tx_limit) { ++ printk(KERN_INFO PFX "%s() called with tx_limit of zero\n", __func__); ++ return 0; ++ } ++ ++ qp = wq->qp; ++ dev = qp->dev; ++ hdr_size = get_hdr_size_from_wr(wr); ++ max_payload = qp->mtu - hdr_size; ++ ++ if (wr->use_rma) { ++ struct sk_buff *skb; ++ ++ wr_length = wr->length; ++ wr->sar.seg.starting_seq = from_seq; ++ wr->sar.seg.ending_seq = from_seq; ++ wr->state = WR_STARTED; ++ ++ num_xmited = 0; ++ if (setup_rma_addrs(wq, wr)) { ++ /* Make room in the header for RMA addresses */ ++ hdr_size += get_rma_addr_size_from_wr(wr); ++ ++ /* Allocate an initialized skb with PDU header. 
*/ ++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, from_seq, 0, wr_length, 0); ++ if (likely(skb)) { ++ ibscif_dev_queue_xmit(skb); ++ num_xmited++; ++ from_seq++; ++ } ++ } ++ else ++ printk(KERN_ALERT PFX "%s: fail to set up RMA addresses for the work request.\n", __func__); ++ ++ goto finish; ++ } ++ ++ if (!wr->sar.seg.current_ds) { ++ /* ++ * This is a fresh send so intialize the wr by setting the static ++ * parts of the header and sequence number range for this wr. ++ */ ++ wr_length = wr->length; ++ wr->sar.seg.starting_seq = from_seq; ++ wr->sar.seg.ending_seq = from_seq; ++ if (wr_length > max_payload) { ++ wr->sar.seg.ending_seq += (wr_length / max_payload); ++ if (!(wr_length % max_payload)) ++ wr->sar.seg.ending_seq--; ++ } ++ ++ wr->state = WR_STARTED; ++ ++ /* ++ * If this request has a payload, setup for fragmentation. ++ * Otherwise, send it on its way. ++ */ ++ if (wr_length) { ++ ds = wr->ds_list; ++ DS_SETUP(ds, mr, page_offset, page_index, ds_len_left); ++ } else { ++ num_xmited = ibscif_send_null_pdu(dev, qp, wr, hdr_size); ++ /* from_seq must always advanced even in null PDU cases. */ ++ from_seq++; ++ goto finish; ++ } ++ } else { ++ /* We're picking up from a paritally sent request. */ ++ ds = wr->sar.seg.current_ds; ++ mr = ds->mr; ++ wr_length = wr->sar.seg.wr_length_remaining; ++ ds_len_left = wr->sar.seg.ds_length_remaining; ++ page_index = wr->sar.seg.current_page_index; ++ page_offset = wr->sar.seg.current_page_offset; ++ from_seq = wr->sar.seg.next_seq; ++ } ++ ++ /* Ok, let's break this bad-boy up. */ ++ num_xmited = 0; ++ while (wr_length && (num_xmited < tx_limit) && (qp->state == QP_CONNECTED)) { ++ struct sk_buff *skb; ++ skb_frag_t *frag; ++ ++ /* Allocate an initialized skb with PDU header. */ ++ skb = ibscif_alloc_pdu(dev, qp, wr, hdr_size, from_seq, min(wr_length, max_payload), ++ wr_length, retransmit && (num_xmited == (tx_limit - 1))); ++ if (unlikely(!skb)) ++ break; ++ ++ /* Update sequence number for next pass. */ ++ from_seq++; ++ ++ /* Fill the skb fragment list. */ ++ frag = skb_shinfo(skb)->frags; ++ num_frags = 0; ++ payload_left = max_payload; ++ ++ while (payload_left && (num_frags < MAX_SKB_FRAGS)) { ++ u32 frag_len_max; ++ ++ SET_PAGE(frag, mr->page[page_index]); ++ frag->page_offset = page_offset; ++ ++ /* Take a reference on the page - kfree_skb will release. */ ++ GET_PAGE(frag); ++ ++ frag_len_max = min(ds_len_left, (u32)(PAGE_SIZE - page_offset)); ++ if (wr_length > payload_left) { ++ if (payload_left > frag_len_max) { ++ /* Deal with page boundary crossing. */ ++ frag->size = frag_len_max; ++ ds_len_left -= frag_len_max; ++ NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left); ++ } else { ++ frag->size = payload_left; ++ ds_len_left -= payload_left; ++ page_offset += payload_left; ++ } ++ } else { ++ if (wr_length > frag_len_max) { ++ /* Deal with page boundary crossing. */ ++ frag->size = frag_len_max; ++ ds_len_left -= frag_len_max; ++ NEXT_PAGE(ds, mr, page_offset, page_index, ds_len_left); ++ } else { ++ frag->size = wr_length; ++ payload_left -= wr_length; ++ wr_length = 0; ++ num_frags++; /* Change from index to number. */ ++ break; ++ } ++ } ++ ++ wr_length -= frag->size; ++ payload_left -= frag->size; ++ num_frags++; ++ frag++; ++ } ++ skb_shinfo(skb)->nr_frags = num_frags; ++ ++ /* Check if we need to do a fixup because we ran out of frags. 
*/ ++ if ((num_frags == MAX_SKB_FRAGS) && wr_length) { ++ struct ibscif_full_frame *pdu = (struct ibscif_full_frame*)skb->data; ++ skb->len = hdr_size + (max_payload - payload_left); ++ skb->data_len = (max_payload - payload_left); ++ pdu->ibscif.hdr.length = __cpu_to_be16(skb->data_len); ++ pdu->ibscif.hdr.opcode = __cpu_to_be16(__be16_to_cpu(pdu->ibscif.hdr.opcode) & ~ibscif_last_flag); ++ } ++ ++ /* ++ * Send it. ++ */ ++ ibscif_dev_queue_xmit(skb); ++ num_xmited++; ++ } ++ ++ /* ++ * Update state. If this is a retransmit, don't update anything. If not and ++ * there's more to do on the wr, save state. Otherwise, setup for next wr. ++ */ ++ if (wr_length && !wr->use_rma) { ++ wr->sar.seg.current_ds = ds; ++ wr->sar.seg.wr_length_remaining = wr_length; ++ wr->sar.seg.ds_length_remaining = ds_len_left; ++ wr->sar.seg.current_page_index = page_index; ++ wr->sar.seg.current_page_offset = page_offset; ++ } else { ++finish: if (wr->opcode != WR_UD) ++ wr->state = WR_WAITING_FOR_ACK; ++finish2: wq->next_wr = (wq->next_wr + 1) % wq->size; ++ } ++ wr->sar.seg.next_seq = from_seq; ++ if (posted) ++ *posted = from_seq; ++ ++ return num_xmited; ++} ++ ++static struct sk_buff *ibscif_create_disconnect_hdr(struct ibscif_dev *dev, u32 src_qpn, ++ u32 dst_qpn, enum ibscif_reason reason) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_alloc_tx_skb(dev, sizeof pdu->ibscif.disconnect, 0); ++ if (unlikely(!skb)) { ++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__); ++ return NULL; ++ } ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ /* The eth_hdr and ack fields are set by the caller. */ ++ pdu->ibscif.disconnect.hdr.opcode = __cpu_to_be16(ibscif_op_disconnect); ++ pdu->ibscif.disconnect.hdr.length = 0; /* Length has no meaning. */ ++ pdu->ibscif.disconnect.hdr.dst_qp = __cpu_to_be32(dst_qpn); ++ pdu->ibscif.disconnect.hdr.src_qp = __cpu_to_be32(src_qpn); ++ pdu->ibscif.disconnect.hdr.seq_num = 0; /* seq_num has no meaning. */ ++ pdu->ibscif.disconnect.hdr.hdr_size = __cpu_to_be16(sizeof(pdu->ibscif.disconnect)); ++ pdu->ibscif.disconnect.reason = __cpu_to_be32(reason); ++ ++ SET_SKB_DEV(skb, dev); ++ SET_SKB_WR(skb, NULL); ++ ++ return skb; ++} ++ ++void ibscif_send_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ struct ibscif_dev *dev = qp->dev; ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ if (qp->ibqp.qp_type == IB_QPT_UD) ++ return; ++ ++ if (qp->loopback) { ++ ibscif_loopback_disconnect(qp, reason); ++ return; ++ } ++ ++ if (unlikely(!qp->conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return; ++ } ++ ++ skb = ibscif_create_disconnect_hdr(dev, qp->ibqp.qp_num, qp->remote_qpn, reason); ++ if (unlikely(!skb)) ++ return; ++ ++ SET_SKB_EP(skb, qp->conn->ep); ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ pdu->ibscif.disconnect.hdr.sq_ack_num = __cpu_to_be32(qp->wire.sq.rx.last_in_seq); ++ pdu->ibscif.disconnect.hdr.iq_ack_num = __cpu_to_be32(qp->wire.iq.rx.last_in_seq); ++ ++ ibscif_dev_queue_xmit(skb); ++} ++ ++void ibscif_reflect_disconnect(struct ibscif_qp *qp, struct base_hdr *hdr, struct sk_buff *in_skb, enum ibscif_reason reason) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ if (!qp || IS_ERR(qp)) { ++ if (qp != ERR_PTR(-ENOENT) && verbose) ++ printk(KERN_ALERT PFX "%s: qp=%p hdr=%p in_skb=%p reason=%d\n", __func__, qp, hdr, in_skb, reason); ++ return; ++ } ++ ++ /* Don't send a disconnect for a disconnect. 
*/ ++ if (ibscif_pdu_base_type(hdr->opcode) == ibscif_op_disconnect) ++ return; ++ ++ if (!qp->conn || !qp->conn->ep) ++ return; ++ ++ skb = ibscif_create_disconnect_hdr((void *)in_skb->dev, hdr->dst_qp, hdr->src_qp, reason); ++ if (unlikely(!skb)) ++ return; ++ ++ SET_SKB_EP(skb, qp->conn->ep); ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ pdu->ibscif.disconnect.hdr.sq_ack_num = 0; /* sq_ack_num has no meaning. */ ++ pdu->ibscif.disconnect.hdr.iq_ack_num = 0; /* iq_ack_num has no meaning. */ ++ ++ ibscif_dev_queue_xmit(skb); ++} ++ ++static struct sk_buff *ibscif_create_ack_hdr(struct ibscif_qp *qp, int size) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ u32 sq_seq, iq_seq; ++ ++ if (unlikely(!qp->conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return NULL; ++ } ++ ++ skb = ibscif_alloc_tx_skb(qp->dev, size, 0); ++ if (unlikely(!skb)) { ++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__); ++ return NULL; ++ } ++ ++ SET_SKB_DEV(skb, qp->dev); ++ SET_SKB_WR(skb, NULL); ++ SET_SKB_EP(skb, qp->conn->ep); ++ ++ sq_seq = qp->wire.sq.rx.last_in_seq; ++ iq_seq = qp->wire.iq.rx.last_in_seq; ++ qp->wire.sq.rx.last_seq_acked = sq_seq; ++ qp->wire.iq.rx.last_seq_acked = iq_seq; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ /* The opcode field set by the caller. */ ++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */ ++ pdu->ibscif.hdr.dst_qp = __cpu_to_be32(qp->remote_qpn); ++ pdu->ibscif.hdr.src_qp = __cpu_to_be32(qp->ibqp.qp_num); ++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. */ ++ pdu->ibscif.hdr.sq_ack_num = __cpu_to_be32(sq_seq); ++ pdu->ibscif.hdr.iq_ack_num = __cpu_to_be32(iq_seq); ++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size); ++ ++ return skb; ++} ++ ++static void ibscif_send_ack(struct ibscif_qp *qp) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_ack_hdr(qp, sizeof pdu->ibscif.ack); ++ if (unlikely(!skb)) ++ return; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.ack.hdr.opcode = __cpu_to_be16(ibscif_op_ack); ++ ++ ibscif_dev_queue_xmit(skb); ++} ++ ++static struct sk_buff *ibscif_create_close_hdr(struct ibscif_conn *conn, int size) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ if (unlikely(!conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: conn == NULL\n", __func__); ++ return NULL; ++ } ++ ++ skb = ibscif_alloc_tx_skb(conn->dev, size, 0); ++ if (unlikely(!skb)) { ++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__); ++ return NULL; ++ } ++ ++ SET_SKB_DEV(skb, conn->dev); ++ SET_SKB_WR(skb, NULL); ++ SET_SKB_EP(skb, conn->ep); ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ /* The opcode field set by the caller. */ ++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */ ++ pdu->ibscif.hdr.dst_qp = 0; /* unused */ ++ pdu->ibscif.hdr.src_qp = 0; /* unused */ ++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. 
*/ ++ pdu->ibscif.hdr.sq_ack_num = 0; /* unused */ ++ pdu->ibscif.hdr.iq_ack_num = 0; /* unused */ ++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size); ++ ++ return skb; ++} ++ ++void ibscif_send_close(struct ibscif_conn *conn) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_close_hdr(conn, sizeof pdu->ibscif.close); ++ if (unlikely(!skb)) ++ return; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.close.hdr.opcode = __cpu_to_be16(ibscif_op_close); ++ ++ ibscif_dev_queue_xmit(skb); ++} ++ ++void ibscif_send_reopen(struct ibscif_conn *conn) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_close_hdr(conn, sizeof pdu->ibscif.close); ++ if (unlikely(!skb)) ++ return; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.close.hdr.opcode = __cpu_to_be16(ibscif_op_reopen); ++ ++ ibscif_dev_queue_xmit(skb); ++} ++ ++static struct sk_buff *ibscif_create_cm_hdr(struct ibscif_conn *conn, int size) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ if (unlikely(!conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: conn == NULL\n", __func__); ++ return NULL; ++ } ++ ++ skb = ibscif_alloc_tx_skb(conn->dev, size, 0); ++ if (unlikely(!skb)) { ++ printk(KERN_ERR PFX "%s() can't allocate skb\n", __func__); ++ return NULL; ++ } ++ ++ SET_SKB_DEV(skb, conn->dev); ++ SET_SKB_WR(skb, NULL); ++ SET_SKB_EP(skb, conn->ep); ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ ++ pdu->ibscif.hdr.opcode = __cpu_to_be16(ibscif_op_cm); ++ pdu->ibscif.hdr.length = 0; /* Length has no meaning. */ ++ pdu->ibscif.hdr.dst_qp = 0; /* unused */ ++ pdu->ibscif.hdr.src_qp = 0; /* unused */ ++ pdu->ibscif.hdr.seq_num = 0; /* seq_num has no meaning. */ ++ pdu->ibscif.hdr.sq_ack_num = 0; /* unused */ ++ pdu->ibscif.hdr.iq_ack_num = 0; /* unused */ ++ pdu->ibscif.hdr.hdr_size = __cpu_to_be16(size); ++ ++ return skb; ++} ++ ++int ibscif_send_cm_req(struct ibscif_cm *cm_ctx) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + cm_ctx->plen); ++ if (unlikely(!skb)) ++ return -ENOMEM; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.cm.req_ctx = __cpu_to_be64((u64)(uintptr_t)cm_ctx); ++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REQ); ++ pdu->ibscif.cm.port = __cpu_to_be32((u32)cm_ctx->remote_addr.sin_port); ++ pdu->ibscif.cm.qpn = __cpu_to_be32(cm_ctx->qpn); ++ pdu->ibscif.cm.plen = __cpu_to_be32(cm_ctx->plen); ++ memcpy(pdu->ibscif.cm.pdata, cm_ctx->pdata, cm_ctx->plen); ++ ++ ibscif_dev_queue_xmit(skb); ++ ++ return 0; ++} ++ ++int ibscif_send_cm_rep(struct ibscif_cm *cm_ctx) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + cm_ctx->plen); ++ if (unlikely(!skb)) ++ return -ENOMEM; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.cm.req_ctx = __cpu_to_be64(cm_ctx->peer_context); ++ pdu->ibscif.cm.rep_ctx = __cpu_to_be64((__u64)cm_ctx); ++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REP); ++ pdu->ibscif.cm.qpn = __cpu_to_be32(cm_ctx->qpn); ++ pdu->ibscif.cm.status = __cpu_to_be32(0); ++ pdu->ibscif.cm.plen = __cpu_to_be32(cm_ctx->plen); ++ memcpy(pdu->ibscif.cm.pdata, cm_ctx->pdata, cm_ctx->plen); ++ ++ ibscif_dev_queue_xmit(skb); ++ ++ return 0; ++} ++ ++int ibscif_send_cm_rej(struct ibscif_cm *cm_ctx, const void *pdata, u8 plen) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = 
ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm + plen); ++ if (unlikely(!skb)) ++ return -ENOMEM; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.cm.req_ctx = __cpu_to_be64(cm_ctx->peer_context); ++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_REJ); ++ pdu->ibscif.cm.status = __cpu_to_be32(-ECONNREFUSED); ++ pdu->ibscif.cm.plen = __cpu_to_be32((u32)plen); ++ memcpy(pdu->ibscif.cm.pdata, pdata, plen); ++ ++ ibscif_dev_queue_xmit(skb); ++ ++ return 0; ++} ++ ++int ibscif_send_cm_rtu(struct ibscif_cm *cm_ctx) ++{ ++ struct ibscif_full_frame *pdu; ++ struct sk_buff *skb; ++ ++ skb = ibscif_create_cm_hdr(cm_ctx->conn, sizeof pdu->ibscif.cm); ++ if (unlikely(!skb)) ++ return -ENOMEM; ++ ++ pdu = (struct ibscif_full_frame *)skb->data; ++ pdu->ibscif.cm.rep_ctx = __cpu_to_be64(cm_ctx->peer_context); ++ pdu->ibscif.cm.cmd = __cpu_to_be32(IBSCIF_CM_RTU); ++ ++ ibscif_dev_queue_xmit(skb); ++ ++ return 0; ++} ++ ++/* ---------------------- tx routines above this line ---------------------- */ ++/* ---------------------- rx routines below this line ---------------------- */ ++ ++static void ibscif_protocol_error(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ printk(KERN_NOTICE PFX "Disconnect due to protocol error %d\n", reason); ++ ibscif_qp_internal_disconnect(qp, reason); ++} ++ ++int ibscif_process_sq_completions(struct ibscif_qp *qp) ++{ ++ struct ibscif_cq *cq = to_cq(qp->ibqp.send_cq); ++ struct ibscif_wq *sq = &qp->sq; ++ struct ibscif_wr *wr; ++ struct ibscif_wc *wc; ++ int index, err = 0, i; ++ ++ spin_lock_bh(&sq->lock); ++ ++ /* Prevent divide by zero traps on wrap math. */ ++ if (!sq->size) ++ goto out; ++ ++ /* Iterate the send queue looking for defered completions. */ ++ for (i=sq->completions; idepth; i++) { ++ index = (sq->head + i) % sq->size; ++ ++ wr = ibscif_get_wr(sq, index); ++ if (wr->state != WR_COMPLETED) ++ break; ++ ++ sq->completions++; ++ sq->reap++; ++ ++ /* An IQ request has been completed; update the throttling variables. */ ++ if ((wr->opcode == WR_RDMA_READ) || ++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) || ++ (wr->opcode == WR_ATOMIC_FETCH_AND_ADD)) { ++ BUG_ON(!atomic_read(&qp->or_depth)); ++ atomic_dec(&qp->or_depth); ++ atomic_dec(&qp->or_posted); ++ } ++ ++ /* See if we need to generate a completion. */ ++ if (!(wr->flags & IB_SEND_SIGNALED)) ++ continue; ++ ++ err = ibscif_reserve_cqe(cq, &wc); ++ if (unlikely(err)) ++ break; ++ ++ wc->ibwc.qp = &qp->ibqp; ++ wc->ibwc.src_qp = qp->remote_qpn; ++ wc->ibwc.wr_id = wr->id; ++ wc->ibwc.opcode = to_ib_wc_opcode(wr->opcode); ++ wc->ibwc.wc_flags = (((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) || ++ ((enum ib_wr_opcode)wr->opcode == IB_WR_SEND_WITH_IMM)) ? ++ IB_WC_WITH_IMM : 0; ++ wc->ibwc.status = IB_WC_SUCCESS; ++ wc->ibwc.ex.imm_data = 0; ++ wc->ibwc.port_num = 1; ++ wc->ibwc.byte_len = (((enum ib_wr_opcode)wr->opcode == IB_WR_RDMA_READ) || ++ ((enum ib_wr_opcode)wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) || ++ ((enum ib_wr_opcode)wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) ? 
++ wr->sar.rea.final_length : 0; ++ wc->wq = sq; ++ wc->reap = sq->reap; ++ sq->reap = 0; ++ ++ ibscif_append_cqe(cq, wc, 0); ++ } ++out: ++ spin_unlock_bh(&sq->lock); ++ ++ ibscif_notify_cq(cq); ++ return err; ++} ++ ++static int ibscif_schedule_rx_completions(struct ibscif_qp *qp, int iq_flag, struct ibscif_rx_state *rx) ++{ ++ struct ibscif_cq *cq = to_cq(qp->ibqp.recv_cq); ++ struct ibscif_wq *wq; ++ struct ibscif_wr *wr; ++ struct ibscif_wc *wc; ++ u32 last_in_seq; ++ int index, err, i; ++ ++ wq = iq_flag ? &qp->sq /* yep, the SQ */ : &qp->rq; ++ last_in_seq = rx->last_in_seq; ++ ++ /* Prevent divide by zero traps on wrap math. */ ++ if (!wq->size) ++ return 0; ++ ++ spin_lock_bh(&wq->lock); ++ for (i=wq->completions; idepth; i++) { ++ index = (wq->head + i) % wq->size; ++ ++ wr = ibscif_get_wr(wq, index); ++ ++ /* Skip over non-IQ entries. */ ++ if (iq_flag && ++ ((wr->opcode == WR_UD) || ++ (wr->opcode == WR_SEND) || ++ (wr->opcode == WR_SEND_WITH_IMM) || ++ (wr->opcode == WR_RDMA_WRITE) || ++ (wr->opcode == WR_RDMA_WRITE_WITH_IMM))) ++ continue; ++ ++ /* ++ * If this WR hasn't seen the final segment in sequence then ++ * there is nothing more to process in this queue. We use the ++ * last seen state as a qualifier because last_packet_seq will ++ * be uninitialized until last packet is seen. ++ */ ++ if ((wr->state != WR_LAST_SEEN) || ++ seq_before(last_in_seq, wr->sar.rea.last_packet_seq)) ++ break; ++ ++ /* Clear references on memory regions. */ ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ ++ if (iq_flag) { ++ /* ++ * Completed IQ replies are defered until earlier ++ * non-IQ WR have completed. This is determined ++ * with a second iteration of the WQ below. ++ */ ++ wr->state = WR_COMPLETED; ++ continue; /* Look for more IQ completions. */ ++ } ++ ++ /* All receive queue completions are done here. */ ++ err = ibscif_reserve_cqe(cq, &wc); ++ if (unlikely(err)) { ++ spin_unlock_bh(&wq->lock); ++ return err; ++ } ++ ++ wc->ibwc.qp = &qp->ibqp; ++ wc->ibwc.src_qp = qp->remote_qpn; ++ wc->ibwc.wr_id = wr->id; ++ wc->ibwc.status = IB_WC_SUCCESS; ++ wc->ibwc.byte_len = wr->sar.rea.final_length; ++ wc->ibwc.port_num = 1; ++ ++ if (ibscif_pdu_is_immed(wr->sar.rea.opcode)) { ++ DEV_STAT(qp->dev, recv_imm++); ++ wc->ibwc.opcode = IB_WC_RECV_RDMA_WITH_IMM; ++ wc->ibwc.ex.imm_data = wr->sar.rea.immediate_data; ++ } else { ++ DEV_STAT(qp->dev, recv++); ++ wc->ibwc.opcode = IB_WC_RECV; ++ wc->ibwc.ex.imm_data = 0; ++ } ++ ++ wc->wq = wq; ++ wc->reap = 1; ++ wq->completions++; ++ ++ ibscif_append_cqe(cq, wc, !!ibscif_pdu_is_se(wr->sar.rea.opcode)); ++ } ++ spin_unlock_bh(&wq->lock); ++ ++ /* If this was the recieve queue, there is no more processing to be done. */ ++ if (!iq_flag) { ++ ibscif_notify_cq(cq); ++ return 0; ++ } ++ ++ err = ibscif_process_sq_completions(qp); ++ if (unlikely(err)) ++ return err; ++ ++ /* ++ * If we just created room for a backlogged IQ stream request ++ * and there is a tx window, reschedule to get it sent. ++ */ ++ if ((atomic_read(&qp->or_posted) > atomic_read(&qp->or_depth)) && ++ (atomic_read(&qp->or_depth) < qp->max_or) && ++ ibscif_tx_window(&qp->wire.sq.tx)) ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_SQ; ++ ++ return 0; ++} ++ ++static enum ibscif_schedule ibscif_process_wq_ack(struct ibscif_wq *wq, u32 seq_num) ++{ ++ struct ibscif_tx_state *tx = &wq->wirestate->tx; ++ enum ibscif_schedule status = 0; ++ int throttled, index, err = 0, i; ++ ++ if (!wq->size || !wq->depth) ++ return 0; ++ ++ /* If this is old news, get out. 
*/ ++ if (!seq_after(seq_num, tx->last_ack_seq_recvd)) ++ return 0; ++ ++ /* Capture if window was closed before updating. */ ++ throttled = !ibscif_tx_window(tx); ++ tx->last_ack_seq_recvd = seq_num; ++ ++ /* ++ * If were were throttled and now have an open window or ++ * simply up to date, resume streaming transfers. This ++ * can be overwritten with other schedule states below. ++ */ ++ if (throttled && ibscif_tx_window(tx)) ++ status = SCHEDULE_RESUME; ++ ++ spin_lock_bh(&wq->lock); ++ for (i=wq->completions; idepth; i++) { ++ struct ibscif_wr *wr; ++ ++ index = (wq->head + i) % wq->size; ++ ++ wr = ibscif_get_wr(wq, index); ++ ++ /* Get out if the WR hasn't been scheduled. */ ++ if (wr->state == WR_WAITING) ++ break; ++ ++ if (seq_after(wr->sar.seg.ending_seq, seq_num)) { ++ ++ if ((wr->state == WR_STARTED) && !ibscif_tx_unacked_window(tx)) ++ status = SCHEDULE_RESUME; ++ ++ break; ++ } ++ ++ /* We seem to have a completed WQ element. */ ++ ++ if (is_iq(wq)) { ++ /* ++ * We have a completed IQ reply. ++ * Clear references to the memory region. ++ */ ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ ++ /* ++ * It's more effecient to retire an IQ wqe manually ++ * here instead of calling ibscif_retire_wqes(). ++ */ ++ wq->head = (wq->head + 1) % wq->size; ++ wq->depth -= 1; ++ ++ } else if ((wr->opcode == WR_RDMA_READ) || ++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) || ++ (wr->opcode == WR_ATOMIC_FETCH_AND_ADD)|| ++ (wr->opcode == WR_UD && wr->use_rma) || ++ (wr->opcode == WR_SEND && wr->use_rma) || ++ (wr->opcode == WR_SEND_WITH_IMM && wr->use_rma) || ++ (wr->opcode == WR_RDMA_WRITE && wr->use_rma) || ++ (wr->opcode == WR_RDMA_WRITE_WITH_IMM && wr->use_rma)) { ++ /* ++ * We have a request acknowledgment. ++ * Note the state change so it isn't retried. ++ * ++ * BTW, these request types are completed in the ++ * ibscif_schedule_rx_completions() routine when ++ * the data has arrived. ++ */ ++ if (wr->state == WR_WAITING_FOR_ACK) ++ wr->state = WR_WAITING_FOR_RSP; ++ ++ } else if (wr->state != WR_COMPLETED) { ++ /* Request is complete so no need to keep references. */ ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ wr->state = WR_COMPLETED; ++ } ++ } ++ spin_unlock_bh(&wq->lock); ++ ++ if (is_sq(wq)) { ++ err = ibscif_process_sq_completions(wq->qp); ++ if (unlikely(err)) { ++ printk(KERN_ALERT PFX "%s: sq completion error: err=%d \n", __func__, err); ++ ibscif_protocol_error(wq->qp, IBSCIF_REASON_QP_FATAL); ++ status = 0; ++ } ++ } ++ ++ return status; ++} ++ ++static void ibscif_process_ack(struct ibscif_qp *qp, struct base_hdr *hdr) ++{ ++ qp->schedule |= ibscif_process_wq_ack(&qp->sq, hdr->sq_ack_num) | SCHEDULE_SQ; ++ qp->schedule |= ibscif_process_wq_ack(&qp->iq, hdr->iq_ack_num) | SCHEDULE_IQ; ++} ++ ++/* Note that the WQ lock is held on success. */ ++static struct ibscif_wr *ibscif_reserve_wqe(struct ibscif_wq *wq) ++{ ++ int err; ++ ++ spin_lock_bh(&wq->lock); ++ ++ if (unlikely(wq->qp->state != QP_CONNECTED)) { ++ err = -ENOTCONN; ++ goto out; ++ } ++ if (unlikely(!wq->size)) { ++ err = -ENOSPC; ++ goto out; ++ } ++ if (unlikely(wq->depth == wq->size)) { ++ err = -ENOBUFS; ++ goto out; ++ } ++ ++ return ibscif_get_wr(wq, wq->tail); ++out: ++ spin_unlock_bh(&wq->lock); ++ return ERR_PTR(err); ++} ++ ++/* Note that this assumes the WQ lock is currently held. 
*/ ++static void ibscif_append_wqe(struct ibscif_wq *wq) ++{ ++ DEV_STAT(wq->qp->dev, wr_opcode[ibscif_get_wr(wq, wq->tail)->opcode]++); ++ ibscif_append_wq(wq); ++ spin_unlock_bh(&wq->lock); ++} ++ ++static struct ibscif_wr* ibscif_wr_by_msg_id(struct ibscif_wq *wq, u32 msg_id) ++{ ++ struct ibscif_wr *wr; ++ int size = wq->size; ++ ++ if (!size) ++ return NULL; ++ ++ wr = ibscif_get_wr(wq, msg_id % size); ++ if (wr->use_rma) ++ return (wr->rma_id == msg_id) ? wr : NULL; ++ else ++ return (wr->msg_id == msg_id) ? wr : NULL; ++} ++ ++static int ibscif_ds_dma(struct ibscif_qp *qp, struct page **page, u32 page_offset, struct sk_buff *skb, u32 dma_len, int head_copied) ++{ ++ void *dst, *src = skb->data; ++ u32 copy_len; ++ ++ while (dma_len) { ++ copy_len = min(dma_len, (u32)PAGE_SIZE - page_offset); ++ ++ dst = ibscif_map_dst(*page) + page_offset; ++ head_copied = ibscif_atomic_copy(dst, src, copy_len, head_copied); ++ ibscif_unmap_dst(*page, dst); ++ ++ src += copy_len; ++ dma_len -= copy_len; ++ ++ page++; ++ page_offset = 0; ++ } ++ ++ return head_copied; ++} ++ ++static int ibscif_place_data(struct ibscif_qp *qp, struct ibscif_wr *wr, struct sk_buff *skb, ++ u32 length, u32 offset, u32 seq_num) ++{ ++ struct ibscif_ds *ds; ++ struct ibscif_mr *mr; ++ int seg_num, page_index; ++ u32 dma_len, ds_offset, page_offset; ++ int head_copied = 0; ++ ++ if (!length) { ++ ds = NULL; ++ dma_len = 0; ++ ds_offset = 0; ++ goto no_data; ++ } ++ ++ /* See if we can use our ds cache. */ ++ if (likely((wr->sar.rea.current_ds) && (wr->sar.rea.last_seen_seq == seq_num - 1))) { ++ /* Take the cached entires. */ ++ ds = wr->sar.rea.current_ds; ++ mr = ds->mr; ++ ds_offset = wr->sar.rea.current_ds_offset; ++ seg_num = (ds - wr->ds_list) / sizeof *wr->ds_list; ++ } else { ++ ds_offset = offset; ++ ds = wr->ds_list; ++ seg_num = 0; ++ while ((ds_offset >= ds->length) && (seg_num < wr->num_ds)) { ++ ds_offset -= ds->length; ++ ds++; ++ seg_num++; ++ } ++next_ds: ++ if (unlikely(seg_num >= wr->num_ds)) ++ return -EMSGSIZE; ++ /* ++ * A memory region which may have posted receives against it can ++ * still be freed, therefore, we need to burn the cycles here to ++ * make sure it's still valid. We'll take a reference on it now ++ * that data is coming in. ++ */ ++ if (!ds->in_use) { ++ mr = ibscif_get_mr(ds->lkey); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ds->in_use = 1; ++ if (unlikely(mr != ds->mr)) ++ return -ENXIO; ++ if (unlikely(!(mr->access & IB_ACCESS_LOCAL_WRITE))) ++ return -EACCES; ++ } else ++ mr = ds->mr; ++ } ++ ++ /* Place data for this descriptor. Routine will handle page boundary crossings. */ ++ page_offset = ds->offset + ds_offset + (mr->addr & ~PAGE_MASK); ++ page_index = page_offset >> PAGE_SHIFT; ++ page_offset &= ~PAGE_MASK; ++ ++ dma_len = min(ds->length - ds_offset, length); ++ head_copied = ibscif_ds_dma(qp, &mr->page[page_index], page_offset, skb, dma_len, head_copied); ++ length -= dma_len; ++ if (length) { ++ ds++; ++ seg_num++; ++ ds_offset = 0; ++ skb_pull(skb, dma_len); ++ goto next_ds; ++ } ++no_data: ++ wr->sar.rea.last_seen_seq = seq_num; ++ ++ if (ds && ((ds_offset + dma_len) < ds->length)) { ++ wr->sar.rea.current_ds = ds; ++ wr->sar.rea.current_ds_offset = ds_offset + dma_len; ++ } else ++ wr->sar.rea.current_ds = NULL; /* Force a validation of the next ds. 
*/ ++ ++ return 0; ++} ++ ++static int ibscif_process_ud(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ int err; ++ int grh_size = 40; ++ int msg_id; ++ ++ if (unlikely(qp->ibqp.qp_type != IB_QPT_UD)) { ++ printk(KERN_ALERT PFX "%s: UD packet received on non-UD QP\n", __func__); ++ return -EINVAL; ++ } ++ ++ pdu->ud.msg_length = __be32_to_cpu(pdu->ud.msg_length); ++ pdu->ud.msg_offset = __be32_to_cpu(pdu->ud.msg_offset); ++ ++ /* Only one pdu is allowed for one UD packet, otherwise drop the pdu */ ++ if (unlikely(pdu->ud.msg_length != pdu->hdr.length || pdu->ud.msg_offset)) { ++ printk(KERN_INFO PFX "%s: dropping fragmented UD packet. total_length=%d msg_length=%d msg_offset=%d\n", ++ __func__, pdu->hdr.length, pdu->ud.msg_length, pdu->ud.msg_offset); ++ return -EINVAL; ++ } ++ ++ spin_lock_bh(&qp->rq.lock); ++ if (unlikely(qp->rq.ud_msg_id >= qp->rq.next_msg_id)) { ++ spin_unlock_bh(&qp->rq.lock); ++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n", ++ __func__, pdu->send.msg_id, qp->rq.next_msg_id); ++ return -EBADRQC; ++ } ++ msg_id = qp->rq.ud_msg_id++; ++ spin_unlock_bh(&qp->rq.lock); ++ ++ wr = ibscif_wr_by_msg_id(&qp->rq, msg_id); ++ if (unlikely(!wr)) ++ return -EBADR; ++ ++ if (unlikely((pdu->ud.msg_length + grh_size) > wr->length)) ++ return -EMSGSIZE; ++ ++ /* GRH is included as part of the received message */ ++ skb_pull(skb, sizeof(pdu->ud)-grh_size); ++ ++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length+grh_size, pdu->ud.msg_offset, pdu->hdr.seq_num); ++ if (unlikely(err)) ++ return err; ++ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = 0; ++ wr->sar.rea.immediate_data = 0; ++ wr->sar.rea.final_length = pdu->ud.msg_length+grh_size; ++ ++ return 0; ++} ++ ++static int ibscif_process_send(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ int err; ++ ++ pdu->send.msg_id = __be32_to_cpu(pdu->send.msg_id); ++ spin_lock_bh(&qp->rq.lock); ++ if (unlikely(pdu->send.msg_id >= qp->rq.next_msg_id)) { ++ spin_unlock_bh(&qp->rq.lock); ++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n", ++ __func__, pdu->send.msg_id, qp->rq.next_msg_id); ++ return -EBADRQC; ++ } ++ spin_unlock_bh(&qp->rq.lock); ++ ++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->send.msg_id); ++ if (unlikely(!wr)) ++ return -EBADR; ++ ++ pdu->send.msg_length = __be32_to_cpu(pdu->send.msg_length); ++ if (unlikely(pdu->send.msg_length > wr->length)) ++ return -EMSGSIZE; ++ ++ pdu->send.msg_offset = __be32_to_cpu(pdu->send.msg_offset); ++ if (unlikely(pdu->send.msg_offset > pdu->send.msg_length)) ++ return -EINVAL; ++ ++ if (unlikely((pdu->hdr.length + pdu->send.msg_offset) > wr->length)) ++ return -ESPIPE; ++ ++ skb_pull(skb, sizeof(pdu->send)); ++ ++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length, pdu->send.msg_offset, pdu->hdr.seq_num); ++ if (unlikely(err)) ++ return err; ++ ++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) { ++ /* ++ * We've got the last of the message data. ++ * We always assume immediate data; if not needed, no harm, on foul. 
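++		 * The fields latched below (opcode, last_packet_seq,
++		 * immediate_data, final_length) are picked up later by
++		 * ibscif_schedule_rx_completions() once every PDU up to
++		 * last_packet_seq has arrived in sequence.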
++ */ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->send.immed_data); ++ wr->sar.rea.final_length = pdu->send.msg_length; ++ } ++ ++ return 0; ++} ++ ++static int ibscif_process_write(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ u64 rdma_addr; ++ u32 rdma_len, page_offset; ++ int page_index; ++ ++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_WRITE))) ++ return -EACCES; ++ ++ /* Writes with immediate data consume an rq wqe. */ ++ if (ibscif_pdu_is_immed(pdu->hdr.opcode)) { ++ pdu->write.msg_id = __be32_to_cpu(pdu->write.msg_id); ++ spin_lock_bh(&qp->rq.lock); ++ if (unlikely(pdu->write.msg_id >= qp->rq.next_msg_id)) { ++ spin_unlock_bh(&qp->rq.lock); ++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n", ++ __func__, pdu->write.msg_id, qp->rq.next_msg_id); ++ return -EBADRQC; ++ } ++ spin_unlock_bh(&qp->rq.lock); ++ ++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->write.msg_id); ++ if (unlikely(!wr)) ++ return -EBADR; ++ } else ++ wr = NULL; ++ ++ skb_pull(skb, sizeof(pdu->write)); ++ ++ rdma_addr = __be64_to_cpu(pdu->write.rdma_address); ++ rdma_len = pdu->hdr.length; ++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr)) ++ return -EOVERFLOW; ++ ++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->write.rdma_key), rdma_addr, ++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ++ page_offset = rdma_addr & ~PAGE_MASK; ++ page_index = ((rdma_addr - mr->addr) + (mr->addr & ~PAGE_MASK)) >> PAGE_SHIFT; ++ ++ ibscif_ds_dma(qp, &mr->page[page_index], page_offset, skb, rdma_len, 0); ++ ++ ibscif_put_mr(mr); ++ ++ if (wr) { ++ wr->sar.rea.final_length += rdma_len; ++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) { ++ /* We've got the last of the write data. 
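++		   For a plain RDMA write (no immediate data) wr is NULL here:
++		   the payload has already been copied into the target MR
++		   above, no rq wqe is consumed and the responder generates no
++		   completion.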
*/ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->write.immed_data); ++ } ++ } ++ ++ return 0; ++} ++ ++static int ibscif_process_read(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ u64 rdma_addr; ++ u32 rdma_len; ++ ++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_READ))) ++ return -EACCES; ++ ++ rdma_addr = __be64_to_cpu(pdu->read_req.rdma_address); ++ rdma_len = __be32_to_cpu(pdu->read_req.rdma_length); ++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr)) ++ return -EOVERFLOW; ++ ++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->read_req.rdma_key), rdma_addr, ++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_READ); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ++ wr = ibscif_reserve_wqe(&qp->iq); ++ if (unlikely(IS_ERR(wr))) { ++ ibscif_put_mr(mr); ++ return PTR_ERR(wr); ++ } ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->opcode = WR_RDMA_READ_RSP; ++ wr->state = WR_WAITING; ++ wr->length = rdma_len; ++ wr->msg_id = __be32_to_cpu(pdu->read_req.rdma_id); ++ wr->num_ds = 1; ++ wr->ds_list[0].mr = mr; ++ wr->ds_list[0].offset = rdma_addr - mr->addr; ++ wr->ds_list[0].length = rdma_len; ++ wr->ds_list[0].in_use = 1; ++ ++ ibscif_append_wqe(&qp->iq); ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ; ++ ++ return 0; ++} ++ ++static int ibscif_process_read_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ int err; ++ ++ /* Find the requesting sq wr. */ ++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->read_rsp.rdma_id)); ++ if (unlikely(!wr)) ++ return -EBADR; ++ if (unlikely(wr->opcode != WR_RDMA_READ)) ++ return -ENOMSG; ++ ++ skb_pull(skb, sizeof(pdu->read_rsp)); ++ ++ pdu->read_rsp.rdma_offset = __be32_to_cpu(pdu->read_rsp.rdma_offset); ++ ++ err = ibscif_place_data(qp, wr, skb, pdu->hdr.length, pdu->read_rsp.rdma_offset, pdu->hdr.seq_num); ++ if (unlikely(err)) ++ return err; ++ ++ if (ibscif_pdu_is_last(pdu->hdr.opcode)) { ++ /* We've got the last of the read data. 
*/ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.final_length = pdu->read_rsp.rdma_offset + pdu->hdr.length; ++ } ++ ++ return 0; ++} ++ ++static int ibscif_process_atomic_req(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ struct page *page; ++ u64 *addr; ++ u32 offset, rkey, msg_id; ++ u16 opcode; ++ ++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_ATOMIC))) ++ return -EACCES; ++ ++ opcode = ibscif_pdu_base_type(pdu->hdr.opcode); ++ if (opcode == ibscif_op_comp_swap) { ++ addr = (u64 *)__be64_to_cpu(pdu->comp_swap.atomic_address); ++ rkey = __be32_to_cpu(pdu->comp_swap.atomic_key); ++ msg_id = __be32_to_cpu(pdu->comp_swap.atomic_id); ++ } else { ++ addr = (u64 *)__be64_to_cpu(pdu->fetch_add.atomic_address); ++ rkey = __be32_to_cpu(pdu->fetch_add.atomic_key); ++ msg_id = __be32_to_cpu(pdu->fetch_add.atomic_id); ++ } ++ ++ if (unlikely((u64)addr & (sizeof *addr - 1))) ++ return -EADDRNOTAVAIL; ++ if (unlikely((addr + (sizeof *addr - 1)) < addr)) ++ return -EOVERFLOW; ++ ++ mr = ibscif_validate_mr(rkey, (u64)addr, sizeof *addr, qp->ibqp.pd, IB_ACCESS_REMOTE_ATOMIC); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ++ wr = ibscif_reserve_wqe(&qp->iq); ++ if (unlikely(IS_ERR(wr))) { ++ ibscif_put_mr(mr); ++ return PTR_ERR(wr); ++ } ++ ++ /* Determine which page to map. */ ++ offset = ((u64)addr - mr->addr) + (mr->addr & ~PAGE_MASK); ++ page = mr->page[offset >> PAGE_SHIFT]; ++ offset &= ~PAGE_MASK; ++ ++ /* Lock to perform the atomic operation atomically. */ ++ spin_lock_bh(&qp->dev->atomic_op); ++ ++ addr = ibscif_map_src(page) + offset; ++ wr->atomic_rsp.orig_data = *addr; ++ if (opcode == ibscif_op_fetch_add) ++ *addr += __be64_to_cpu(pdu->fetch_add.add_data); ++ else if (wr->atomic_rsp.orig_data == __be64_to_cpu(pdu->comp_swap.comp_data)) ++ *addr = __be64_to_cpu(pdu->comp_swap.swap_data); ++ ibscif_unmap_src(page, addr); ++ ++ ibscif_put_mr(mr); ++ ++ /* Atomic operation is complete. */ ++ spin_unlock_bh(&qp->dev->atomic_op); ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->opcode = WR_ATOMIC_RSP; ++ wr->state = WR_WAITING; ++ wr->length = 0; ++ wr->msg_id = msg_id; ++ wr->num_ds = 0; ++ wr->atomic_rsp.opcode = (opcode==ibscif_op_comp_swap)? ibscif_op_comp_swap_rsp : ibscif_op_fetch_add_rsp; ++ /* The wr->atomic_rsp.orig_data field was set above. */ ++ ++ ibscif_append_wqe(&qp->iq); ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ; ++ ++ return 0; ++} ++ ++static int ibscif_process_atomic_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ u16 opcode; ++ int err; ++ ++ if (unlikely(!ibscif_pdu_is_last(pdu->atomic_rsp.hdr.opcode))) ++ return -EINVAL; ++ ++ /* Find the requesting sq wr. */ ++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->atomic_rsp.atomic_id)); ++ if (unlikely(!wr)) ++ return -EBADR; ++ ++ opcode = ibscif_pdu_base_type(pdu->hdr.opcode); ++ if (unlikely(wr->opcode != ((opcode == ibscif_op_comp_swap_rsp) ? 
++ WR_ATOMIC_CMP_AND_SWP : WR_ATOMIC_FETCH_AND_ADD))) ++ return -ENOMSG; ++ ++ skb_pull(skb, (unsigned long)&pdu->atomic_rsp.orig_data - (unsigned long)pdu); ++ ++ pdu->atomic_rsp.orig_data = __be64_to_cpu(pdu->atomic_rsp.orig_data); ++ err = ibscif_place_data(qp, wr, skb, sizeof pdu->atomic_rsp.orig_data, 0, pdu->hdr.seq_num); ++ if (unlikely(err)) ++ return err; ++ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.final_length = sizeof pdu->atomic_rsp.orig_data; ++ ++ return 0; ++} ++ ++static int ibscif_process_disconnect(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ ibscif_qp_remote_disconnect(qp, __be32_to_cpu(pdu->disconnect.reason)); ++ return 0; ++} ++ ++static int ibscif_process_send_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_ds *ds; ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ struct ibscif_mreg_info *mreg; ++ u32 num_rma_addrs; ++ u64 rma_offset; ++ u32 rma_length; ++ u32 total; ++ int seg_num; ++ int cur_rma_addr; ++ u32 xfer_len, ds_offset; ++ int err; ++ u64 loffset; ++ u32 dma_size = 0; ++ int rma_flag = 0; ++ ++ if (unlikely(!qp->conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return -EACCES; ++ } ++ ++ pdu->send.msg_id = __be32_to_cpu(pdu->send.msg_id); ++ spin_lock_bh(&qp->rq.lock); ++ if (unlikely(pdu->send.msg_id >= qp->rq.next_msg_id)) { ++ spin_unlock_bh(&qp->rq.lock); ++ printk(KERN_ALERT PFX "%s: ERROR: message arrives before recv is posted. msg_id=%d, rq.next_msg_id=%d\n", ++ __func__, pdu->send.msg_id, qp->rq.next_msg_id); ++ return -EBADRQC; ++ } ++ spin_unlock_bh(&qp->rq.lock); ++ ++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->send.msg_id); ++ if (unlikely(!wr)) ++ return -EBADR; ++ ++ pdu->send.msg_length = __be32_to_cpu(pdu->send.msg_length); ++ if (unlikely(pdu->send.msg_length > wr->length)) ++ return -EMSGSIZE; ++ ++ pdu->send.msg_offset = __be32_to_cpu(pdu->send.msg_offset); ++ if (unlikely(pdu->send.msg_offset > pdu->send.msg_length)) ++ return -EINVAL; ++ ++ if (unlikely((pdu->hdr.length + pdu->send.msg_offset) > wr->length)) ++ return -ESPIPE; ++ ++ total = 0; ++ ++ num_rma_addrs = __be32_to_cpu(pdu->send.num_rma_addrs); ++ cur_rma_addr = 0; ++ rma_offset = __be64_to_cpu(pdu->send.rma_addrs[cur_rma_addr].offset); ++ rma_length = __be32_to_cpu(pdu->send.rma_addrs[cur_rma_addr].length); ++ ++ ds_offset = pdu->send.msg_offset; ++ ds = wr->ds_list; ++ seg_num = 0; ++ while ((ds_offset >= ds->length) && (seg_num < wr->num_ds)) { ++ ds_offset -= ds->length; ++ ds++; ++ seg_num++; ++ } ++ ++ err = 0; ++ while (total < pdu->send.msg_length && !err) { ++ if (unlikely(seg_num >= wr->num_ds)) ++ return -EMSGSIZE; ++ ++ if (!ds->in_use) { ++ mr = ibscif_get_mr(ds->lkey); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ds->in_use = 1; ++ if (unlikely(mr != ds->mr)) ++ return -ENXIO; ++ if (unlikely(!(mr->access & IB_ACCESS_LOCAL_WRITE))) ++ return -EACCES; ++ } else ++ mr = ds->mr; ++ ++ mreg = ibscif_mr_get_mreg(mr, qp->conn); ++ if (!mreg) ++ return -EACCES; ++ ++ while (ds->length > ds_offset) { ++ xfer_len = min( ds->length - ds_offset, rma_length ); ++ if (xfer_len) { ++ loffset = mreg->offset + ds->offset + ds_offset; ++ dma_size += ibscif_dma_size(xfer_len, rma_offset); ++ ++ if ((total + xfer_len >= pdu->send.msg_length) && dma_size) ++ rma_flag = SCIF_RMA_SYNC; ++ ++ err = scif_readfrom(qp->conn->ep, loffset, xfer_len, rma_offset, rma_flag); ++ if (err) { 
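++				/*
++				 * scif_readfrom() returns 0 on success or a
++				 * negative error code; the failure is reported
++				 * back to the initiator via wr->rma_rsp.error
++				 * in the WR_RMA_RSP built further below.
++				 */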
++ printk(KERN_ALERT PFX "%s: scif_readfrom (%d bytes) returns %d\n", __func__, xfer_len, err); ++ break; ++ } ++ ++ ds_offset += xfer_len; ++ rma_offset += xfer_len; ++ rma_length -= xfer_len; ++ total += xfer_len; ++ ++ if (total >= pdu->send.msg_length) ++ break; ++ } ++ if (rma_length == 0) { ++ cur_rma_addr++; ++ if (unlikely(cur_rma_addr >= num_rma_addrs)) ++ return -EMSGSIZE; ++ ++ rma_offset = __be64_to_cpu(pdu->send.rma_addrs[cur_rma_addr].offset); ++ rma_length = __be32_to_cpu(pdu->send.rma_addrs[cur_rma_addr].length); ++ } ++ } ++ ++ seg_num++; ++ ds++; ++ } ++ ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->send.immed_data); ++ wr->sar.rea.final_length = pdu->send.msg_length; ++ ++ /* Respond to the initiator with the result */ ++ wr = ibscif_reserve_wqe(&qp->iq); ++ if (unlikely(IS_ERR(wr))) { ++ return PTR_ERR(wr); ++ } ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->opcode = WR_RMA_RSP; ++ wr->state = WR_WAITING; ++ wr->length = 0; ++ wr->msg_id = __be32_to_cpu(pdu->send.rma_id); ++ wr->num_ds = 0; ++ wr->rma_rsp.xfer_length = total; ++ wr->rma_rsp.error = err; ++ ++ ibscif_append_wqe(&qp->iq); ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ; ++ ++ return 0; ++} ++ ++static int ibscif_process_write_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ u64 rdma_addr; ++ u32 rdma_len; ++ struct ibscif_mreg_info *mreg; ++ u32 num_rma_addrs; ++ u64 rma_offset; ++ u32 rma_length; ++ u32 total; ++ int i; ++ int err; ++ u64 loffset; ++ u32 dma_size = 0; ++ int rma_flag = 0; ++ ++ if (unlikely(!qp->conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return -EACCES; ++ } ++ ++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_WRITE))) ++ return -EACCES; ++ ++ /* Writes with immediate data consume an rq wqe. 
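++   Unlike ibscif_process_write() above, no payload travels in the skb on
++   this path: the initiator describes its source buffer with rma_addrs[]
++   and the data is pulled with scif_readfrom() below, after which the
++   transferred length and status are returned in a WR_RMA_RSP on the iq.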
*/ ++ if (ibscif_pdu_is_immed(pdu->hdr.opcode)) { ++ pdu->write.msg_id = __be32_to_cpu(pdu->write.msg_id); ++ spin_lock_bh(&qp->rq.lock); ++ if (unlikely(pdu->write.msg_id >= qp->rq.next_msg_id)) { ++ spin_unlock_bh(&qp->rq.lock); ++ return -EBADRQC; ++ } ++ spin_unlock_bh(&qp->rq.lock); ++ ++ wr = ibscif_wr_by_msg_id(&qp->rq, pdu->write.msg_id); ++ if (unlikely(!wr)) ++ return -EBADR; ++ } ++ else ++ wr = NULL; ++ ++ rdma_addr = __be64_to_cpu(pdu->write.rdma_address); ++ rdma_len = __be32_to_cpu(pdu->write.rma_length); ++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr)) ++ return -EOVERFLOW; ++ ++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->write.rdma_key), rdma_addr, ++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_WRITE); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ++ mreg = ibscif_mr_get_mreg(mr, qp->conn); ++ if (!mreg) ++ return -EACCES; ++ ++ total = 0; ++ err = 0; ++ num_rma_addrs = __be32_to_cpu(pdu->write.num_rma_addrs); ++ for (i=0; iwrite.rma_addrs[i].offset); ++ rma_length = __be32_to_cpu(pdu->write.rma_addrs[i].length); ++ ++ if (rdma_len < rma_length) ++ rma_length = rdma_len; ++ ++ if (rma_length == 0) ++ continue; ++ ++ loffset = mreg->offset + (rdma_addr - mr->addr) + total; ++ dma_size += ibscif_dma_size(rma_length, rma_offset); ++ ++ if ((i==num_rma_addrs-1) && dma_size) ++ rma_flag = SCIF_RMA_SYNC; ++ ++ err = scif_readfrom(qp->conn->ep, loffset, rma_length, rma_offset, rma_flag); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: scif_readfrom (%d bytes) returns %d\n", __func__, rma_length, err); ++ break; ++ } ++ ++ rdma_len -= rma_length; ++ total += rma_length; ++ } ++ ++ ibscif_put_mr(mr); ++ ++ if (wr) { ++ wr->sar.rea.final_length = total; ++ wr->state = WR_LAST_SEEN; ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.immediate_data = __be32_to_cpu(pdu->write.immed_data); ++ } ++ ++ /* Respond to the initiator with the result */ ++ wr = ibscif_reserve_wqe(&qp->iq); ++ if (unlikely(IS_ERR(wr))) { ++ return PTR_ERR(wr); ++ } ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->opcode = WR_RMA_RSP; ++ wr->state = WR_WAITING; ++ wr->length = 0; ++ wr->msg_id = __be32_to_cpu(pdu->write.rma_id); ++ wr->num_ds = 0; ++ wr->rma_rsp.xfer_length = total; ++ wr->rma_rsp.error = err; ++ ++ ibscif_append_wqe(&qp->iq); ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ; ++ ++ return 0; ++} ++ ++static int ibscif_process_read_rma(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_mr *mr; ++ u64 rdma_addr; ++ u32 rdma_len; ++ struct ibscif_mreg_info *mreg; ++ u32 num_rma_addrs; ++ u64 rma_offset; ++ u32 rma_length; ++ u32 total; ++ int i; ++ int err; ++ u64 loffset; ++ u32 dma_size = 0; ++ int rma_flag = 0; ++ ++ if (unlikely(!qp->conn)) { ++ printk(KERN_ALERT PFX "%s: ERROR: qp->conn == NULL\n", __func__); ++ return -EACCES; ++ } ++ ++ if (unlikely(!(qp->access & IB_ACCESS_REMOTE_READ))) ++ return -EACCES; ++ ++ rdma_addr = __be64_to_cpu(pdu->read_req.rdma_address); ++ rdma_len = __be32_to_cpu(pdu->read_req.rdma_length); ++ if (unlikely((rdma_addr + (rdma_len - 1)) < rdma_addr)) ++ return -EOVERFLOW; ++ ++ mr = ibscif_validate_mr(__be32_to_cpu(pdu->read_req.rdma_key), rdma_addr, ++ rdma_len, qp->ibqp.pd, IB_ACCESS_REMOTE_READ); ++ if (unlikely(IS_ERR(mr))) ++ return PTR_ERR(mr); ++ ++ mreg = ibscif_mr_get_mreg(mr, qp->conn); ++ if (!mreg) ++ return -EACCES; ++ ++ total = 0; ++ err = 0; ++ num_rma_addrs = __be32_to_cpu(pdu->read_req.num_rma_addrs); ++ for (i=0; 
iread_req.rma_addrs[i].offset); ++ rma_length = __be32_to_cpu(pdu->read_req.rma_addrs[i].length); ++ ++ if (rdma_len < rma_length) ++ rma_length = rdma_len; ++ ++ if (rma_length == 0) ++ continue; ++ ++ loffset = mreg->offset + (rdma_addr - mr->addr) + total; ++ dma_size += ibscif_dma_size(rma_length, rma_offset); ++ ++ if ((i==num_rma_addrs-1) && dma_size) ++ rma_flag = SCIF_RMA_SYNC; ++ ++ err = scif_writeto(qp->conn->ep, loffset, rma_length, rma_offset, rma_flag); ++ if (err) { ++ printk(KERN_ALERT PFX "%s: scif_writeto (%d bytes) returns %d\n", __func__, rma_length, err); ++ break; ++ } ++ ++ rdma_len -= rma_length; ++ total += rma_length; ++ } ++ ++ ibscif_put_mr(mr); ++ ++ /* Respond to the initiator with the result */ ++ wr = ibscif_reserve_wqe(&qp->iq); ++ if (unlikely(IS_ERR(wr))) { ++ return PTR_ERR(wr); ++ } ++ ++ memset(&wr->sar, 0, sizeof wr->sar); ++ ++ wr->opcode = WR_RMA_RSP; ++ wr->state = WR_WAITING; ++ wr->length = 0; ++ wr->msg_id = __be32_to_cpu(pdu->read_req.rdma_id); ++ wr->num_ds = 0; ++ wr->rma_rsp.xfer_length = total; ++ wr->rma_rsp.error = err; ++ ++ ibscif_append_wqe(&qp->iq); ++ qp->schedule |= SCHEDULE_RESUME | SCHEDULE_IQ; ++ ++ return 0; ++} ++ ++static int ibscif_process_rma_rsp(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ struct ibscif_wr *wr; ++ ++ wr = ibscif_wr_by_msg_id(&qp->sq, __be32_to_cpu(pdu->rma_rsp.rma_id)); ++ if (unlikely(!wr)) ++ return -EBADR; ++ if (unlikely(!wr->use_rma)) ++ return -ENOMSG; ++ ++ if (wr->opcode == WR_RDMA_READ) { ++ /* ibscif_clear_ds_refs() is called in ibscif_schedule_rx_completions() */ ++ wr->state = WR_LAST_SEEN; ++ } ++ else { ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ wr->state = WR_COMPLETED; ++ } ++ ++ wr->sar.rea.opcode = pdu->hdr.opcode; ++ wr->sar.rea.last_packet_seq = pdu->hdr.seq_num; ++ wr->sar.rea.final_length = pdu->rma_rsp.xfer_length; ++ ++ return 0; ++} ++ ++static int ibscif_process_pdu(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct sk_buff *skb) ++{ ++ int err; ++ ++ switch (ibscif_pdu_base_type(pdu->hdr.opcode)) { ++ case ibscif_op_ud: ++ err = ibscif_process_ud(qp, pdu, skb); ++ break; ++ case ibscif_op_send: ++ err = ibscif_process_send(qp, pdu, skb); ++ break; ++ case ibscif_op_write: ++ err = ibscif_process_write(qp, pdu, skb); ++ break; ++ case ibscif_op_read: ++ err = ibscif_process_read(qp, pdu, skb); ++ break; ++ case ibscif_op_read_rsp: ++ err = ibscif_process_read_rsp(qp, pdu, skb); ++ break; ++ case ibscif_op_comp_swap_rsp: ++ case ibscif_op_fetch_add_rsp: ++ err = ibscif_process_atomic_rsp(qp, pdu, skb); ++ break; ++ case ibscif_op_comp_swap: ++ case ibscif_op_fetch_add: ++ err = ibscif_process_atomic_req(qp, pdu, skb); ++ break; ++ case ibscif_op_ack: ++ /* Handled in piggyback ack processing. */ ++ err = 0; ++ break; ++ case ibscif_op_disconnect: ++ /* Post send completions before the disconnect flushes the queues. */ ++ ibscif_process_ack(qp, &pdu->hdr); ++ /* Now disconnect the QP. 
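++		   ibscif_process_disconnect() always returns 0, so this case
++		   never triggers the protocol-error handling at the bottom of
++		   this routine.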
*/ ++ err = ibscif_process_disconnect(qp, pdu, skb); ++ break; ++ case ibscif_op_send_rma: ++ err = ibscif_process_send_rma(qp, pdu, skb); ++ break; ++ case ibscif_op_write_rma: ++ err = ibscif_process_write_rma(qp, pdu, skb); ++ break; ++ case ibscif_op_read_rma: ++ err = ibscif_process_read_rma(qp, pdu, skb); ++ break; ++ case ibscif_op_rma_rsp: ++ err = ibscif_process_rma_rsp(qp, pdu, skb); ++ break; ++ default: ++ printk(KERN_INFO PFX "Received invalid opcode (%x)\n", ++ ibscif_pdu_base_type(pdu->hdr.opcode)); ++ err = IBSCIF_REASON_INVALID_OPCODE; ++ break; ++ } ++ ++ if (unlikely(err)) { ++ printk(KERN_ALERT PFX "%s: ERROR: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode)); ++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL); ++ } ++ ++ return err; ++} ++ ++static int update_rx_seq_numbers(struct ibscif_qp *qp, union ibscif_pdu *pdu, struct ibscif_rx_state *rx) ++{ ++ u32 seq_num = pdu->hdr.seq_num; ++ ++ if (pdu->hdr.opcode == ibscif_op_ack) ++ return 0; ++ ++ if (seq_num != rx->last_in_seq + 1) ++ return 0; ++ ++ rx->last_in_seq = seq_num; ++ ++ return 1; ++} ++ ++static void ibscif_process_qp_skb(struct ibscif_qp *qp, struct sk_buff *skb) ++{ ++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data; ++ struct ibscif_rx_state *rx; ++ int err = 0; ++ ++ /* Start with no scheduling. */ ++ qp->schedule = 0; ++ ++ rx = ibscif_pdu_is_iq(pdu->hdr.opcode) ? &qp->wire.iq.rx : &qp->wire.sq.rx; ++ ++ if (ibscif_process_pdu(qp, pdu, skb) == IBSCIF_REASON_INVALID_OPCODE) ++ return; ++ ++ /* skip ack and seq_num for UD QP */ ++ if (qp->ibqp.qp_type == IB_QPT_UD) { ++ err = ibscif_schedule_rx_completions(qp, 0, rx); ++ if (unlikely(err)) { ++ printk(KERN_ALERT PFX "%s: rx completion error: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode)); ++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL); ++ } ++ goto done; ++ } ++ ++ /* Process piggybacked acks. */ ++ ibscif_process_ack(qp, &pdu->hdr); ++ ++ if (update_rx_seq_numbers(qp, pdu, rx)) { ++ /* PDU is in sequence so schedule/remove completed work requests. */ ++ err = ibscif_schedule_rx_completions(qp, ibscif_pdu_is_iq(pdu->hdr.opcode), rx); ++ if (unlikely(err)) { ++ printk(KERN_ALERT PFX "%s: rx completion error: err=%d, opcode=%d\n", __func__, err, ibscif_pdu_base_type(pdu->hdr.opcode)); ++ ibscif_protocol_error(qp, IBSCIF_REASON_QP_FATAL); ++ goto done; ++ } ++ } ++ ++ /* Generate an ack if forced or if the current window dictates it. */ ++ if (ibscif_pdu_is_force_ack(pdu->hdr.opcode)) { ++ ibscif_send_ack(qp); ++ } else if (pdu->hdr.opcode != ibscif_op_ack) { ++ u32 window = ibscif_rx_window(rx); ++ if (window && (window % (window_size / MIN_WINDOW_SIZE)) == 0) ++ ibscif_send_ack(qp); ++ } ++done: ++ /* Run the scheduler if it was requested. */ ++ if (qp->schedule & SCHEDULE_RESUME) { ++ if (qp->schedule & SCHEDULE_SQ) ++ ibscif_schedule(&qp->sq); ++ if (qp->schedule & SCHEDULE_IQ) ++ ibscif_schedule(&qp->iq); ++ } ++ ++ return; ++} ++ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,21) ++#define skb_mac_header(skb) (skb->mac.raw) ++#endif ++ ++static int ibscif_recv_pkt(struct sk_buff *skb, struct ibscif_dev *dev, scif_epd_t ep, struct ibscif_conn *conn) ++{ ++ union ibscif_pdu *pdu = (union ibscif_pdu *)skb->data; ++ struct ibscif_qp *qp = ERR_PTR(-ENOENT); ++ ++ /* Convert the base header. 
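++   All base-header fields arrive in network byte order and are converted
++   to host order exactly once here; the per-opcode handlers convert only
++   the fields of their own extended headers (msg_id, rdma_address, etc.).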
*/ ++ pdu->hdr.opcode = __be16_to_cpu(pdu->hdr.opcode); ++ pdu->hdr.length = __be16_to_cpu(pdu->hdr.length); ++ pdu->hdr.dst_qp = __be32_to_cpu(pdu->hdr.dst_qp); ++ pdu->hdr.src_qp = __be32_to_cpu(pdu->hdr.src_qp); ++ pdu->hdr.seq_num = __be32_to_cpu(pdu->hdr.seq_num); ++ pdu->hdr.sq_ack_num = __be32_to_cpu(pdu->hdr.sq_ack_num); ++ pdu->hdr.iq_ack_num = __be32_to_cpu(pdu->hdr.iq_ack_num); ++ ++ if (pdu->hdr.opcode == ibscif_op_close) { ++ //printk(KERN_INFO PFX "%s: op_close, conn=%p, local_close=%d\n", __func__, conn, conn->local_close); ++ conn->remote_close = 1; ++ goto done_no_qp; ++ } ++ else if (pdu->hdr.opcode == ibscif_op_reopen) { ++ //printk(KERN_INFO PFX "%s: op_reopen, conn=%p, local_close=%d\n", __func__, conn, conn->local_close); ++ conn->remote_close = 0; ++ goto done_no_qp; ++ } ++ else if (pdu->hdr.opcode == ibscif_op_cm) { ++ ibscif_process_cm_skb(skb, conn); ++ goto done_no_qp; ++ } ++ ++ qp = ibscif_get_qp(pdu->hdr.dst_qp); ++ if (unlikely(IS_ERR(qp) || ++ (qp->state != QP_CONNECTED && qp->ibqp.qp_type != IB_QPT_UD) || ++ (qp->ibqp.qp_num != pdu->hdr.dst_qp) || ++ (qp->remote_qpn != pdu->hdr.src_qp && qp->ibqp.qp_type != IB_QPT_UD) || ++ 0)) { ++ /* Disconnect the rogue. */ ++ ibscif_reflect_disconnect(qp, &pdu->hdr, skb, IBSCIF_REASON_INVALID_QP); ++ goto done; ++ } ++ ++ if (qp->ibqp.qp_type == IB_QPT_UD) ++ ibscif_qp_add_ud_conn(qp, conn); ++ ++ DEV_STAT(qp->dev, packets_rcvd++); ++ DEV_STAT(qp->dev, bytes_rcvd += skb->len); ++ ++ ibscif_process_qp_skb(qp, skb); ++done: ++ if (likely(!IS_ERR(qp))) ++ ibscif_put_qp(qp); ++ ++done_no_qp: ++ kfree_skb(skb); ++ return 0; ++} ++ ++void ibscif_do_recv( struct ibscif_dev *dev, scif_epd_t ep, struct ibscif_conn *conn ) ++{ ++ struct sk_buff *skb; ++ union ibscif_pdu *pdu; ++ int hdr_size, payload_size, recv_size, pdu_size; ++ char *recv_buffer; ++ int ret; ++ ++ skb = dev_alloc_skb( IBSCIF_MTU ); ++ if (unlikely(skb==NULL)) { ++ printk(KERN_ALERT PFX "%s(): fail to allocate skb, exiting\n", __func__); ++ return; ++ } ++ ++ skb->protocol = IBSCIF_PACKET_TYPE; ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ skb->priority = TC_PRIO_CONTROL; /* highest defined priority */ ++ skb->dev = (void *) dev; ++ ++ pdu = (union ibscif_pdu *)skb->data; ++ ++ /* get the base header first so the packet size can be determinied */ ++ recv_size = sizeof(pdu->hdr); ++ recv_buffer = (char *)&pdu->hdr; ++ while (recv_size) { ++ ret = scif_recv(ep, recv_buffer, recv_size, blocking_recv ? SCIF_RECV_BLOCK : 0); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s(): fail to receive hdr, ret=%d, expecting %d\n", __func__, ret, (int)recv_size); ++ if (ret == -ENOTCONN || ret == -ECONNRESET) { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: ep disconnected by peer (%d). 
conn=%p, local_close=%d\n", ++ __func__, ret, conn, conn->local_close); ++ ibscif_remove_ep( dev, ep ); ++ ibscif_refresh_pollep_list(); ++ conn->remote_close = 1; ++ if (conn->local_close) { ++ ibscif_free_conn(conn); ++ } ++ } ++ goto errout; ++ } ++ recv_size -= ret; ++ recv_buffer += ret; ++ } ++ ++ hdr_size = __be16_to_cpu(pdu->hdr.hdr_size); ++ payload_size = __be16_to_cpu(pdu->hdr.length); ++ pdu_size = hdr_size + payload_size; ++ if (unlikely(pdu_size > IBSCIF_MTU)) { ++ printk(KERN_ALERT PFX "%s(): packet size exceed MTU, size=%d\n", __func__, pdu_size); ++ goto errout; ++ } ++ ++ recv_size = pdu_size - sizeof(pdu->hdr); ++ recv_buffer = (char *)pdu + sizeof(pdu->hdr); ++ ++ /* get the remaining of the packet */ ++ //printk(KERN_INFO PFX "%s(): hdr_size=%d payload_size=%d pdu_size=%d recv_size=%d\n", __func__, hdr_size, payload_size, pdu_size, recv_size); ++ ret = 0; ++ while (recv_size) { ++ ret = scif_recv(ep, recv_buffer, recv_size, blocking_recv ? SCIF_RECV_BLOCK : 0); ++ ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s(): fail to receive data, ret=%d, expecting %d\n", __func__, ret, recv_size); ++ break; ++ } ++ ++ recv_size -= ret; ++ recv_buffer += ret; ++ } ++ ++ if (ret < 0) ++ goto errout; ++ ++ skb->len = pdu_size; ++ skb->data_len = payload_size; ++ skb->tail += pdu_size; ++ ++ ibscif_recv_pkt(skb, dev, ep, conn); ++ return; ++ ++errout: ++ kfree_skb(skb); ++} ++ ++#define IBSCIF_MAX_POLL_COUNT (IBSCIF_MAX_DEVICES * 2) ++static struct scif_pollepd poll_eps[IBSCIF_MAX_POLL_COUNT]; ++static struct ibscif_dev *poll_devs[IBSCIF_MAX_POLL_COUNT]; ++static int poll_types[IBSCIF_MAX_POLL_COUNT]; ++static struct ibscif_conn *poll_conns[IBSCIF_MAX_POLL_COUNT]; ++static struct task_struct *poll_thread = NULL; ++static atomic_t poll_eps_changed = ATOMIC_INIT(0); ++static volatile int poll_thread_running = 0; ++ ++void ibscif_refresh_pollep_list( void ) ++{ ++ atomic_set(&poll_eps_changed, 1); ++} ++ ++int ibscif_poll_thread( void *unused ) ++{ ++ int poll_count = 0; ++ int ret; ++ int i; ++ int busy; ++ int idle_count = 0; ++ ++ poll_thread_running = 1; ++ while (!kthread_should_stop()) { ++ if (atomic_xchg(&poll_eps_changed, 0)) { ++ poll_count = IBSCIF_MAX_POLL_COUNT; ++ ibscif_get_pollep_list( poll_eps, poll_devs, poll_types, poll_conns, &poll_count ); ++ } ++ ++ if (poll_count == 0) { ++ schedule(); ++ continue; ++ } ++ ++ ret = scif_poll(poll_eps, poll_count, 1000); /* 1s timeout */ ++ ++ busy = 0; ++ if (ret > 0) { ++ for (i=0; ilocal_close); ++ conn->remote_close = 1; ++ if (conn->local_close) { ++ ibscif_free_conn(conn); ++ } ++ } ++ busy = 1; ++ } ++ } ++ } ++ ++ if (busy) { ++ idle_count = 0; ++ } ++ else { ++ idle_count++; ++ /* close unused endpoint after 60 seconds */ ++ if (idle_count == 60) { ++ if (ibscif_cleanup_idle_conn()) ++ ibscif_refresh_pollep_list(); ++ idle_count = 0; ++ } ++ /* pick up the unprocessed items in the xmit queue */ ++ if (!skb_queue_empty(&xmit_queue)) ++ ibscif_dev_queue_xmit(NULL); ++ schedule(); ++ } ++ } ++ ++ poll_thread_running = 0; ++ return 0; ++} ++ ++void ibscif_protocol_init_pre(void) ++{ ++ skb_queue_head_init(&xmit_queue); ++} ++ ++void ibscif_protocol_init_post(void) ++{ ++ poll_thread = kthread_run( ibscif_poll_thread, NULL, "ibscif_polld" ); ++} ++ ++void ibscif_protocol_cleanup(void) ++{ ++ kthread_stop( poll_thread ); ++ ++ while (poll_thread_running) ++ schedule(); ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_protocol.h a8/drivers/infiniband/hw/scif/ibscif_protocol.h +--- a7/drivers/infiniband/hw/scif/ibscif_protocol.h 
1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_protocol.h 2015-02-23 10:14:37.487809663 -0800 +@@ -0,0 +1,395 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifndef IBSCIF_PROTOCOL_H ++#define IBSCIF_PROTOCOL_H ++ ++/* ++ * Protocol EtherType ++ */ ++#define IBSCIF_PACKET_TYPE 0x8086 ++ ++/* ++ * Base protocol header version ++ */ ++#define IBSCIF_PROTOCOL_VER_1 1 ++#define IBSCIF_PROTOCOL_VER IBSCIF_PROTOCOL_VER_1 ++ ++/* ++ * Protocol opcode values - All other values are reserved. 
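++ *
++ * An opcode on the wire is a base operation number in the low bits with
++ * zero or more modifier flags or'd in, for example:
++ *
++ *	ibscif_op_send_last_se  = 0 | 0x4000 | 0x1000 = 0x5000
++ *	ibscif_op_read_rsp_last = 2 | 0x0400 | 0x4000 = 0x4402
++ *
++ * ibscif_pdu_base_type() strips the last/se/immed/force_ack flags but not
++ * the iq flag, so a read response (0x0402) stays distinct from a read
++ * request (2) in the receive-side switch.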
++ */ ++#define ibscif_last_flag 0x4000 ++#define ibscif_immed_flag 0x2000 ++#define ibscif_se_flag 0x1000 ++#define ibscif_force_ack_flag 0x0800 ++#define ibscif_iq_flag 0x0400 ++ ++#define ibscif_op_send 0 ++#define ibscif_op_send_last (ibscif_op_send | ibscif_last_flag) ++#define ibscif_op_send_last_se (ibscif_op_send | ibscif_last_flag | ibscif_se_flag) ++#define ibscif_op_send_immed (ibscif_op_send | ibscif_immed_flag) ++#define ibscif_op_send_immed_se (ibscif_op_send | ibscif_immed_flag | ibscif_se_flag) ++ ++#define ibscif_op_write 1 ++#define ibscif_op_write_last (ibscif_op_write | ibscif_last_flag) ++#define ibscif_op_write_immed (ibscif_op_write | ibscif_immed_flag) ++#define ibscif_op_write_immed_se (ibscif_op_write | ibscif_immed_flag | ibscif_se_flag) ++ ++#define ibscif_op_read 2 ++#define ibscif_op_read_rsp (ibscif_op_read | ibscif_iq_flag) ++#define ibscif_op_read_rsp_last (ibscif_op_read_rsp | ibscif_last_flag) ++ ++#define ibscif_op_comp_swap 3 ++#define ibscif_op_comp_swap_rsp (ibscif_op_comp_swap | ibscif_iq_flag) ++ ++#define ibscif_op_fetch_add 4 ++#define ibscif_op_fetch_add_rsp (ibscif_op_fetch_add | ibscif_iq_flag) ++ ++#define ibscif_op_ack 5 ++#define ibscif_op_disconnect 6 ++ ++#define ibscif_op_send_rma 7 ++#define ibscif_op_send_rma_se (ibscif_op_send_rma | ibscif_se_flag) ++#define ibscif_op_send_rma_immed (ibscif_op_send_rma | ibscif_immed_flag) ++#define ibscif_op_send_rma_immed_se (ibscif_op_send_rma | ibscif_immed_flag | ibscif_se_flag) ++ ++#define ibscif_op_write_rma 8 ++#define ibscif_op_write_rma_immed (ibscif_op_write_rma | ibscif_immed_flag) ++#define ibscif_op_write_rma_immed_se (ibscif_op_write_rma | ibscif_immed_flag | ibscif_se_flag) ++ ++#define ibscif_op_read_rma 9 ++#define ibscif_op_rma_rsp (10 | ibscif_iq_flag) ++ ++#define ibscif_op_reg 11 ++#define ibscif_op_dereg 12 ++ ++#define ibscif_op_close 13 ++#define ibscif_op_reopen 14 ++ ++#define ibscif_op_ud 15 ++#define ibscif_op_cm 16 ++ ++#define ibscif_pdu_is_last(op) (op & ibscif_last_flag) ++#define ibscif_pdu_is_immed(op) (op & ibscif_immed_flag) ++#define ibscif_pdu_is_se(op) (op & ibscif_se_flag) ++#define ibscif_pdu_is_force_ack(op) (op & ibscif_force_ack_flag) ++#define ibscif_pdu_is_iq(op) (op & ibscif_iq_flag) ++ ++#define ibscif_pdu_set_last(op) (op | ibscif_last_flag) ++#define ibscif_pdu_set_immed(op) (op | ibscif_immed_flag) ++#define ibscif_pdu_set_se(op) (op | ibscif_se_flag) ++#define ibscif_pdu_set_force_ack(op) (op | ibscif_force_ack_flag) ++#define ibscif_pdu_set_iq(op) (op | ibscif_iq_flag) ++ ++#define ibscif_pdu_base_type(op) \ ++ (op & ~(ibscif_last_flag | \ ++ ibscif_se_flag | \ ++ ibscif_immed_flag | \ ++ ibscif_force_ack_flag)) ++ ++/* ++ * Remote address descriptor for SCIF RMA operations ++ */ ++struct rma_addr { ++ __be64 offset; ++ __be32 length; ++ __be32 reserved; ++} __attribute__ ((packed)); ++ ++/* ++ * Base header present in every packet ++ */ ++struct base_hdr { ++ __be16 opcode; ++ __be16 length; ++ __be32 dst_qp; ++ __be32 src_qp; ++ __be32 seq_num; ++ __be32 sq_ack_num; ++ __be32 iq_ack_num; ++ __be16 hdr_size; ++ __be16 reserved[3]; ++} __attribute__ ((packed)); ++ ++/* ++ * UD Header ++ */ ++struct ud_hdr { ++ struct base_hdr hdr; ++ __be32 msg_id; ++ __be32 msg_length; ++ __be32 msg_offset; ++ u8 grh[40]; ++} __attribute__ ((packed)); ++ ++/* ++ * Send Header ++ */ ++struct send_hdr { ++ struct base_hdr hdr; ++ __be32 msg_id; ++ __be32 msg_length; ++ __be32 msg_offset; ++ __be32 immed_data; ++ __be32 rma_id; /* RMA */ ++ __be32 num_rma_addrs; /* 
RMA */ ++ struct rma_addr rma_addrs[0]; /* RMA */ ++} __attribute__ ((packed)); ++ ++/* ++ * RDMA Write Header ++ */ ++struct write_hdr { ++ struct base_hdr hdr; ++ __be64 rdma_address; ++ __be32 rdma_key; ++ __be32 immed_data; ++ __be32 msg_id; ++ __be32 rma_length; /* RMA */ ++ __be32 rma_id; /* RMA */ ++ __be32 num_rma_addrs; /* RMA */ ++ struct rma_addr rma_addrs[0]; /* RMA */ ++} __attribute__ ((packed)); ++ ++/* ++ * RDMA Read Request Header ++ */ ++struct read_req_hdr { ++ struct base_hdr hdr; ++ __be64 rdma_address; ++ __be32 rdma_key; ++ __be32 rdma_length; /* shared with RMA */ ++ __be32 rdma_id; /* shared with RMA */ ++ __be32 num_rma_addrs; /* RMA */ ++ struct rma_addr rma_addrs[0]; /* RMA */ ++} __attribute__ ((packed)); ++ ++/* ++ * RDMA Read Response Header ++ */ ++struct read_rsp_hdr { ++ struct base_hdr hdr; ++ __be32 rdma_offset; ++ __be32 rdma_id; ++} __attribute__ ((packed)); ++ ++ ++/* ++ * Atomic Compare and Swap Header ++ */ ++struct comp_swap_hdr { ++ struct base_hdr hdr; ++ __be64 atomic_address; ++ __be64 comp_data; ++ __be64 swap_data; ++ __be32 atomic_key; ++ __be32 atomic_id; ++ /* no pad needed */ ++} __attribute__ ((packed)); ++ ++ ++/* ++ * Atomic Fetch/Add Header ++ */ ++struct fetch_add_hdr { ++ struct base_hdr hdr; ++ __be64 atomic_address; ++ __be64 add_data; ++ __be32 atomic_key; ++ __be32 atomic_id; ++ /* no pad needed */ ++} __attribute__ ((packed)); ++ ++/* ++ * Atomic Response Header ++ */ ++struct atomic_rsp_hdr { ++ struct base_hdr hdr; ++ __be64 orig_data; ++ __be32 atomic_id; ++} __attribute__ ((packed)); ++ ++/* ++ * ACK Header ++ */ ++struct ack_hdr { ++ struct base_hdr hdr; ++} __attribute__ ((packed)); ++ ++/* ++ * Disconnect Header ++ */ ++struct disconnect_hdr { ++ struct base_hdr hdr; ++ __be32 reason; ++} __attribute__ ((packed)); ++ ++/* ++ * RMA Response Header ++ */ ++struct rma_rsp_hdr { ++ struct base_hdr hdr; ++ __be32 rma_id; ++ __be32 xfer_length; ++ __be32 error; ++} __attribute__ ((packed)); ++ ++/* ++ * MR Reg/Dereg Info Header ++ */ ++struct reg_hdr { ++ struct base_hdr hdr; ++ __be64 scif_offset; ++ __be64 address; ++ __be32 length; ++ __be32 rkey; ++ __be32 access; ++} __attribute__ ((packed)); ++ ++/* ++ * SCIF endpoint close notiffication ++ */ ++struct close_hdr { ++ struct base_hdr hdr; ++} __attribute__ ((packed)); ++ ++ ++#define IBSCIF_CM_REQ 1 ++#define IBSCIF_CM_REP 2 ++#define IBSCIF_CM_REJ 3 ++#define IBSCIF_CM_RTU 4 ++ ++/* ++ * RDMA CM Header ++ */ ++ ++struct cm_hdr { ++ struct base_hdr hdr; ++ __be64 req_ctx; ++ __be64 rep_ctx; ++ __be32 cmd; ++ __be32 port; ++ __be32 qpn; ++ __be32 status; ++ __be32 plen; ++ u8 pdata[0]; ++} __attribute__ ((packed)); ++ ++enum ibscif_reason { /* Set each value to simplify manual lookup. 
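++			      These codes also cross the wire in the disconnect
++			      header's reason field, so both sides of a
++			      connection must agree on the numbering.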
*/ ++ ++ /* Local Events */ ++ IBSCIF_REASON_USER_GENERATED = 0, ++ IBSCIF_REASON_CQ_COMPLETION = 1, ++ IBSCIF_REASON_NIC_FATAL = 2, ++ IBSCIF_REASON_NIC_REMOVED = 3, ++ ++ /* Disconnect Event */ ++ IBSCIF_REASON_DISCONNECT = 4, ++ ++ /* CQ Error */ ++ IBSCIF_REASON_CQ_OVERRUN = 5, ++ IBSCIF_REASON_CQ_FATAL = 6, ++ ++ /* QP Errors */ ++ IBSCIF_REASON_QP_SQ_ERROR = 7, ++ IBSCIF_REASON_QP_RQ_ERROR = 8, ++ IBSCIF_REASON_QP_DESTROYED = 9, ++ IBSCIF_REASON_QP_ERROR = 10, ++ IBSCIF_REASON_QP_FATAL = 11, ++ ++ /* Operation Errors */ ++ IBSCIF_REASON_INVALID_OPCODE = 12, ++ IBSCIF_REASON_INVALID_LENGTH = 13, ++ IBSCIF_REASON_INVALID_QP = 14, ++ IBSCIF_REASON_INVALID_MSG_ID = 15, ++ IBSCIF_REASON_INVALID_LKEY = 16, ++ IBSCIF_REASON_INVALID_RDMA_RKEY = 17, ++ IBSCIF_REASON_INVALID_RDMA_ID = 18, ++ IBSCIF_REASON_INVALID_ATOMIC_RKEY = 19, ++ IBSCIF_REASON_INVALID_ATOMIC_ID = 20, ++ IBSCIF_REASON_MAX_IR_EXCEEDED = 21, ++ IBSCIF_REASON_ACK_TIMEOUT = 22, ++ ++ /* Protection Errors */ ++ IBSCIF_REASON_PROTECTION_VIOLATION = 23, ++ IBSCIF_REASON_BOUNDS_VIOLATION = 24, ++ IBSCIF_REASON_ACCESS_VIOLATION = 25, ++ IBSCIF_REASON_WRAP_ERROR = 26 ++}; ++ ++union ibscif_pdu { ++ struct base_hdr hdr; ++ struct ud_hdr ud; ++ struct send_hdr send; ++ struct write_hdr write; ++ struct read_req_hdr read_req; ++ struct read_rsp_hdr read_rsp; ++ struct comp_swap_hdr comp_swap; ++ struct fetch_add_hdr fetch_add; ++ struct atomic_rsp_hdr atomic_rsp; ++ struct ack_hdr ack; ++ struct disconnect_hdr disconnect; ++ struct rma_rsp_hdr rma_rsp; ++ struct reg_hdr reg; ++ struct close_hdr close; ++ struct cm_hdr cm; ++}; ++ ++struct ibscif_full_frame { ++ union ibscif_pdu ibscif; ++}; ++ ++static inline int seq_before(u32 seq1, u32 seq2) ++{ ++ return (s32)(seq1 - seq2) < 0; ++} ++ ++static inline int seq_after(u32 seq1, u32 seq2) ++{ ++ return (s32)(seq2 - seq1) < 0; ++} ++ ++static inline int seq_between(u32 seq_target, u32 seq_low, u32 seq_high) ++{ ++ return seq_high - seq_low >= seq_target - seq_low; ++} ++ ++static inline u32 seq_window(u32 earlier, u32 later) ++{ ++ return earlier > later ? ((u32)~0 - earlier) + later : later - earlier; ++} ++ ++#define ibscif_tx_unacked_window(tx) seq_window((tx)->last_ack_seq_recvd, (tx)->next_seq - 1) ++ ++#define ibscif_rx_window(rx) seq_window((rx)->last_seq_acked, (rx)->last_in_seq) ++ ++#define ibscif_tx_window(tx) ((u32)window_size - ibscif_tx_unacked_window(tx)) ++ ++#endif /* IBSCIF_PROTOCOL_H */ +diff -urN a7/drivers/infiniband/hw/scif/ibscif_provider.c a8/drivers/infiniband/hw/scif/ibscif_provider.c +--- a7/drivers/infiniband/hw/scif/ibscif_provider.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_provider.c 2015-02-23 10:14:37.488809663 -0800 +@@ -0,0 +1,406 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. 
++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++static int ibscif_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) ++{ ++ memset(attr, 0, sizeof *attr); ++ ++ attr->vendor_id = VENDOR_ID; ++ attr->vendor_part_id = DEVICE_ID; ++ attr->hw_ver = HW_REV; ++ attr->fw_ver = FW_REV; ++ attr->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT; ++ attr->max_mr_size = MAX_MR_SIZE; ++ attr->page_size_cap = PAGE_SIZE; ++ attr->max_qp = MAX_QPS; ++ attr->max_qp_wr = MAX_QP_SIZE; ++ attr->max_sge = MAX_SGES; ++ attr->max_cq = MAX_CQS; ++ attr->max_cqe = MAX_CQ_SIZE; ++ attr->max_mr = MAX_MRS; ++ attr->max_pd = MAX_PDS; ++ attr->max_qp_rd_atom = MAX_IR>255 ? 255 : MAX_IR; ++ attr->max_qp_init_rd_atom = MAX_OR>255 ? 255 : MAX_OR; ++ attr->max_res_rd_atom = MAX_IR>255 ? 255 : MAX_IR; ++ attr->atomic_cap = IB_ATOMIC_HCA; ++ attr->sys_image_guid = ibdev->node_guid; ++ ++ return 0; ++} ++ ++static int ibscif_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr) ++{ ++ struct ibscif_dev *dev = to_dev(ibdev); ++ ++ memset(attr, 0, sizeof *attr); ++ ++ /* See IB Spec r1.2 Table 145 for physical port state values. */ ++ attr->lid = IBSCIF_NODE_ID_TO_LID(dev->node_id); ++ attr->sm_lid = 1; ++ attr->gid_tbl_len = 1; ++ attr->pkey_tbl_len = 1; ++ attr->max_msg_sz = MAX_MR_SIZE; ++ attr->phys_state = 5; /* LinkUp */ ++ attr->state = IB_PORT_ACTIVE; ++ attr->max_mtu = IB_MTU_4096; ++ attr->active_mtu = IB_MTU_4096; ++ attr->active_width = IB_WIDTH_4X; ++ attr->active_speed = 4; ++ attr->max_vl_num = 1; ++ attr->port_cap_flags = IB_PORT_SM_DISABLED; ++ ++ return 0; ++} ++ ++static int ibscif_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) ++{ ++ *pkey = 0xffff; /* IB_DEFAULT_PKEY_FULL */ ++ return 0; ++} ++ ++static int ibscif_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *ibgid) ++{ ++ struct ibscif_dev *dev = to_dev(ibdev); ++ ++ memcpy(ibgid, &dev->gid, sizeof(*ibgid)); ++ return 0; ++} ++ ++static struct ib_ucontext *ibscif_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) ++{ ++ struct ib_ucontext *context = kzalloc(sizeof *context, GFP_KERNEL); ++ return (!context) ? 
ERR_PTR(-ENOMEM) : context; ++} ++ ++static int ibscif_dealloc_ucontext(struct ib_ucontext *context) ++{ ++ kfree(context); ++ return 0; ++} ++ ++static void ibscif_generate_eui64(struct ibscif_dev *dev, u8 *eui64) ++{ ++ memcpy(eui64, dev->netdev->dev_addr, 3); ++ eui64[3] = 0xFF; ++ eui64[4] = 0xFE; ++ memcpy(eui64+5, dev->netdev->dev_addr+3, 3); ++} ++ ++static int ibscif_register_device(struct ibscif_dev *dev) ++{ ++ strncpy(dev->ibdev.node_desc, DRV_SIGNON, sizeof dev->ibdev.node_desc); ++ ibscif_generate_eui64(dev, (u8 *)&dev->ibdev.node_guid); ++ dev->ibdev.owner = THIS_MODULE; ++ dev->ibdev.uverbs_abi_ver = UVERBS_ABI_VER; ++ dev->ibdev.uverbs_cmd_mask = ++ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | ++ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | ++ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | ++ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | ++ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | ++ (1ull << IB_USER_VERBS_CMD_CREATE_AH) | ++ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | ++ (1ull << IB_USER_VERBS_CMD_CREATE_QP) | ++ (1ull << IB_USER_VERBS_CMD_QUERY_QP) | ++ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | ++ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | ++ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | ++ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | ++ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | ++ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | ++ (1ull << IB_USER_VERBS_CMD_POLL_CQ) | ++ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | ++ (1ull << IB_USER_VERBS_CMD_REG_MR) | ++ (1ull << IB_USER_VERBS_CMD_DEREG_MR) | ++ (1ull << IB_USER_VERBS_CMD_POST_SEND) | ++ (1ull << IB_USER_VERBS_CMD_POST_RECV); ++#if defined(MOFED) && !defined(MOFED_2_1) ++ dev->ibdev.node_type = new_ib_type ? RDMA_EXP_NODE_MIC : RDMA_NODE_RNIC; ++#else ++ dev->ibdev.node_type = new_ib_type ? RDMA_NODE_MIC : RDMA_NODE_RNIC; ++#endif ++ dev->ibdev.phys_port_cnt = 1; ++ ++ dev->ibdev.query_device = ibscif_query_device; // Mandatory ++ dev->ibdev.num_comp_vectors = 1; // Mandatory ++ dev->ibdev.query_port = ibscif_query_port; // Mandatory ++ dev->ibdev.query_pkey = ibscif_query_pkey; // Mandatory ++ dev->ibdev.query_gid = ibscif_query_gid; // Mandatory ++ dev->ibdev.alloc_ucontext = ibscif_alloc_ucontext; // Required ++ dev->ibdev.dealloc_ucontext = ibscif_dealloc_ucontext; // Required ++ dev->ibdev.alloc_pd = ibscif_alloc_pd; // Mandatory ++ dev->ibdev.dealloc_pd = ibscif_dealloc_pd; // Mandatory ++ dev->ibdev.create_ah = ibscif_create_ah; // Mandatory ++ dev->ibdev.destroy_ah = ibscif_destroy_ah; // Mandatory ++ dev->ibdev.create_qp = ibscif_create_qp; // Mandatory ++ dev->ibdev.query_qp = ibscif_query_qp; // Optional ++ dev->ibdev.modify_qp = ibscif_modify_qp; // Mandatory ++ dev->ibdev.destroy_qp = ibscif_destroy_qp; // Mandatory ++ dev->ibdev.create_cq = ibscif_create_cq; // Mandatory ++ dev->ibdev.resize_cq = ibscif_resize_cq; // Optional ++ dev->ibdev.destroy_cq = ibscif_destroy_cq; // Mandatory ++ dev->ibdev.poll_cq = ibscif_poll_cq; // Mandatory ++ dev->ibdev.req_notify_cq = ibscif_arm_cq; // Mandatory ++ dev->ibdev.get_dma_mr = ibscif_get_dma_mr; // Mandatory ++ dev->ibdev.reg_phys_mr = ibscif_reg_phys_mr; // Required ++ dev->ibdev.reg_user_mr = ibscif_reg_user_mr; // Required ++ dev->ibdev.dereg_mr = ibscif_dereg_mr; // Mandatory ++ dev->ibdev.post_send = ibscif_post_send; // Mandatory ++ dev->ibdev.post_recv = ibscif_post_receive; // Mandatory ++ dev->ibdev.dma_ops = &ibscif_dma_mapping_ops; // ?? 
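++	/*
++	 * Filling in an iw_cm_verbs table is what lets the kernel iWARP CM
++	 * (and, through it, the RDMA CM) drive connection setup for this
++	 * software device via the ibscif_cm_* handlers below.
++	 */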
++ ++ dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); ++ if (!dev->ibdev.iwcm) ++ return -ENOMEM; ++ ++ dev->ibdev.iwcm->connect = ibscif_cm_connect; ++ dev->ibdev.iwcm->accept = ibscif_cm_accept; ++ dev->ibdev.iwcm->reject = ibscif_cm_reject; ++ dev->ibdev.iwcm->create_listen = ibscif_cm_create_listen; ++ dev->ibdev.iwcm->destroy_listen = ibscif_cm_destroy_listen; ++ dev->ibdev.iwcm->add_ref = ibscif_cm_add_ref; ++ dev->ibdev.iwcm->rem_ref = ibscif_cm_rem_ref; ++ dev->ibdev.iwcm->get_qp = ibscif_cm_get_qp; ++ ++ return ib_register_device(&dev->ibdev, NULL); ++} ++ ++static void ibscif_dev_release(struct device *dev) ++{ ++ kfree(dev); ++} ++ ++/* ++ * Hold devlist_mutex during this call for synchronization as needed. ++ * Upon return, dev is invalid. ++ */ ++static void ibscif_remove_dev(struct ibscif_dev *dev) ++{ ++ struct ibscif_conn *conn, *next; ++ ++ if (dev->ibdev.reg_state == IB_DEV_REGISTERED) ++ ib_unregister_device(&dev->ibdev); ++ ++ WARN_ON(!list_empty(&dev->wq_list)); ++ ++ down(&devlist_mutex); ++ list_del(&dev->entry); ++ up(&devlist_mutex); ++ ++ ibscif_refresh_pollep_list(); ++ ++ down(&dev->mutex); ++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry) { ++ scif_close(conn->ep); ++ list_del(&conn->entry); ++ kfree(conn); ++ } ++ up(&dev->mutex); ++ ++ if (dev->listen_ep) ++ scif_close(dev->listen_ep); ++ ibscif_procfs_remove_dev(dev); ++ ++ dev_put(dev->netdev); ++ device_unregister(dev->ibdev.dma_device); ++ ib_dealloc_device(&dev->ibdev); ++} ++ ++static void ibscif_remove_one(struct net_device *netdev) ++{ ++ struct ibscif_dev *dev, *next; ++ ++ list_for_each_entry_safe(dev, next, &devlist, entry) { ++ if (netdev == dev->netdev) { ++ ibscif_remove_dev(dev); ++ break; ++ } ++ } ++} ++ ++static int node_cnt; ++static uint16_t node_ids[IBSCIF_MAX_DEVICES]; ++static uint16_t my_node_id; ++ ++static void ibscif_add_one(struct net_device *netdev) ++{ ++ static int dev_cnt; ++ static dma_addr_t dma_mask = -1; ++ struct ibscif_dev *dev; ++ int ret; ++ ++ dev = (struct ibscif_dev *)ib_alloc_device(sizeof *dev); ++ if (!dev) { ++ printk(KERN_ALERT PFX "%s: fail to allocate ib_device\n", __func__); ++ return; ++ } ++ ++ INIT_LIST_HEAD(&dev->conn_list); ++ INIT_LIST_HEAD(&dev->mr_list); ++ init_MUTEX(&dev->mr_list_mutex); ++ init_MUTEX(&dev->mutex); ++ spin_lock_init(&dev->atomic_op); ++ INIT_LIST_HEAD(&dev->wq_list); ++ atomic_set(&dev->available, 256); /* FIXME */ ++ ++ dev_hold(netdev); ++ dev->netdev = netdev; ++ ++ /* use the MAC address of the netdev as the GID so that RDMA CM can ++ * find the ibdev from the IP address associated with the netdev. 
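++ * (The CM's address resolution maps the destination IP to a net_device and
++ * then matches that device's MAC against the registered GIDs, which is why
++ * only the first ETH_ALEN bytes of the 16-byte GID are filled in below.)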
++ */ ++ memcpy(&dev->gid, dev->netdev->dev_addr, ETH_ALEN); ++ ++ dev->ibdev.dma_device = kzalloc(sizeof *dev->ibdev.dma_device, GFP_KERNEL); ++ if (!dev->ibdev.dma_device) { ++ printk(KERN_ALERT PFX "%s: fail to allocate dma_device\n", __func__); ++ goto out_free_ibdev; ++ } ++ ++ snprintf(dev->name, IBSCIF_NAME_SIZE, "scif_dma_%d", dev_cnt); ++ snprintf(dev->ibdev.name, IB_DEVICE_NAME_MAX, "scif%d", dev_cnt++); ++ dev->ibdev.dma_device->release = ibscif_dev_release; ++ dev->ibdev.dma_device->init_name = dev->name; ++ dev->ibdev.dma_device->dma_mask = &dma_mask; ++ ret = device_register(dev->ibdev.dma_device); ++ if (ret) { ++ printk(KERN_ALERT PFX "%s: fail to register dma_device, ret=%d\n", __func__, ret); ++ kfree(dev->ibdev.dma_device); ++ goto out_free_ibdev; ++ } ++ ++ /* Notice: set up listen ep before inserting to devlist */ ++ ++ dev->listen_ep = scif_open(); ++ if (!dev->listen_ep || IS_ERR(dev->listen_ep)) { ++ printk(KERN_ALERT PFX "%s: scif_open returns %ld\n", __func__, PTR_ERR(dev->listen_ep)); ++ goto out_unreg_dmadev ; ++ } ++ ++ ret = scif_get_nodeIDs( node_ids, IBSCIF_MAX_DEVICES, &my_node_id); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_get_nodeIDS returns %d\n", ++ __func__, ret); ++ goto out_close_ep; ++ } ++ ++ node_cnt = ret; ++ dev->node_id = my_node_id; ++ printk(KERN_ALERT PFX "%s: my node_id is %d\n", __func__, dev->node_id); ++ ++ ret = scif_bind(dev->listen_ep, SCIF_OFED_PORT_0); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_bind returns %d, port=%d\n", ++ __func__, ret, SCIF_OFED_PORT_0); ++ goto out_close_ep; ++ } ++ ++ ret = scif_listen(dev->listen_ep, IBSCIF_MAX_DEVICES); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_listen returns %d\n", __func__, ret); ++ goto out_close_ep; ++ } ++ ++ down(&devlist_mutex); ++ list_add_tail(&dev->entry, &devlist); ++ up(&devlist_mutex); ++ ++ if (ibscif_register_device(dev)) ++ ibscif_remove_dev(dev); ++ else ++ ibscif_procfs_add_dev(dev); ++ ++ ibscif_refresh_pollep_list(); ++ ++ return; ++ ++out_close_ep: ++ scif_close(dev->listen_ep); ++ ++out_unreg_dmadev: ++ device_unregister(dev->ibdev.dma_device); /* it will free the memory, too */ ++ ++out_free_ibdev: ++ ib_dealloc_device(&dev->ibdev); ++} ++ ++static int ibscif_notifier(struct notifier_block *nb, unsigned long event, void *ptr) ++{ ++ struct net_device *netdev = (struct net_device *)ptr; ++ ++ if (strcmp(netdev->name, "mic0")) ++ return NOTIFY_DONE; ++ ++ switch(event) { ++ case NETDEV_REGISTER: ++ ibscif_add_one(netdev); ++ ibscif_protocol_init_post(); ++ break; ++ ++ case NETDEV_UNREGISTER: ++ ibscif_remove_one(netdev); ++ break; ++ ++ default: ++ /* we only care about the MAC address, ignore other notifications */ ++ break; ++ } ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block ibscif_notifier_block = { ++ .notifier_call = ibscif_notifier, ++}; ++ ++int ibscif_dev_init(void) ++{ ++ int err = 0; ++ ++ ibscif_protocol_init_pre(); ++ ++ err = register_netdevice_notifier(&ibscif_notifier_block); ++ if (err) ++ ibscif_protocol_cleanup(); ++ ++ return err; ++} ++ ++void ibscif_dev_cleanup(void) ++{ ++ struct ibscif_dev *dev, *next; ++ ++ ibscif_protocol_cleanup(); ++ unregister_netdevice_notifier(&ibscif_notifier_block); ++ list_for_each_entry_safe(dev, next, &devlist, entry) ++ ibscif_remove_dev(dev); ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_qp.c a8/drivers/infiniband/hw/scif/ibscif_qp.c +--- a7/drivers/infiniband/hw/scif/ibscif_qp.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_qp.c 
2015-02-23 10:14:37.488809663 -0800 +@@ -0,0 +1,868 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++static struct ibscif_wr *ibscif_alloc_wr(struct ibscif_wq *wq, int new_size, int bytes) ++{ ++ if (new_size && (new_size != wq->size)) { ++ struct ibscif_wr *new_wr = vzalloc(bytes); ++ return new_wr ? new_wr : ERR_PTR(-ENOMEM); ++ } ++ return NULL; ++} ++ ++static void ibscif_move_wr(struct ibscif_wq *wq, struct ibscif_wr *new_wr, int new_size) ++{ ++ int i; ++ ++ if (wq->size == new_size) ++ return; ++ ++ for (i = 0; i < wq->depth; i++) { ++ memcpy(&new_wr[i], &wq->wr[wq->head], wq->wr_size); ++ wq->head = (wq->head + 1) % wq->size; ++ } ++ ++ if (wq->wr) { ++ vfree(wq->wr); ++ } ++ ++ wq->wr = new_wr; ++ wq->head = 0; ++ wq->tail = wq->depth; ++ wq->size = new_size; ++} ++ ++/* Caller must provide proper synchronization. */ ++static int ibscif_resize_qp(struct ibscif_qp *qp, int sq_size, int rq_size, int iq_size) ++{ ++ struct ibscif_wr *new_sq, *new_rq, *new_iq; ++ int sq_bytes, rq_bytes, iq_bytes; ++ int old_npages, new_npages, err; ++ ++ sq_bytes = PAGE_ALIGN(sq_size * qp->sq.wr_size); ++ rq_bytes = PAGE_ALIGN(rq_size * qp->rq.wr_size); ++ iq_bytes = PAGE_ALIGN(iq_size * qp->iq.wr_size); ++ ++ sq_size = sq_bytes / qp->sq.wr_size; ++ rq_size = rq_bytes / qp->rq.wr_size; ++ iq_size = iq_bytes / qp->iq.wr_size; ++ ++ if ((sq_size == qp->sq.size) && ++ (rq_size == qp->rq.size) && ++ (iq_size == qp->iq.size)) ++ return 0; ++ ++ if ((sq_size < qp->sq.depth) || ++ (rq_size < qp->rq.depth) || ++ (iq_size < qp->iq.depth)) ++ return -EINVAL; ++ ++ /* Calculate the number of new pages required for this allocation. 
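++   Only the delta against the pages already backing the three queues is
++   charged to the global quota; if the resize shrinks the queues, the
++   surplus is handed back via ibscif_release_quota() once the existing
++   work requests have been copied across.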
*/ ++ new_npages = (sq_bytes + rq_bytes + iq_bytes) >> PAGE_SHIFT; ++ old_npages = (PAGE_ALIGN(qp->sq.size * qp->sq.wr_size) + ++ PAGE_ALIGN(qp->rq.size * qp->rq.wr_size) + ++ PAGE_ALIGN(qp->iq.size * qp->iq.wr_size)) >> PAGE_SHIFT; ++ new_npages -= old_npages; ++ ++ if (new_npages > 0) { ++ err = ibscif_reserve_quota(&new_npages); ++ if (err) ++ return err; ++ } ++ ++ new_sq = ibscif_alloc_wr(&qp->sq, sq_size, sq_bytes); ++ new_rq = ibscif_alloc_wr(&qp->rq, rq_size, rq_bytes); ++ new_iq = ibscif_alloc_wr(&qp->iq, iq_size, iq_bytes); ++ if (IS_ERR(new_sq) || IS_ERR(new_rq) || IS_ERR(new_iq)) ++ goto out; ++ ++ ibscif_move_wr(&qp->sq, new_sq, sq_size); ++ ibscif_move_wr(&qp->rq, new_rq, rq_size); ++ ibscif_move_wr(&qp->iq, new_iq, iq_size); ++ ++ if (new_npages < 0) ++ ibscif_release_quota(-new_npages); ++ ++ return 0; ++out: ++ if (new_sq && !IS_ERR(new_sq)) ++ vfree(new_sq); ++ if (new_rq && !IS_ERR(new_rq)) ++ vfree(new_rq); ++ if (new_iq && !IS_ERR(new_iq)) ++ vfree(new_iq); ++ ++ return -ENOMEM; ++} ++ ++static int ibscif_init_wqs(struct ibscif_qp *qp, struct ib_qp_init_attr *attr) ++{ ++ spin_lock_init(&qp->sq.lock); ++ spin_lock_init(&qp->rq.lock); ++ spin_lock_init(&qp->iq.lock); ++ ++ qp->sq.qp = qp; ++ qp->rq.qp = qp; ++ qp->iq.qp = qp; ++ ++ qp->sq.wirestate = &qp->wire.sq; ++ qp->iq.wirestate = &qp->wire.iq; ++ ++ qp->sq.max_sge = attr->cap.max_send_sge; ++ qp->rq.max_sge = attr->cap.max_recv_sge; ++ qp->iq.max_sge = 1; ++ ++ qp->sq.wr_size = sizeof *qp->sq.wr + (sizeof *qp->sq.wr->ds_list * qp->sq.max_sge); ++ qp->rq.wr_size = sizeof *qp->rq.wr + (sizeof *qp->rq.wr->ds_list * qp->rq.max_sge); ++ qp->iq.wr_size = sizeof *qp->iq.wr + (sizeof *qp->iq.wr->ds_list * qp->iq.max_sge); ++ ++ return ibscif_resize_qp(qp, attr->cap.max_send_wr, attr->cap.max_recv_wr, (rma_threshold==0x7FFFFFFF)?0:attr->cap.max_send_wr); ++} ++ ++static void ibscif_reset_tx_state(struct ibscif_tx_state *tx) ++{ ++ tx->next_seq = 1; ++ tx->last_ack_seq_recvd = 0; ++ tx->next_msg_id = 0; ++} ++ ++static void ibscif_reset_rx_state(struct ibscif_rx_state *rx) ++{ ++ rx->last_in_seq = 0; ++ rx->last_seq_acked = 0; ++ rx->defer_in_process = 0; ++} ++ ++static void ibscif_reset_wirestate(struct ibscif_wirestate *wirestate) ++{ ++ ibscif_reset_tx_state(&wirestate->tx); ++ ibscif_reset_rx_state(&wirestate->rx); ++} ++ ++static void ibscif_reset_wire(struct ibscif_wire *wire) ++{ ++ ibscif_reset_wirestate(&wire->sq); ++ ibscif_reset_wirestate(&wire->iq); ++} ++ ++static void ibscif_init_wire(struct ibscif_wire *wire) ++{ ++ ibscif_reset_wire(wire); ++} ++ ++static void ibscif_query_qp_cap(struct ibscif_qp *qp, struct ib_qp_cap *cap) ++{ ++ memset(cap, 0, sizeof *cap); ++ cap->max_send_wr = qp->sq.size; ++ cap->max_recv_wr = qp->rq.size; ++ cap->max_send_sge = qp->sq.max_sge; ++ cap->max_recv_sge = qp->rq.max_sge; ++} ++ ++struct ib_qp *ibscif_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr, struct ib_udata *udata) ++{ ++ struct ibscif_dev *dev = to_dev(ibpd->device); ++ struct ibscif_qp *qp; ++ int err; ++ ++ if ((attr->qp_type != IB_QPT_RC && attr->qp_type != IB_QPT_UD) || ++ (attr->cap.max_send_wr > MAX_QP_SIZE) || ++ (attr->cap.max_recv_wr > MAX_QP_SIZE) || ++ (attr->cap.max_send_sge > MAX_SGES) || ++ (attr->cap.max_recv_sge > MAX_SGES) || ++ (attr->cap.max_send_wr && !attr->send_cq) || ++ (attr->cap.max_recv_wr && !attr->recv_cq)) ++ return ERR_PTR(-EINVAL); ++ ++ if (!atomic_add_unless(&dev->qp_cnt, 1, MAX_QPS)) ++ return ERR_PTR(-EAGAIN); ++ ++ qp = kzalloc(sizeof *qp, GFP_KERNEL); ++ if (!qp) { ++ 
atomic_dec(&dev->qp_cnt); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ qp->local_node_id = dev->node_id; ++ ++ kref_init(&qp->ref); ++ init_completion(&qp->done); ++ init_MUTEX(&qp->modify_mutex); ++ spin_lock_init(&qp->lock); ++ ibscif_init_wire(&qp->wire); ++ qp->sq_policy = attr->sq_sig_type; ++ qp->dev = dev; ++ qp->mtu = IBSCIF_MTU; /* FIXME */ ++ qp->state = QP_IDLE; ++ ++ err = ibscif_init_wqs(qp, attr); ++ if (err) ++ goto out; ++ ++ ibscif_query_qp_cap(qp, &attr->cap); ++ ++ err = ibscif_wiremap_add(qp, &qp->ibqp.qp_num); ++ if (err) ++ goto out; ++ ++ qp->magic = QP_MAGIC; ++ ++ ibscif_scheduler_add_qp(qp); ++ qp->in_scheduler = 1; ++ ++ return &qp->ibqp; ++out: ++ ibscif_destroy_qp(&qp->ibqp); ++ return ERR_PTR(err); ++} ++ ++static inline enum ib_qp_state to_ib_qp_state(enum ibscif_qp_state state) ++{ ++ switch (state) { ++ case QP_IDLE: return IB_QPS_INIT; ++ case QP_CONNECTED: return IB_QPS_RTS; ++ case QP_DISCONNECT: return IB_QPS_SQD; ++ case QP_ERROR: return IB_QPS_ERR; ++ case QP_RESET: return IB_QPS_RESET; ++ default: return -1; ++ } ++} ++ ++static inline enum ibscif_qp_state to_ibscif_qp_state(enum ib_qp_state state) ++{ ++ switch (state) { ++ case IB_QPS_INIT: return QP_IDLE; ++ case IB_QPS_RTS: return QP_CONNECTED; ++ case IB_QPS_SQD: return QP_DISCONNECT; ++ case IB_QPS_ERR: return QP_ERROR; ++ case IB_QPS_RESET: return QP_RESET; ++ case IB_QPS_RTR: return QP_IGNORE; ++ default: return -1; ++ } ++} ++ ++/* Caller must provide proper synchronization. */ ++static void __ibscif_query_qp(struct ibscif_qp *qp, struct ib_qp_attr *attr, struct ib_qp_init_attr *init_attr) ++{ ++ struct ib_qp_cap cap; ++ ++ ibscif_query_qp_cap(qp, &cap); ++ ++ if (attr) { ++ attr->qp_state = to_ib_qp_state(qp->state); ++ attr->cur_qp_state = attr->qp_state; ++ attr->port_num = 1; ++ attr->path_mtu = qp->mtu; ++ attr->dest_qp_num = qp->remote_qpn; ++ attr->qp_access_flags = qp->access; ++ attr->max_rd_atomic = qp->max_or; ++ attr->max_dest_rd_atomic = qp->iq.size; ++ attr->cap = cap; ++ } ++ ++ if (init_attr) { ++ init_attr->qp_type = qp->ibqp.qp_type; ++ init_attr->sq_sig_type = qp->sq_policy; ++ init_attr->cap = cap; ++ } ++} ++ ++int ibscif_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr) ++{ ++ struct ibscif_qp *qp = to_qp(ibqp); ++ ++ memset(attr, 0, sizeof *attr); ++ memset(init_attr, 0, sizeof *init_attr); ++ ++ spin_lock_bh(&qp->lock); ++ __ibscif_query_qp(qp, attr, init_attr); ++ spin_unlock_bh(&qp->lock); ++ ++ return 0; ++} ++ ++static int ibscif_flush_wq(struct ibscif_wq *wq, struct ibscif_cq *cq) ++{ ++ struct ibscif_wr *wr; ++ struct ibscif_wc *wc; ++ int i, num_wr, err; ++ ++ /* Prevent divide by zero traps on wrap math. */ ++ if (!wq->size) ++ return 0; ++ ++ spin_lock_bh(&wq->lock); ++ for (i = (wq->head + wq->completions) % wq->size, num_wr = 0; ++ wq->depth && (wq->completions != wq->depth); ++ i = (i + 1) % wq->size, num_wr++) { ++ ++ wr = ibscif_get_wr(wq, i); ++ ++ ibscif_clear_ds_refs(wr->ds_list, wr->num_ds); ++ ++ if (!cq) { ++ wq->completions++; ++ continue; ++ } ++ ++ err = ibscif_reserve_cqe(cq, &wc); ++ if (err) { ++ num_wr = err; ++ break; ++ } ++ ++ wc->ibwc.qp = &wq->qp->ibqp; ++ wc->ibwc.src_qp = wq->qp->remote_qpn; ++ wc->ibwc.wr_id = wr->id; ++ wc->ibwc.opcode = is_rq(wq) ? 
IB_WC_RECV : to_ib_wc_opcode(wr->opcode); ++ wc->ibwc.status = IB_WC_WR_FLUSH_ERR; ++ wc->ibwc.ex.imm_data = 0; ++ wc->ibwc.byte_len = 0; ++ wc->ibwc.port_num = 1; ++ ++ wc->wq = wq; ++ wc->reap = wq->reap + 1; ++ wq->reap = 0; ++ wq->completions++; ++ ++ ibscif_append_cqe(cq, wc, 0); ++ } ++ spin_unlock_bh(&wq->lock); ++ ++ if (num_wr && cq) ++ ibscif_notify_cq(cq); ++ ++ return num_wr; ++} ++ ++static void ibscif_flush_wqs(struct ibscif_qp *qp) ++{ ++ int ret; ++ ++ ret = ibscif_flush_wq(&qp->sq, to_cq(qp->ibqp.send_cq)); ++ if (ret) /* A clean SQ flush should have done nothing. */ ++ qp->state = QP_ERROR; ++ ++ ret = ibscif_flush_wq(&qp->rq, to_cq(qp->ibqp.recv_cq)); ++ if (ret < 0) ++ qp->state = QP_ERROR; ++ ++ ibscif_flush_wq(&qp->iq, NULL); ++} ++ ++static void ibscif_reset_wq(struct ibscif_wq *wq, struct ibscif_cq *cq) ++{ ++ ibscif_clear_cqes(cq, wq); ++ ++ wq->head = 0; ++ wq->tail = 0; ++ wq->depth = 0; ++ wq->reap = 0; ++ wq->next_wr = 0; ++ wq->next_msg_id = 0; ++ wq->completions = 0; ++} ++ ++static void ibscif_reset_wqs(struct ibscif_qp *qp) ++{ ++ ibscif_reset_wq(&qp->sq, to_cq(qp->ibqp.send_cq)); ++ ibscif_reset_wq(&qp->rq, to_cq(qp->ibqp.recv_cq)); ++ ibscif_reset_wq(&qp->iq, NULL); ++} ++ ++static void ibscif_qp_event(struct ibscif_qp *qp, enum ib_event_type event) ++{ ++ if (qp->ibqp.event_handler) { ++ struct ib_event record; ++ record.event = event; ++ record.device = qp->ibqp.device; ++ record.element.qp = &qp->ibqp; ++ qp->ibqp.event_handler(&record, qp->ibqp.qp_context); ++ } ++} ++ ++/* Caller must provide proper synchronization. */ ++static void ibscif_qp_error(struct ibscif_qp *qp) ++{ ++ if (qp->state == QP_ERROR) ++ return; ++ ++ if (qp->state == QP_CONNECTED) ++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT); ++ ++ qp->state = QP_ERROR; ++ ++ ibscif_flush_wqs(qp); ++ ++ ibscif_cm_async_callback(qp->cm_context); ++ qp->cm_context = NULL; ++ ++ /* don't generate the error event because transitioning to IB_QPS_ERR ++ state is normal when a QP is disconnected */ ++ ++ //ibscif_qp_event(qp, IB_EVENT_QP_FATAL); ++} ++ ++/* Caller must provide proper synchronization. */ ++static void ibscif_qp_reset(struct ibscif_qp *qp) ++{ ++ if (qp->state == QP_RESET) ++ return; ++ ++ if (qp->state == QP_CONNECTED) ++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT); ++ ++ ibscif_reset_wqs(qp); ++ ibscif_reset_wire(&qp->wire); ++ ++ ibscif_cm_async_callback(qp->cm_context); ++ qp->cm_context = NULL; ++ ++ qp->state = QP_RESET; ++} ++ ++/* Caller must provide proper synchronization. */ ++void ibscif_qp_idle(struct ibscif_qp *qp) ++{ ++ if (qp->state == QP_IDLE) ++ return; ++ ++ ibscif_reset_wqs(qp); ++ ibscif_reset_wire(&qp->wire); ++ ++ qp->state = QP_IDLE; ++} ++ ++/* Caller must provide proper synchronization. */ ++static void ibscif_qp_connect(struct ibscif_qp *qp, enum ibscif_qp_state cur_state) ++{ ++ if (cur_state == QP_CONNECTED) ++ return; ++ ++ qp->loopback = (qp->ibqp.qp_type != IB_QPT_UD) && !scif_loopback && (qp->local_node_id == qp->remote_node_id); ++ qp->conn = NULL; ++ ++ qp->state = QP_CONNECTED; ++} ++ ++/* Caller must provide proper synchronization. 
*/ ++static void ibscif_qp_local_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ if (qp->state != QP_CONNECTED) ++ return; ++ ++ if (reason != IBSCIF_REASON_DISCONNECT) ++ printk(KERN_NOTICE PFX "QP %u sending abnormal disconnect %d\n", ++ qp->ibqp.qp_num, reason); ++ ++ qp->state = QP_DISCONNECT; ++ ibscif_send_disconnect(qp, reason); ++ ++ ibscif_flush_wqs(qp); ++ ++ ibscif_cm_async_callback(qp->cm_context); ++ qp->cm_context = NULL; ++ ++ if (reason != IBSCIF_REASON_DISCONNECT) { ++ qp->state = QP_ERROR; ++ ibscif_qp_event(qp, IB_EVENT_QP_FATAL); ++ } else ++ ibscif_qp_idle(qp); ++} ++ ++void ibscif_qp_internal_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ spin_lock_bh(&qp->lock); ++ ibscif_qp_local_disconnect(qp, reason); ++ spin_unlock_bh(&qp->lock); ++} ++ ++void ibscif_qp_remote_disconnect(struct ibscif_qp *qp, enum ibscif_reason reason) ++{ ++ if (reason != IBSCIF_REASON_DISCONNECT) ++ printk(KERN_NOTICE PFX "QP %u received abnormal disconnect %d\n", ++ qp->ibqp.qp_num, reason); ++ ++ if (qp->loopback) { ++ /* ++ * Prevent simultaneous loopback QP disconnect deadlocks. ++ * This is no worse than dropping a disconnect packet. ++ */ ++ if (!spin_trylock_bh(&qp->lock)) ++ return; ++ } else ++ spin_lock_bh(&qp->lock); ++ ++ if (qp->state != QP_CONNECTED) { ++ spin_unlock_bh(&qp->lock); ++ return; ++ } ++ ++ ibscif_flush_wqs(qp); ++ ++ ibscif_cm_async_callback(qp->cm_context); ++ qp->cm_context = NULL; ++ ++ if (reason != IBSCIF_REASON_DISCONNECT) { ++ qp->state = QP_ERROR; ++ ibscif_qp_event(qp, IB_EVENT_QP_FATAL); ++ } else ++ qp->state = QP_IDLE; ++ ++ spin_unlock_bh(&qp->lock); ++} ++ ++#define MODIFY_ALLOWED 1 ++#define MODIFY_INVALID 0 ++#define VALID_TRANSITION(next_state, modify_allowed) { 1, modify_allowed }, ++#define INVAL_TRANSITION(next_state) { 0, MODIFY_INVALID }, ++#define START_STATE(current_state) { ++#define CEASE_STATE(current_state) }, ++ ++static const struct { ++ ++ int valid; ++ int modify_allowed; ++ ++} qp_transition[NR_QP_STATES][NR_QP_STATES] = { ++ ++ START_STATE(QP_IDLE) ++ VALID_TRANSITION( QP_IDLE, MODIFY_ALLOWED ) ++ VALID_TRANSITION( QP_CONNECTED, MODIFY_ALLOWED ) ++ INVAL_TRANSITION( QP_DISCONNECT ) ++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED ) ++ CEASE_STATE(QP_IDLE) ++ ++ START_STATE(QP_CONNECTED) ++ INVAL_TRANSITION( QP_IDLE ) ++ VALID_TRANSITION( QP_CONNECTED, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_DISCONNECT, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED ) ++ CEASE_STATE(QP_CONNECTED) ++ ++ START_STATE(QP_DISCONNECT) /* Automatic transition to IDLE */ ++ INVAL_TRANSITION( QP_IDLE ) ++ INVAL_TRANSITION( QP_CONNECTED ) ++ INVAL_TRANSITION( QP_DISCONNECT ) ++ INVAL_TRANSITION( QP_ERROR ) ++ INVAL_TRANSITION( QP_RESET ) ++ INVAL_TRANSITION( QP_IGNORE ) ++ CEASE_STATE(QP_DISCONNECT) ++ ++ START_STATE(QP_ERROR) ++ VALID_TRANSITION( QP_IDLE, MODIFY_INVALID ) ++ INVAL_TRANSITION( QP_CONNECTED ) ++ INVAL_TRANSITION( QP_DISCONNECT ) ++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_RESET, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED ) ++ CEASE_STATE(QP_ERROR) ++ ++ START_STATE(QP_RESET) ++ VALID_TRANSITION( QP_IDLE, MODIFY_ALLOWED ) ++ INVAL_TRANSITION( QP_CONNECTED ) ++ INVAL_TRANSITION( QP_DISCONNECT ) ++ VALID_TRANSITION( QP_ERROR, MODIFY_INVALID ) ++ 
VALID_TRANSITION( QP_RESET, MODIFY_INVALID ) ++ VALID_TRANSITION( QP_IGNORE, MODIFY_ALLOWED ) ++ CEASE_STATE(QP_RESET) ++}; ++ ++int ibscif_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) ++{ ++ struct ibscif_qp *qp = to_qp(ibqp); ++ enum ibscif_qp_state cur_state, new_state; ++ int sq_size, rq_size, max_or, max_ir; ++ int err = -EINVAL; ++ ++ /* ++ * Mutex prevents simultaneous user-mode QP modifies. ++ */ ++ down(&qp->modify_mutex); ++ ++ cur_state = qp->state; ++ ++ if ((attr_mask & IB_QP_CUR_STATE) && (to_ibscif_qp_state(attr->cur_qp_state) != cur_state)) ++ goto out; ++ if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > 1)) ++ goto out; ++ ++ /* Validate any state transition. */ ++ if (attr_mask & IB_QP_STATE) { ++ new_state = to_ibscif_qp_state(attr->qp_state); ++ if (new_state < 0 || new_state >= NR_QP_STATES) ++ goto out; ++ ++ if (!qp_transition[cur_state][new_state].valid) ++ goto out; ++ } else ++ new_state = cur_state; ++ ++ /* Validate any attribute modify request. */ ++ if (attr_mask & (IB_QP_AV | ++ IB_QP_CAP | ++ IB_QP_DEST_QPN | ++ IB_QP_ACCESS_FLAGS | ++ IB_QP_MAX_QP_RD_ATOMIC | ++ IB_QP_MAX_DEST_RD_ATOMIC)) { ++ ++ if (!qp_transition[cur_state][new_state].modify_allowed) ++ goto out; ++ ++ if ((attr_mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH) && check_grh) { ++ int remote_node_id = IBSCIF_LID_TO_NODE_ID(attr->ah_attr.dlid); ++ struct ibscif_conn *conn; ++ union ib_gid *dgid; ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: %d-->%d, DGID=%llx:%llx\n", ++ __func__, qp->local_node_id, remote_node_id, ++ __be64_to_cpu(attr->ah_attr.grh.dgid.global.subnet_prefix), ++ __be64_to_cpu(attr->ah_attr.grh.dgid.global.interface_id)); ++ ++ if (remote_node_id == qp->local_node_id) { ++ dgid = &qp->dev->gid; ++ } ++ else { ++ spin_lock(&qp->lock); ++ conn = ibscif_get_conn(qp->local_node_id, remote_node_id, 0); ++ spin_unlock(&qp->lock); ++ if (!conn) { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: failed to make SCIF connection %d-->%d.\n", ++ __func__, qp->local_node_id, remote_node_id); ++ goto out; ++ } ++ dgid = &conn->remote_gid; ++ ibscif_put_conn(conn); ++ } ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: local GID[%d]=%llx:%llx\n", ++ __func__, remote_node_id, ++ __be64_to_cpu(dgid->global.subnet_prefix), ++ __be64_to_cpu(dgid->global.interface_id)); ++ ++ if (memcmp(dgid, &attr->ah_attr.grh.dgid, sizeof(*dgid))) { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: connecting to DGID outside the box is unsupported.\n", ++ __func__); ++ goto out; ++ } ++ } ++ ++ if (attr_mask & IB_QP_CAP) { ++ sq_size = attr->cap.max_send_wr; ++ rq_size = attr->cap.max_recv_wr; ++ if ((sq_size > MAX_QP_SIZE) || (rq_size > MAX_QP_SIZE)) ++ goto out; ++ } else { ++ sq_size = qp->sq.size; ++ rq_size = qp->rq.size; ++ } ++ if ((sq_size && !qp->ibqp.send_cq) || (rq_size && !qp->ibqp.recv_cq)) ++ goto out; ++ ++ max_or = (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) ? ++ attr->max_rd_atomic : qp->max_or; ++ max_ir = (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ? ++ attr->max_dest_rd_atomic : qp->iq.size; ++ ++ if (rma_threshold<0x7FFFFFFF && max_ir>MAX_IR && max_ir>=qp->sq.size) ++ max_ir -= qp->sq.size; ++ ++ if ((max_or > MAX_OR) || (max_ir > MAX_IR)) ++ goto out; ++ ++ /* Validation successful; resize the QP as needed. */ ++ err = ibscif_resize_qp(qp, sq_size, rq_size, max_ir + ((rma_threshold==0x7FFFFFFFF)?0:sq_size)); ++ if (err) ++ goto out; ++ ++ /* No failure paths below the QP resize. 
*/ ++ ++ qp->max_or = max_or; ++ ++ if (attr_mask & IB_QP_ACCESS_FLAGS) ++ qp->access = attr->qp_access_flags; ++ ++ if (attr_mask & IB_QP_DEST_QPN) ++ qp->remote_qpn = attr->dest_qp_num; ++ ++ if (attr_mask & IB_QP_AV) ++ qp->remote_node_id = IBSCIF_LID_TO_NODE_ID(attr->ah_attr.dlid); ++ } ++ ++ err = 0; ++ if (attr_mask & IB_QP_STATE) { ++ ++ /* Perform state change processing. */ ++ spin_lock_bh(&qp->lock); ++ switch (new_state) { ++ case QP_IDLE: ++ ibscif_qp_idle(qp); ++ break; ++ case QP_CONNECTED: ++ ibscif_qp_connect(qp, cur_state); ++ break; ++ case QP_DISCONNECT: ++ ibscif_qp_local_disconnect(qp, IBSCIF_REASON_DISCONNECT); ++ break; ++ case QP_ERROR: ++ ibscif_qp_error(qp); ++ break; ++ case QP_RESET: ++ ibscif_qp_reset(qp); ++ break; ++ default: ++ break; ++ } ++ spin_unlock_bh(&qp->lock); ++ ++ /* scif_connect() can not be called with spin_lock_bh() held */ ++ if (ibqp->qp_type != IB_QPT_UD && new_state == QP_CONNECTED && !qp->loopback) { ++ int flag = (qp->ibqp.qp_num > qp->remote_qpn); ++ spin_lock(&qp->lock); ++ qp->conn = ibscif_get_conn( qp->local_node_id, qp->remote_node_id, flag ); ++ spin_unlock(&qp->lock); ++ } ++ } ++ ++ __ibscif_query_qp(qp, attr, NULL); ++out: ++ up(&qp->modify_mutex); ++ return err; ++} ++ ++void ibscif_complete_qp(struct kref *ref) ++{ ++ struct ibscif_qp *qp = container_of(ref, struct ibscif_qp, ref); ++ complete(&qp->done); ++} ++ ++int ibscif_destroy_qp(struct ib_qp *ibqp) ++{ ++ struct ibscif_qp *qp = to_qp(ibqp); ++ struct ibscif_dev *dev = qp->dev; ++ int i, j; ++ struct ibscif_conn *conn[IBSCIF_MAX_DEVICES]; ++ ++ if (qp->cm_context) { ++ ibscif_cm_async_callback(qp->cm_context); ++ qp->cm_context = NULL; ++ } ++ ++ if (ibqp->qp_num) ++ ibscif_wiremap_del(ibqp->qp_num); ++ ++ if (qp->in_scheduler) ++ ibscif_scheduler_remove_qp(qp); ++ ++ spin_lock_bh(&qp->lock); ++ if (qp->state == QP_CONNECTED) ++ ibscif_send_disconnect(qp, IBSCIF_REASON_DISCONNECT); ++ spin_unlock_bh(&qp->lock); ++ ++ ibscif_put_qp(qp); ++ wait_for_completion(&qp->done); ++ ++ ibscif_flush_wqs(qp); ++ ibscif_reset_wqs(qp); ++ ibscif_reset_wire(&qp->wire); ++ ++ if (qp->sq.wr) ++ vfree(qp->sq.wr); ++ if (qp->rq.wr) ++ vfree(qp->rq.wr); ++ if (qp->iq.wr) ++ vfree(qp->iq.wr); ++ ++ ibscif_release_quota((PAGE_ALIGN(qp->sq.size * qp->sq.wr_size) + ++ PAGE_ALIGN(qp->rq.size * qp->rq.wr_size) + ++ PAGE_ALIGN(qp->iq.size * qp->iq.wr_size)) >> PAGE_SHIFT); ++ ++ atomic_dec(&dev->qp_cnt); ++ ++ ibscif_put_conn(qp->conn); ++ ++ if (qp->ibqp.qp_type == IB_QPT_UD) { ++ spin_lock_bh(&qp->lock); ++ for (i=0, j=0; iud_conn[i]) { ++ conn[j++] = qp->ud_conn[i]; ++ qp->ud_conn[i] = NULL; ++ } ++ } ++ spin_unlock_bh(&qp->lock); ++ ++ /* ibscif_put_conn() may call scif_unregister(), should not hold a lock */ ++ for (i=0; iibqp.qp_type != IB_QPT_UD) ++ return; ++ ++ ++ spin_lock_bh(&qp->lock); ++ ++ for (i=0; iud_conn[i] == conn) ++ goto done; ++ } ++ ++ for (i=0; iud_conn[i] == NULL) { ++ atomic_inc(&conn->refcnt); ++ qp->ud_conn[i] = conn; ++ break; ++ } ++ } ++done: ++ spin_unlock_bh(&qp->lock); ++} ++ +diff -urN a7/drivers/infiniband/hw/scif/ibscif_scheduler.c a8/drivers/infiniband/hw/scif/ibscif_scheduler.c +--- a7/drivers/infiniband/hw/scif/ibscif_scheduler.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_scheduler.c 2015-02-23 10:14:37.488809663 -0800 +@@ -0,0 +1,195 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. 
You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++static int ibscif_schedule_tx(struct ibscif_wq *wq, int max_send) ++{ ++ struct ibscif_tx_state *tx = &wq->wirestate->tx; ++ struct ibscif_qp *qp = wq->qp; ++ struct ibscif_wr *wr; ++ int index, sent = 0; ++ ++ while ((wq->next_wr != wq->tail) && ibscif_tx_window(tx) && max_send) { ++ ++ index = wq->next_wr; ++ wr = ibscif_get_wr(wq, index); ++ ++ /* ++ * Ack processing can reschedule a WR that is in retry; only process ++ * it if we are all caught up. Also, do not start a fenced WR until ++ * all prior RDMA read and atomic operations have completed. ++ */ ++ if ((wr->flags & IB_SEND_FENCE) && atomic_read(&qp->or_depth) && ++ (wr->state == WR_WAITING)) ++ break; ++ ++ switch (wr->opcode) { ++ case WR_RDMA_READ: ++ case WR_ATOMIC_CMP_AND_SWP: ++ case WR_ATOMIC_FETCH_AND_ADD: ++ /* Throttle IQ stream requests if needed. */ ++ if (wr->state == WR_WAITING) { ++ if (atomic_read(&qp->or_depth) == qp->max_or) ++ return 0; ++ atomic_inc(&qp->or_depth); ++ } ++ /* Fall through. */ ++ case WR_SEND: ++ case WR_SEND_WITH_IMM: ++ case WR_RDMA_WRITE: ++ case WR_RDMA_WRITE_WITH_IMM: ++ case WR_RDMA_READ_RSP: ++ case WR_ATOMIC_RSP: ++ case WR_RMA_RSP: ++ sent = ibscif_xmit_wr(wq, wr, min((u32)max_send, ibscif_tx_window(tx)), ++ 0, tx->next_seq, &tx->next_seq); ++ break; ++ case WR_UD: ++ sent = ibscif_xmit_wr(wq, wr, min((u32)max_send, ibscif_tx_window(tx)), ++ 0, 0, NULL); ++ break; ++ default: ++ printk(KERN_ERR PFX "%s() botch: found opcode %d on work queue\n", ++ __func__, wr->opcode); ++ return -EOPNOTSUPP; ++ } ++ ++ /* If an IQ stream request did not get started we need to back off or_depth. */ ++ if ((wr->state == WR_WAITING) && ++ ((wr->opcode == WR_RDMA_READ) || ++ (wr->opcode == WR_ATOMIC_CMP_AND_SWP) || (wr->opcode == WR_ATOMIC_FETCH_AND_ADD))) ++ atomic_dec(&qp->or_depth); ++ ++ if (sent < 0) ++ return sent; ++ ++ max_send -= sent; ++ ++ /* ++ * The tx engine bumps next_wr when finished sending a whole WR. ++ * Bail if it didn't this time around. ++ */ ++ if (wq->next_wr == index) ++ break; ++ } ++ ++ return 0; ++} ++ ++static int ibscif_schedule_wq(struct ibscif_wq *wq) ++{ ++ int max_send, err = 0; ++ int need_call_sq_completions = 0; ++ ++ /* Ignore loopback QPs that may be scheduled by retry processing. 
*/ ++ if (wq->qp->loopback) ++ return 0; ++ ++ if (!(max_send = atomic_read(&wq->qp->dev->available))) ++ return -EBUSY; ++ ++ spin_lock(&wq->lock); ++ err = ibscif_schedule_tx(wq, max_send); ++ need_call_sq_completions = wq->fast_rdma_completions; ++ wq->fast_rdma_completions = 0; ++ spin_unlock(&wq->lock); ++ ++ if (unlikely(err)) ++ ibscif_qp_internal_disconnect(wq->qp, IBSCIF_REASON_QP_FATAL); ++ ++ if (fast_rdma && need_call_sq_completions) ++ ibscif_process_sq_completions(wq->qp); ++ ++ return err; ++} ++ ++void ibscif_schedule(struct ibscif_wq *wq) ++{ ++ struct ibscif_dev *dev; ++ struct list_head processed; ++ ++ if (wq->qp->loopback) { ++ ibscif_loopback(wq); ++ return; ++ } ++ dev = wq->qp->dev; ++ ++ if (!ibscif_schedule_wq(wq)) ++ goto out; ++ ++ while (atomic_xchg(&dev->was_new, 0)) { ++ /* Bail if the device is busy. */ ++ if (down_trylock(&dev->mutex)) ++ goto out; ++ ++ /* ++ * Schedule each WQ on the device and move it to the processed list. ++ * When complete, append the processed list to the device WQ list. ++ */ ++ INIT_LIST_HEAD(&processed); ++ while (!list_empty(&dev->wq_list)) { ++ wq = list_entry(dev->wq_list.next, typeof(*wq), entry); ++ if (!ibscif_schedule_wq(wq)) { ++ DEV_STAT(dev, sched_exhaust++); ++ list_splice(&processed, dev->wq_list.prev); ++ up(&dev->mutex); ++ goto out; ++ } ++ list_move_tail(&wq->entry, &processed); ++ } ++ list_splice(&processed, dev->wq_list.prev); ++ ++ up(&dev->mutex); ++ } ++ return; ++out: ++ atomic_inc(&dev->was_new); ++} ++ ++void ibscif_scheduler_add_qp(struct ibscif_qp *qp) ++{ ++ struct ibscif_dev *dev = qp->dev; ++ ++ down(&dev->mutex); ++ list_add_tail(&qp->sq.entry, &dev->wq_list); ++ list_add_tail(&qp->iq.entry, &dev->wq_list); ++ up(&dev->mutex); ++} ++ ++void ibscif_scheduler_remove_qp(struct ibscif_qp *qp) ++{ ++ struct ibscif_dev *dev = qp->dev; ++ ++ down(&dev->mutex); ++ list_del(&qp->sq.entry); ++ list_del(&qp->iq.entry); ++ up(&dev->mutex); ++} +diff -urN a7/drivers/infiniband/hw/scif/ibscif_util.c a8/drivers/infiniband/hw/scif/ibscif_util.c +--- a7/drivers/infiniband/hw/scif/ibscif_util.c 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/ibscif_util.c 2015-02-23 10:14:37.488809663 -0800 +@@ -0,0 +1,623 @@ ++/* ++ * Copyright (c) 2008 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the ++ * GNU General Public License (GPL) Version 2, available from the ++ * file COPYING in the main directory of this source tree, or the ++ * OpenFabrics.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include "ibscif_driver.h" ++ ++#define IBSCIF_CONN_IDLE 0 ++#define IBSCIF_CONN_REQ_SENT 1 ++#define IBSCIF_CONN_REQ_RCVD 2 ++#define IBSCIF_CONN_ESTABLISHED 3 ++#define IBSCIF_CONN_ACTIVE 4 ++ ++DEFINE_SPINLOCK(conn_state_lock); ++static int conn_state[IBSCIF_MAX_DEVICES][IBSCIF_MAX_DEVICES]; ++ ++#define IBSCIF_CONN_REP 1 ++#define IBSCIF_CONN_REJ 2 ++#define IBSCIF_CONN_ERR 3 ++ ++struct ibscif_conn_resp { ++ int cmd; ++ union ib_gid gid; ++}; ++ ++void ibscif_do_accept(struct ibscif_dev *dev) ++{ ++ struct scif_portID peer; ++ scif_epd_t ep; ++ struct ibscif_conn *conn; ++ int ret; ++ struct ibscif_conn_resp resp; ++ int resp_size; ++ ++ if (check_grh) ++ resp_size = sizeof(resp); ++ else ++ resp_size = sizeof(int); ++ ++ ret = scif_accept(dev->listen_ep, &peer, &ep, SCIF_ACCEPT_SYNC); ++ if (ret) { ++ printk(KERN_ALERT PFX "%s: scif_accept returns %ld\n", __func__, PTR_ERR(ep)); ++ return; ++ } ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: %d<--%d\n", __func__, dev->node_id, peer.node); ++ ++ if (check_grh) ++ memcpy(&resp.gid, &dev->gid, sizeof(resp.gid)); ++ ++ spin_lock(&conn_state_lock); ++ switch (conn_state[dev->node_id][peer.node]) { ++ case IBSCIF_CONN_IDLE: ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD; ++ resp.cmd = IBSCIF_CONN_REP; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: no double connection, accepting\n", __func__); ++ break; ++ ++ case IBSCIF_CONN_REQ_SENT: ++ /* A connection request has been sent, but no response yet. Node id is used to ++ * break the tie when both side send the connection request. One side is allowed ++ * to accept the request and its own request will be rejected by the peer. 
++ */ ++ if (dev->node_id > peer.node) { ++ resp.cmd = IBSCIF_CONN_REJ; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: double connection, rejecting (peer will accept)\n", __func__); ++ } ++ else if (dev->node_id == peer.node) { ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD; ++ resp.cmd = IBSCIF_CONN_REP; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: loopback connection, accepting\n", __func__); ++ } ++ else { ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_REQ_RCVD; ++ resp.cmd = IBSCIF_CONN_REP; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: double connection, accepting (peer will reject)\n", __func__); ++ } ++ break; ++ ++ case IBSCIF_CONN_REQ_RCVD: ++ if (verbose) ++ printk(KERN_INFO PFX "%s: duplicated connection request, rejecting\n", __func__); ++ resp.cmd = IBSCIF_CONN_REJ; ++ break; ++ ++ case IBSCIF_CONN_ESTABLISHED: ++ case IBSCIF_CONN_ACTIVE: ++ if (verbose) ++ printk(KERN_INFO PFX "%s: already connected, rejecting\n", __func__); ++ resp.cmd = IBSCIF_CONN_REJ; ++ break; ++ ++ default: ++ if (verbose) ++ printk(KERN_INFO PFX "%s: invalid state: %d\n", __func__, conn_state[dev->node_id][peer.node]); ++ resp.cmd = IBSCIF_CONN_ERR; ++ break; ++ } ++ spin_unlock(&conn_state_lock); ++ ++ ret = scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_send returns %d\n", __func__, ret); ++ scif_close(ep); ++ return; ++ } ++ ++ if (resp.cmd != IBSCIF_CONN_REP) { ++ /* one additional hand shaking to prevent the previous send from being trashed by ep closing */ ++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK); ++ scif_close(ep); ++ return; ++ } ++ ++ if (check_grh) { ++ ret = scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_recv returns %d\n", __func__, ret); ++ scif_close(ep); ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_IDLE; ++ spin_unlock(&conn_state_lock); ++ return; ++ } ++ } ++ ++ conn = kzalloc(sizeof (*conn), GFP_KERNEL); ++ if (!conn) { ++ printk(KERN_ALERT PFX "%s: cannot allocate connection context.\n", __func__); ++ scif_close(ep); ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_IDLE; ++ spin_unlock(&conn_state_lock); ++ return; ++ } ++ ++ conn->ep = ep; ++ conn->remote_node_id = peer.node; ++ if (check_grh) ++ memcpy(&conn->remote_gid, &resp.gid, sizeof(conn->remote_gid)); ++ conn->dev = dev; ++ atomic_set(&conn->refcnt, 0); ++ ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_ESTABLISHED; ++ spin_unlock(&conn_state_lock); ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: connection established. 
ep=%p\n", __func__, ep); ++ ++ ibscif_refresh_mreg(conn); ++ ++ /* one addition sync to ensure the MRs are registered with the new ep at both side */ ++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK); ++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK); ++ ++ list_add(&conn->entry, &dev->conn_list); ++ ibscif_refresh_pollep_list(); ++ ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][peer.node] = IBSCIF_CONN_ACTIVE; ++ spin_unlock(&conn_state_lock); ++} ++ ++struct ibscif_conn *ibscif_do_connect(struct ibscif_dev *dev, int remote_node_id) ++{ ++ struct scif_portID dest; ++ struct ibscif_conn *conn = NULL; ++ int ret; ++ scif_epd_t ep; ++ struct ibscif_conn_resp resp; ++ union ib_gid peer_gid; ++ int resp_size; ++ ++ if (check_grh) ++ resp_size = sizeof(resp); ++ else ++ resp_size = sizeof(int); ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: %d-->%d\n", __func__, dev->node_id, remote_node_id); ++ ++ /* Validate remote_node_id for conn_state array check */ ++ if ((remote_node_id < 0) || (remote_node_id >= IBSCIF_MAX_DEVICES)) ++ return ERR_PTR(-EINVAL); ++ ++ spin_lock(&conn_state_lock); ++ if (conn_state[dev->node_id][remote_node_id] != IBSCIF_CONN_IDLE) { ++ spin_unlock(&conn_state_lock); ++ if (verbose) ++ printk(KERN_INFO PFX "%s: connection already in progress, retry\n", __func__); ++ return ERR_PTR(-EAGAIN); ++ } ++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_REQ_SENT; ++ spin_unlock(&conn_state_lock); ++ ++ ep = scif_open(); ++ if (!ep) /* SCIF API semantics */ ++ goto out_state; ++ ++ if (IS_ERR(ep)) /* SCIF emulator semantics */ ++ goto out_state; ++ ++ dest.node = remote_node_id; ++ dest.port = SCIF_OFED_PORT_0; ++ ++ ret = scif_connect(ep, &dest); ++ if (ret < 0) ++ goto out_close; ++ ++ /* Now ret is the port number ep is bound to */ ++ ++ ret = scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_recv returns %d\n", __func__, ret); ++ goto out_close; ++ } ++ ++ if (resp.cmd != IBSCIF_CONN_REP) { ++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK); ++ /* the peer has issued the connection request */ ++ if (resp.cmd == IBSCIF_CONN_REJ) { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: rejected by peer due to double connection\n", __func__); ++ scif_close(ep); ++ /* don't reset the state becasue it's used for checking connection state */ ++ return ERR_PTR(-EAGAIN); ++ } ++ else { ++ if (verbose) ++ printk(KERN_INFO PFX "%s: rejected by peer due to invalid state\n", __func__); ++ goto out_close; ++ } ++ } ++ ++ if (check_grh) { ++ memcpy(&peer_gid, &resp.gid, sizeof(peer_gid)); ++ memcpy(&resp.gid, &dev->gid, sizeof(resp.gid)); ++ ret = scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK); ++ if (ret < 0) { ++ printk(KERN_ALERT PFX "%s: scif_send returns %d\n", __func__, ret); ++ goto out_close; ++ } ++ } ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: connection established. 
ep=%p\n", __func__, ep); ++ ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_ESTABLISHED; ++ spin_unlock(&conn_state_lock); ++ ++ conn = kzalloc(sizeof *conn, GFP_KERNEL); ++ if (!conn) { ++ printk(KERN_ALERT PFX "%s: failed to allocate connection object\n", __func__); ++ goto out_close; ++ } ++ ++ conn->ep = ep; ++ conn->remote_node_id = remote_node_id; ++ if (check_grh) ++ memcpy(&conn->remote_gid, &peer_gid, sizeof(conn->remote_gid)); ++ conn->dev = dev; ++ atomic_set(&conn->refcnt, 0); ++ ++ ibscif_refresh_mreg(conn); ++ ++ /* one addition sync to ensure the MRs are registered with the new ep at both side */ ++ scif_send(ep, &resp, resp_size, SCIF_SEND_BLOCK); ++ scif_recv(ep, &resp, resp_size, SCIF_RECV_BLOCK); ++ ++ list_add_tail(&conn->entry, &dev->conn_list); ++ ibscif_refresh_pollep_list(); ++ ++ spin_lock(&conn_state_lock); ++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_ACTIVE; ++ spin_unlock(&conn_state_lock); ++ ++ return conn; ++ ++out_close: ++ scif_close(ep); ++ ++out_state: ++ spin_lock(&conn_state_lock); ++ if (conn_state[dev->node_id][remote_node_id] == IBSCIF_CONN_REQ_SENT) ++ conn_state[dev->node_id][remote_node_id] = IBSCIF_CONN_IDLE; ++ spin_unlock(&conn_state_lock); ++ return conn; ++} ++ ++struct ibscif_conn *ibscif_get_conn(int node_id, int remote_node_id, int find_local_peer) ++{ ++ struct ibscif_dev *cur, *next, *dev = NULL; ++ struct ibscif_conn *conn, *conn1, *conn2; ++ int done=0, err=0, connect_tried=0; ++ ++ down(&devlist_mutex); ++ list_for_each_entry_safe(cur, next, &devlist, entry) { ++ if (cur->node_id == node_id) { ++ dev = cur; ++ break; ++ } ++ } ++ up(&devlist_mutex); ++ ++ if (!dev) ++ return NULL; ++ ++again: ++ conn1 = NULL; ++ conn2 = NULL; ++ down(&dev->mutex); ++ list_for_each_entry(conn, &dev->conn_list, entry) ++ { ++ if (conn->remote_node_id == remote_node_id) { ++ if (node_id == remote_node_id) { ++ if (!conn1) { ++ conn1 = conn; ++ continue; ++ } ++ else { ++ conn2 = conn; ++ break; ++ } ++ } ++ up(&dev->mutex); ++ atomic_inc(&conn->refcnt); ++ if (conn->local_close) { ++ conn->local_close = 0; ++ ibscif_send_reopen(conn); ++ } ++ return conn; ++ } ++ } ++ up(&dev->mutex); ++ ++ /* for loopback connections, we must wait for both endpoints be in the list to ensure that ++ * different endpoints are assigned to the two sides ++ */ ++ if (node_id == remote_node_id) { ++ if (conn1 && conn2) { ++ conn = find_local_peer ? 
conn2 : conn1; ++ atomic_inc(&conn->refcnt); ++ if (conn->local_close) { ++ conn->local_close = 0; ++ ibscif_send_reopen(conn); ++ } ++ return conn; ++ } ++ else if (conn1) { ++ schedule(); ++ goto again; ++ } ++ } ++ ++ if (connect_tried) { ++ printk(KERN_ALERT PFX "%s: ERROR: cannot get connection (%d-->%d) after waiting, state=%d\n", ++ __func__, dev->node_id, remote_node_id, err-1); ++ return NULL; ++ } ++ ++ conn = ibscif_do_connect(dev, remote_node_id); ++ ++ /* If a connection is in progress, wait for its finish */ ++ if (conn == ERR_PTR(-EAGAIN)) { ++ while (!done && !err) { ++ spin_lock(&conn_state_lock); ++ switch (conn_state[node_id][remote_node_id]) { ++ case IBSCIF_CONN_REQ_SENT: ++ case IBSCIF_CONN_REQ_RCVD: ++ case IBSCIF_CONN_ESTABLISHED: ++ break; ++ case IBSCIF_CONN_ACTIVE: ++ done = 1; ++ break; ++ default: ++ err = 1 + conn_state[node_id][remote_node_id]; ++ break; ++ } ++ spin_unlock(&conn_state_lock); ++ schedule(); ++ } ++ } ++ ++ connect_tried = 1; ++ goto again; ++} ++ ++void ibscif_put_conn(struct ibscif_conn *conn) ++{ ++ if (!conn) ++ return; ++ ++ if (atomic_dec_and_test(&conn->refcnt)) { ++ // printk(KERN_INFO PFX "%s: local_close, conn=%p, remote_close=%d\n", __func__, conn, conn->remote_close); ++ ibscif_send_close(conn); ++ conn->local_close = 1; ++ } ++} ++ ++void ibscif_get_pollep_list(struct scif_pollepd *polleps, ++ struct ibscif_dev **devs, int *types, struct ibscif_conn **conns, int *count) ++{ ++ struct ibscif_dev *dev; ++ struct ibscif_conn *conn; ++ int i = 0; ++ int max = *count; ++ ++ down(&devlist_mutex); ++ list_for_each_entry(dev, &devlist, entry) { ++ if (i >= max) ++ break; ++ ++ polleps[i].epd = dev->listen_ep; ++ polleps[i].events = POLLIN; ++ polleps[i].revents = 0; ++ devs[i] = dev; ++ types[i] = IBSCIF_EP_TYPE_LISTEN; ++ conns[i] = NULL; ++ i++; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: ep=%p (%d:listen)\n", __func__, dev->listen_ep, dev->node_id); ++ ++ down(&dev->mutex); ++ list_for_each_entry(conn, &dev->conn_list, entry) ++ { ++ if (i >= max) ++ break; ++ polleps[i].epd = conn->ep; ++ polleps[i].events = POLLIN; ++ polleps[i].revents = 0; ++ devs[i] = dev; ++ types[i] = IBSCIF_EP_TYPE_COMM; ++ conns[i] = conn; ++ i++; ++ if (verbose) ++ printk(KERN_INFO PFX "%s: ep=%p (%d<--->%d)\n", __func__, conn->ep, dev->node_id, conn->remote_node_id); ++ } ++ up(&dev->mutex); ++ } ++ up(&devlist_mutex); ++ ++ if (verbose) ++ printk(KERN_INFO PFX "%s: count=%d\n", __func__, i); ++ *count = i; ++} ++ ++void ibscif_get_ep_list(scif_epd_t *eps, int *count) ++{ ++ struct ibscif_dev *dev; ++ struct ibscif_conn *conn; ++ int i = 0; ++ int max = *count; ++ ++ down(&devlist_mutex); ++ list_for_each_entry(dev, &devlist, entry) { ++ if (i >= max) ++ break; ++ ++ down(&dev->mutex); ++ list_for_each_entry(conn, &dev->conn_list, entry) ++ { ++ if (i >= max) ++ break; ++ eps[i] = conn->ep; ++ i++; ++ } ++ up(&dev->mutex); ++ } ++ up(&devlist_mutex); ++ ++ *count = i; ++} ++ ++void ibscif_remove_ep(struct ibscif_dev *dev, scif_epd_t ep) ++{ ++ struct ibscif_conn *conn, *next; ++ down(&dev->mutex); ++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry) ++ { ++ if (conn->ep == ep) { ++ spin_lock(&conn_state_lock); ++ conn_state[conn->dev->node_id][conn->remote_node_id] = IBSCIF_CONN_IDLE; ++ spin_unlock(&conn_state_lock); ++ list_del(&conn->entry); ++ } ++ } ++ up(&dev->mutex); ++} ++ ++ ++void ibscif_free_conn(struct ibscif_conn *conn) ++{ ++ scif_close(conn->ep); ++ kfree(conn); ++} ++ ++int ibscif_cleanup_idle_conn(void) ++{ ++ struct ibscif_dev 
*dev; ++ struct ibscif_conn *conn, *next; ++ struct ibscif_conn *idle_conns[IBSCIF_MAX_DEVICES]; ++ int i, n=0; ++ ++ down(&devlist_mutex); ++ list_for_each_entry(dev, &devlist, entry) { ++ down(&dev->mutex); ++ list_for_each_entry_safe(conn, next, &dev->conn_list, entry) ++ { ++ if (conn->local_close && conn->remote_close) { ++ spin_lock(&conn_state_lock); ++ conn_state[conn->dev->node_id][conn->remote_node_id] = IBSCIF_CONN_IDLE; ++ spin_unlock(&conn_state_lock); ++ list_del(&conn->entry); ++ idle_conns[n++] = conn; ++ } ++ } ++ up(&dev->mutex); ++ } ++ up(&devlist_mutex); ++ ++ for (i=0; i<n; i++) ++ ibscif_free_conn(idle_conns[i]); ++ ++ return n; ++} ++ ++#include <linux/time.h> ++ ++static uint32_t ibscif_time_passed(void) ++{ ++ static int first = 1; ++ static struct timeval t0; ++ static struct timeval t; ++ uint32_t usec; ++ ++ if (first) { ++ do_gettimeofday(&t0); ++ first = 0; ++ return 0; ++ } ++ ++ do_gettimeofday(&t); ++ usec = (t.tv_sec - t0.tv_sec) * 1000000UL; ++ if (t.tv_usec >= t0.tv_usec) ++ usec += (t.tv_usec - t0.tv_usec); ++ else ++ usec -= (t0.tv_usec - t.tv_usec); ++ ++ t0 = t; ++ return usec; ++} ++ ++#define IBSCIF_PERF_MAX_SAMPLES 100 ++#define IBSCIF_PERF_MAX_COUNTERS 10 ++ ++void ibscif_perf_sample(int counter, int next) ++{ ++ static uint32_t T[IBSCIF_PERF_MAX_SAMPLES][IBSCIF_PERF_MAX_COUNTERS]; ++ static int T_idx=0; ++ int i, j, sum; ++ ++ if (counter>=0 && counter<IBSCIF_PERF_MAX_COUNTERS) ++ T[T_idx][counter] = ibscif_time_passed(); ++ ++ if (next && ++T_idx >= IBSCIF_PERF_MAX_SAMPLES) { ++ T_idx = 0; ++ for (i=0; i<IBSCIF_PERF_MAX_SAMPLES; i++) { ++ sum = 0; ++ for (j=1; j<IBSCIF_PERF_MAX_COUNTERS; j++) { ++ if (T[i][j] > 0) ++ sum += T[i][j]; ++ } ++ printk("SUM(T1..T%d)=%u\n", IBSCIF_PERF_MAX_COUNTERS-1, sum); ++ } ++ } ++} ++ +diff -urN a7/drivers/infiniband/hw/scif/Kconfig a8/drivers/infiniband/hw/scif/Kconfig +--- a7/drivers/infiniband/hw/scif/Kconfig 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/Kconfig 2015-02-23 10:14:37.489809663 -0800 +@@ -0,0 +1,4 @@ ++config INFINIBAND_SCIF ++ tristate "SCIF RDMA driver support" ++ ---help--- ++ RDMA over SCIF driver.
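
The ibscif_do_accept()/ibscif_do_connect() pair in ibscif_util.c above resolves simultaneous connection attempts by comparing SCIF node IDs: when both sides are in IBSCIF_CONN_REQ_SENT, the higher-numbered node rejects the incoming request and waits for its own request to be accepted, the lower-numbered node accepts, and equal IDs (loopback) always accept. The stand-alone sketch below only illustrates that tie-break rule; it is not part of the patch, and the enum and function names are invented for the example.

    #include <stdio.h>

    /* Stands in for IBSCIF_CONN_REP / IBSCIF_CONN_REJ; values are illustrative. */
    enum conn_resp { CONN_REP = 1, CONN_REJ = 2 };

    /*
     * Tie-break used when both peers have a connection request outstanding:
     *   local > peer  -> reject the incoming request, keep our own
     *   local < peer  -> accept the incoming request (the peer will reject ours)
     *   local == peer -> loopback, always accept
     */
    static enum conn_resp resolve_cross_connect(int local_node, int peer_node)
    {
    	if (local_node > peer_node)
    		return CONN_REJ;
    	return CONN_REP;
    }

    int main(void)
    {
    	printf("2 vs 1 -> %s\n", resolve_cross_connect(2, 1) == CONN_REJ ? "reject" : "accept");
    	printf("1 vs 2 -> %s\n", resolve_cross_connect(1, 2) == CONN_REJ ? "reject" : "accept");
    	printf("3 vs 3 -> %s\n", resolve_cross_connect(3, 3) == CONN_REJ ? "reject" : "accept");
    	return 0;
    }

Because either side may initiate, both endpoints could otherwise end up holding two half-open SCIF endpoints for the same node pair; the node-ID comparison guarantees that exactly one connection survives, which is what the conn_state[][] table is tracking.
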
+diff -urN a7/drivers/infiniband/hw/scif/Makefile a8/drivers/infiniband/hw/scif/Makefile +--- a7/drivers/infiniband/hw/scif/Makefile 1969-12-31 16:00:00.000000000 -0800 ++++ a8/drivers/infiniband/hw/scif/Makefile 2015-02-23 10:14:37.489809663 -0800 +@@ -0,0 +1,41 @@ ++ifneq ($(KERNELRELEASE),) ++ ++# Original Make begins ++ ++obj-$(CONFIG_INFINIBAND_SCIF) += ibscif.o ++ ++ibscif-y := ibscif_main.o \ ++ ibscif_ah.o \ ++ ibscif_pd.o \ ++ ibscif_cq.o \ ++ ibscif_qp.o \ ++ ibscif_mr.o \ ++ ibscif_cm.o \ ++ ibscif_post.o \ ++ ibscif_procfs.o \ ++ ibscif_loopback.o \ ++ ibscif_provider.o \ ++ ibscif_protocol.o \ ++ ibscif_scheduler.o \ ++ ibscif_util.o ++ ++# Original Makefile ends ++ ++else ++ ++ifeq ($(KVER),) ++ ifeq ($(KDIR),) ++ KDIR := /lib/modules/$(shell uname -r)/build ++ endif ++else ++ KDIR := /lib/modules/$(KVER)/build ++endif ++ ++all: ++ $(MAKE) -C $(KDIR) SUBDIRS=$(shell pwd) CONFIG_INFINIBAND_SCIF=m ++ ++clean: ++ rm -rf *.o *.ko *.mod.c .*.cmd Module.* .tmp_versions ++ ++endif ++ diff --git a/tech-preview/xeon-phi/0009-update-drivers-infiniband-s-Kconfig-and-Makefile-to-.patch b/tech-preview/xeon-phi/0009-update-drivers-infiniband-s-Kconfig-and-Makefile-to-.patch new file mode 100644 index 0000000..bacf14f --- /dev/null +++ b/tech-preview/xeon-phi/0009-update-drivers-infiniband-s-Kconfig-and-Makefile-to-.patch @@ -0,0 +1,37 @@ +From 4f27d323bd47563f40a663672a331c5b2c95138e Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Tue, 4 Feb 2014 12:25:45 -0800 +Subject: [PATCH 09/12] update drivers/infiniband's Kconfig and Makefile to + allow compilation of CCL-Direct (ibp) + +Signed-off-by: Phil Cayton +--- +diff -urN a8/drivers/infiniband/hw/Makefile a9/drivers/infiniband/hw/Makefile +--- a8/drivers/infiniband/hw/Makefile 2015-01-05 15:04:13.993463721 -0800 ++++ a9/drivers/infiniband/hw/Makefile 2015-01-05 15:09:10.056451249 -0800 +@@ -10,3 +10,4 @@ + obj-$(CONFIG_INFINIBAND_NES) += nes/ + obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ + obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ ++obj-$(CONFIG_INFINIBAND_SCIF) += scif/ +diff -urN a8/drivers/infiniband/Kconfig a9/drivers/infiniband/Kconfig +--- a8/drivers/infiniband/Kconfig 2015-01-05 15:04:14.001463720 -0800 ++++ a9/drivers/infiniband/Kconfig 2015-01-05 15:07:03.176456594 -0800 +@@ -55,6 +55,9 @@ + source "drivers/infiniband/hw/nes/Kconfig" + source "drivers/infiniband/hw/ocrdma/Kconfig" + source "drivers/infiniband/hw/usnic/Kconfig" ++source "drivers/infiniband/hw/scif/Kconfig" ++ ++source "drivers/infiniband/ibp/Kconfig" + + source "drivers/infiniband/ulp/ipoib/Kconfig" + +diff -urN a8/drivers/infiniband/Makefile a9/drivers/infiniband/Makefile +--- a8/drivers/infiniband/Makefile 2015-01-05 15:04:14.001463720 -0800 ++++ a9/drivers/infiniband/Makefile 2015-01-05 15:08:25.112453143 -0800 +@@ -1,3 +1,4 @@ + obj-$(CONFIG_INFINIBAND) += core/ + obj-$(CONFIG_INFINIBAND) += hw/ + obj-$(CONFIG_INFINIBAND) += ulp/ ++obj-$(CONFIG_IBP_SERVER) += ibp/ diff --git a/tech-preview/xeon-phi/0010-Update-qib-for-XEON-PHI-support.patch b/tech-preview/xeon-phi/0010-Update-qib-for-XEON-PHI-support.patch new file mode 100644 index 0000000..09a8ba0 --- /dev/null +++ b/tech-preview/xeon-phi/0010-Update-qib-for-XEON-PHI-support.patch @@ -0,0 +1,2783 @@ +IB/qib: Update qib for XEON PHI support + +From: Jubin John + +Reviewed-by: Mike Marciniszyn +Signed-off-by: Jubin John +--- +diff -urN a9/drivers/infiniband/hw/qib/Makefile a10/drivers/infiniband/hw/qib/Makefile +--- a9/drivers/infiniband/hw/qib/Makefile 2015-01-05 15:05:04.280461602 -0800 ++++ 
a10/drivers/infiniband/hw/qib/Makefile 2015-01-05 15:10:58.250446692 -0800 +@@ -14,3 +14,8 @@ + ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o + ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o + ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o ++ ++ifeq ($(CONFIG_INFINIBAND_SCIF),m) ++ib_qib-y += qib_knx.o ++ccflags-y += -DQIB_CONFIG_KNX ++endif +diff -urN a9/drivers/infiniband/hw/qib/qib_common.h a10/drivers/infiniband/hw/qib/qib_common.h +--- a9/drivers/infiniband/hw/qib/qib_common.h 2015-01-05 15:05:04.281461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_common.h 2015-01-05 15:10:58.250446692 -0800 +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. +@@ -337,8 +338,12 @@ + * Should be set to QIB_USER_SWVERSION. + */ + __u32 spu_userversion; +- ++#ifdef QIB_CONFIG_KNX ++ __u16 spu_knx_node_id; ++ __u16 _spu_unused2; ++#else + __u32 _spu_unused2; ++#endif + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; +diff -urN a9/drivers/infiniband/hw/qib/qib_file_ops.c a10/drivers/infiniband/hw/qib/qib_file_ops.c +--- a9/drivers/infiniband/hw/qib/qib_file_ops.c 2015-01-05 15:05:04.280461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_file_ops.c 2015-01-05 15:10:58.251446692 -0800 +@@ -48,6 +48,7 @@ + #include "qib.h" + #include "qib_common.h" + #include "qib_user_sdma.h" ++#include "qib_knx.h" + + #undef pr_fmt + #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt +@@ -59,6 +60,9 @@ + unsigned long, loff_t); + static unsigned int qib_poll(struct file *, struct poll_table_struct *); + static int qib_mmapf(struct file *, struct vm_area_struct *); ++static int subctxt_search_ctxts(struct qib_devdata *, struct file *, ++ const struct qib_user_info *); ++ + + static const struct file_operations qib_file_ops = { + .owner = THIS_MODULE, +@@ -89,6 +93,64 @@ + return paddr; + } + ++#ifdef QIB_CONFIG_KNX ++/* ++ * Fills in only a few of the fields in the qib_base_info structure so the ++ * module on the KNX size can allocate all necessary memories locally. ++ */ ++static int qib_get_early_base_info(struct file *fp, void __user *ubase, ++ size_t ubase_size) { ++ struct qib_ctxtdata *rcd = ctxt_fp(fp); ++ int ret = 0; ++ struct qib_devdata *dd = rcd->dd; ++ struct qib_base_info *kinfo = NULL; ++ size_t sz; ++ int local_node = (numa_node_id() == pcibus_to_node(dd->pcidev->bus)); ++ ++ sz = sizeof(*kinfo); ++ if (!rcd->subctxt_cnt) ++ sz -= 7 * sizeof(u64); ++ if (ubase_size < sz) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); ++ if (kinfo == NULL) { ++ ret = -ENOMEM; ++ goto bail; ++ } ++ ++ ret = dd->f_get_base_info(rcd, kinfo); ++ if (ret < 0) ++ goto bail_free; ++ ++ if (rcd->subctxt_cnt && !subctxt_fp(fp)) ++ kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER; ++ ++ kinfo->spi_unit = dd->unit; ++ kinfo->spi_port = rcd->ppd->port; ++ kinfo->spi_ctxt = rcd->ctxt; ++ kinfo->spi_subctxt = subctxt_fp(fp); ++ kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt; ++ kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize; ++ kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize; ++ kinfo->spi_rcv_egrbuftotlen = ++ rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size; ++ kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk; ++ kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / ++ rcd->rcvegrbuf_chunks; ++ ++ sz = (ubase_size < sizeof(*kinfo)) ? 
ubase_size : sizeof(*kinfo); ++ if (copy_to_user(ubase, kinfo, sz)) ++ ret = -EFAULT; ++bail_free: ++ kfree(kinfo); ++bail: ++ return ret; ++} ++#endif ++ + static int qib_get_base_info(struct file *fp, void __user *ubase, + size_t ubase_size) + { +@@ -177,14 +239,43 @@ + */ + kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys; ++ /* ++ * In the case of KNX, qib_do_user_init() would call into the ++ * KNX-specific memory allocation/registration functions. These ++ * functions will write the registered memory offsets in the ++ * qib_base_info structure. Those are the addresses that need to be ++ * handled to user level. ++ */ ++ kinfo->spi_uregbase = knx_node_fp(fp) ? ++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_UREG, fp) : ++ (u64) dd->uregbase + dd->ureg_align * rcd->ctxt; ++ ++ if (knx_node_fp(fp)) ++ kinfo->spi_runtime_flags = ++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_FLAGS, fp); + kinfo->spi_rhf_offset = dd->rhf_offset; + kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys; +- kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys; ++ ++ /* see comment for spi_uregbase above */ ++ if (knx_node_fp(fp)) ++ kinfo->spi_pioavailaddr = ++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_PIOAVAIL, fp); ++ else ++ kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys; ++ + /* setup per-unit (not port) status area for user programs */ +- kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + +- (char *) ppd->statusp - +- (char *) dd->pioavailregs_dma; +- kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt; ++ kinfo->spi_status = (knx_node_fp(fp) ? ++ qib_knx_ctxt_info( ++ rcd, QIB_KNX_CTXTINFO_STATUS, fp) : ++ (u64) dd->pioavailregs_phys) + ++ (char *) ppd->statusp - (char *) dd->pioavailregs_dma; ++ ++ /* ++ * Do not set spi_piobufbase to KNX offset here as it is used in ++ * PIO index calculations below. For KNX contexts, the value of ++ * spi_piobufbase is not the physical address but the offset of ++ * the registered memory. ++ */ + if (!shared) { + kinfo->spi_piocnt = rcd->piocnt; + kinfo->spi_piobufbase = (u64) rcd->piobufs; +@@ -204,7 +295,11 @@ + dd->palign * kinfo->spi_piocnt * slave; + } + +- if (shared) { ++ /* ++ * In the case of KNX contexts, shared context memory is setup and ++ * handled on the the KNX. ++ */ ++ if (shared && !knx_node_fp(fp)) { + kinfo->spi_sendbuf_status = + cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]); + /* only spi_subctxt_* fields should be set in this block! */ +@@ -225,6 +320,11 @@ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) / + dd->palign; + kinfo->spi_pioalign = dd->palign; ++ /* Update spi_piobufbase after all calculations are done. */ ++ if (knx_node_fp(fp)) ++ kinfo->spi_piobufbase = ++ qib_knx_ctxt_info(rcd, QIB_KNX_CTXTINFO_PIOBUFBASE, fp); ++ + kinfo->spi_qpair = QIB_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can +@@ -1261,6 +1361,17 @@ + goto bail; + } + ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) ++ /* ++ * When setting up a context for a KNX process, setup of ++ * the subcontexts memory is done on the KNX side and ++ * mapped into user level. Therefore, the host driver never ++ * has to worry about it unless we are setting up a context ++ * on the host. 
++ */ ++ goto no_subctxt_mem; ++#endif + rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts); + if (!rcd->subctxt_uregbase) { + ret = -ENOMEM; +@@ -1283,6 +1394,9 @@ + goto bail_rhdr; + } + ++#ifdef QIB_CONFIG_KNX ++no_subctxt_mem: ++#endif + rcd->subctxt_cnt = uinfo->spu_subctxt_cnt; + rcd->subctxt_id = uinfo->spu_subctxt_id; + rcd->active_slaves = 1; +@@ -1317,6 +1431,14 @@ + + rcd = qib_create_ctxtdata(ppd, ctxt, numa_id); + ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) ++ /* ++ * Skip allocation of page pointer list for TID ++ * receives. This will be done on the KNX. ++ */ ++ goto no_page_list; ++#endif + /* + * Allocate memory for use in qib_tid_update() at open to + * reduce cost of expected send setup per message segment +@@ -1332,7 +1454,11 @@ + ret = -ENOMEM; + goto bailerr; + } ++#ifdef QIB_CONFIG_KNX ++no_page_list: ++#endif + rcd->userversion = uinfo->spu_userversion; ++ + ret = init_subctxts(dd, rcd, uinfo); + if (ret) + goto bailerr; +@@ -1489,43 +1615,68 @@ + static int find_shared_ctxt(struct file *fp, + const struct qib_user_info *uinfo) + { +- int devmax, ndev, i; ++ int devmax, ndev; + int ret = 0; ++ struct qib_devdata *dd; + ++#ifdef QIB_CONFIG_KNX ++ /* ++ * In the case we are allocating a context for a KNX process, ++ * Don't loop over all devices but use the one assosiated with the ++ * requesting KNX. ++ */ ++ if (uinfo->spu_knx_node_id) { ++ dd = qib_knx_node_to_dd(uinfo->spu_knx_node_id); ++ if (dd && dd->num_knx) ++ ret = subctxt_search_ctxts(dd, fp, uinfo); ++ goto done; ++ } ++#endif + devmax = qib_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { +- struct qib_devdata *dd = qib_lookup(ndev); +- ++ dd = qib_lookup(ndev); + /* device portion of usable() */ + if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase)) + continue; +- for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { +- struct qib_ctxtdata *rcd = dd->rcd[i]; ++ ret = subctxt_search_ctxts(dd, fp, uinfo); ++ if (ret) ++ break; ++ } ++#ifdef QIB_CONFIG_KNX ++done: ++#endif ++ return ret; ++} + +- /* Skip ctxts which are not yet open */ +- if (!rcd || !rcd->cnt) +- continue; +- /* Skip ctxt if it doesn't match the requested one */ +- if (rcd->subctxt_id != uinfo->spu_subctxt_id) +- continue; +- /* Verify the sharing process matches the master */ +- if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || +- rcd->userversion != uinfo->spu_userversion || +- rcd->cnt >= rcd->subctxt_cnt) { +- ret = -EINVAL; +- goto done; +- } +- ctxt_fp(fp) = rcd; +- subctxt_fp(fp) = rcd->cnt++; +- rcd->subpid[subctxt_fp(fp)] = current->pid; +- tidcursor_fp(fp) = 0; +- rcd->active_slaves |= 1 << subctxt_fp(fp); +- ret = 1; ++static int subctxt_search_ctxts(struct qib_devdata *dd, struct file *fp, ++ const struct qib_user_info *uinfo) ++{ ++ int ret = 0, i; ++ for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) { ++ struct qib_ctxtdata *rcd = dd->rcd[i]; ++ ++ /* Skip ctxts which are not yet open */ ++ if (!rcd || !rcd->cnt) ++ continue; ++ /* Skip ctxt if it doesn't match the requested one */ ++ if (rcd->subctxt_id != uinfo->spu_subctxt_id) ++ continue; ++ /* Verify the sharing process matches the master */ ++ if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt || ++ rcd->userversion != uinfo->spu_userversion || ++ rcd->cnt >= rcd->subctxt_cnt) { ++ ret = -EINVAL; + goto done; + } ++ ctxt_fp(fp) = rcd; ++ subctxt_fp(fp) = rcd->cnt++; ++ rcd->subpid[subctxt_fp(fp)] = current->pid; ++ tidcursor_fp(fp) = 0; ++ rcd->active_slaves |= 1 << subctxt_fp(fp); ++ ret = 1; ++ break; + } +- + done: + 
return ret; + } +@@ -1617,6 +1768,13 @@ + + if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT) + alg = uinfo->spu_port_alg; ++#ifdef QIB_CONFIG_KNX ++ /* Make sure we have a connection to the KNX module on the right node */ ++ if (uinfo->spu_knx_node_id && !qib_knx_get(uinfo->spu_knx_node_id)) { ++ ret = -ENODEV; ++ goto done; ++ } ++#endif + + mutex_lock(&qib_mutex); + +@@ -1624,13 +1782,38 @@ + uinfo->spu_subctxt_cnt) { + ret = find_shared_ctxt(fp, uinfo); + if (ret > 0) { +- ret = do_qib_user_sdma_queue_create(fp); ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) { ++ ret = qib_knx_sdma_queue_create(fp); ++ } else ++#endif ++ ret = do_qib_user_sdma_queue_create(fp); + if (!ret) + assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd); + goto done_ok; + } + } + ++#ifdef QIB_CONFIG_KNX ++ /* ++ * If there is a KNX node set, we pick the device that is ++ * associate with that KNX node ++ */ ++ if (uinfo->spu_knx_node_id) { ++ struct qib_devdata *dd = ++ qib_knx_node_to_dd(uinfo->spu_knx_node_id); ++ if (dd) { ++ ret = find_free_ctxt(dd->unit, fp, uinfo); ++ if (!ret) ++ ret = qib_knx_alloc_ctxt( ++ uinfo->spu_knx_node_id, ++ ctxt_fp(fp)->ctxt); ++ } else ++ ret = -ENXIO; ++ goto done_chk_sdma; ++ } ++ ++#endif + i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE; + if (i_minor) + ret = find_free_ctxt(i_minor - 1, fp, uinfo); +@@ -1639,7 +1822,6 @@ + const unsigned int cpu = cpumask_first(¤t->cpus_allowed); + const unsigned int weight = + cpumask_weight(¤t->cpus_allowed); +- + if (weight == 1 && !test_bit(cpu, qib_cpulist)) + if (!find_hca(cpu, &unit) && unit >= 0) + if (!find_free_ctxt(unit, fp, uinfo)) { +@@ -1650,9 +1832,21 @@ + } + + done_chk_sdma: +- if (!ret) ++ if (!ret) { ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) { ++ ret = qib_knx_sdma_queue_create(fp); ++ /*if (!ret) ++ ret = qib_knx_setup_tidrcv(fp);*/ ++ goto done_ok; ++ } ++#endif + ret = do_qib_user_sdma_queue_create(fp); ++ } + done_ok: ++#ifdef QIB_CONFIG_KNX ++ knx_node_fp(fp) = uinfo->spu_knx_node_id; ++#endif + mutex_unlock(&qib_mutex); + + done: +@@ -1667,11 +1861,25 @@ + struct qib_ctxtdata *rcd = ctxt_fp(fp); + struct qib_devdata *dd; + unsigned uctxt; ++#ifdef QIB_CONFIG_KNX ++ struct qib_base_info *base_info = NULL; ++ void __user *ubase = (void __user *)(unsigned long) ++ uinfo->spu_base_info; ++#endif + + /* Subctxts don't need to initialize anything since master did it. */ + if (subctxt_fp(fp)) { + ret = wait_event_interruptible(rcd->wait, + !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag)); ++#ifdef QIB_CONFIG_KNX ++ /* ++ * Subctxt pio buffers need to be registered after the ++ * master has set everything up. ++ */ ++ if (uinfo->spu_knx_node_id) ++ ret = qib_knx_setup_piobufs(rcd->dd, rcd, ++ subctxt_fp(fp)); ++#endif + goto bail; + } + +@@ -1722,6 +1930,41 @@ + */ + dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP); + ++#ifdef QIB_CONFIG_KNX ++ if (uinfo->spu_knx_node_id) { ++ /* ++ * When setting up rcvhdr Q and eager buffers for a KNX, the ++ * memory comes from the KNX side encoded in the qib_base_info ++ * structure. 
++ */ ++ if (uinfo->spu_base_info_size < (sizeof(*base_info) - ++ 7 * sizeof(u64))) { ++ ret = -EINVAL; ++ goto bail_pio; ++ } ++ base_info = kzalloc(sizeof(*base_info), GFP_KERNEL); ++ if (!base_info) { ++ ret = -ENOMEM; ++ goto bail_pio; ++ } ++ if (copy_from_user(base_info, ubase, ++ uinfo->spu_base_info_size)) { ++ ret = -EFAULT; ++ goto bail_pio; ++ } ++ ret = qib_knx_setup_piobufs(dd, rcd, subctxt_fp(fp)); ++ if (ret) ++ goto cont_init; ++ ret = qib_knx_setup_pioregs(dd, rcd, base_info); ++ if (ret) ++ goto cont_init; ++ ret = qib_knx_create_rcvhdrq(dd, rcd, base_info); ++ if (ret) ++ goto cont_init; ++ ret = qib_knx_setup_eagerbufs(rcd, base_info); ++ goto cont_init; ++ } ++#endif /* QIB_CONFIG_KNX */ + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If rcd->ctxt > chip-supported, +@@ -1731,6 +1974,9 @@ + ret = qib_create_rcvhdrq(dd, rcd); + if (!ret) + ret = qib_setup_eagerbufs(rcd); ++#ifdef QIB_CONFIG_KNX ++cont_init: ++#endif + if (ret) + goto bail_pio; + +@@ -1828,6 +2074,13 @@ + + /* drain user sdma queue */ + if (fd->pq) { ++#ifdef QIB_CONFIG_KNX ++ /* ++ * The thread should be stopped first before attempting ++ * to clean the queue. ++ */ ++ qib_knx_sdma_queue_destroy(fd); ++#endif + qib_user_sdma_queue_drain(rcd->ppd, fd->pq); + qib_user_sdma_queue_destroy(fd->pq); + } +@@ -1885,6 +2138,12 @@ + } + + mutex_unlock(&qib_mutex); ++#ifdef QIB_CONFIG_KNX ++ if (fd->knx_node_id) { ++ qib_knx_free_ctxtdata(dd, rcd); ++ goto bail; ++ } ++#endif + qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */ + + bail: +@@ -2170,6 +2429,13 @@ + ret = qib_assign_ctxt(fp, &cmd.cmd.user_info); + if (ret) + goto bail; ++#ifdef QIB_CONFIG_KNX ++ if (cmd.cmd.user_info.spu_knx_node_id) ++ ret = qib_get_early_base_info( ++ fp, (void __user *) (unsigned long) ++ cmd.cmd.user_info.spu_base_info, ++ cmd.cmd.user_info.spu_base_info_size); ++#endif + break; + + case QIB_CMD_USER_INIT: +diff -urN a9/drivers/infiniband/hw/qib/qib.h a10/drivers/infiniband/hw/qib/qib.h +--- a9/drivers/infiniband/hw/qib/qib.h 2015-01-05 15:05:04.280461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib.h 2015-01-05 15:10:58.250446692 -0800 +@@ -234,6 +234,10 @@ + u32 lookaside_qpn; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; ++#ifdef QIB_CONFIG_KNX ++ /* KNX Receive Context Data */ ++ struct qib_knx_ctxt *krcd; ++#endif + #ifdef CONFIG_DEBUG_FS + /* verbs stats per CTX */ + struct qib_opcode_stats_perctx *opstats; +@@ -1106,6 +1110,11 @@ + struct kthread_worker *worker; + + int assigned_node_id; /* NUMA node closest to HCA */ ++ ++#ifdef QIB_CONFIG_KNX ++ /* number of KNx nodes using this device */ ++ u16 num_knx; ++#endif + }; + + /* hol_state values */ +@@ -1134,6 +1143,9 @@ + unsigned tidcursor; + struct qib_user_sdma_queue *pq; + int rec_cpu_num; /* for cpu affinity; -1 if none */ ++#ifdef QIB_CONFIG_KNX ++ u16 knx_node_id; ++#endif + }; + + extern struct list_head qib_dev_list; +@@ -1211,6 +1223,13 @@ + (((struct qib_filedata *)(fp)->private_data)->tidcursor) + #define user_sdma_queue_fp(fp) \ + (((struct qib_filedata *)(fp)->private_data)->pq) ++#ifdef QIB_CONFIG_KNX ++#define knx_node_fp(fp) \ ++ (((struct qib_filedata *)(fp)->private_data)->knx_node_id) ++#else ++/* allow the use of knx_node_fp() outside of a #ifdef QIB_CONFIG_KNX */ ++#define knx_node_fp(fp) 0 ++#endif + + static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd) + { +diff -urN a9/drivers/infiniband/hw/qib/qib_init.c a10/drivers/infiniband/hw/qib/qib_init.c +--- 
a9/drivers/infiniband/hw/qib/qib_init.c 2015-01-05 15:05:04.279461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_init.c 2015-01-05 15:10:58.251446692 -0800 +@@ -51,6 +51,10 @@ + #include "qib_verbs.h" + #endif + ++#ifdef QIB_CONFIG_KNX ++#include "qib_knx.h" ++#endif ++ + #undef pr_fmt + #define pr_fmt(fmt) QIB_DRV_NAME ": " fmt + +@@ -1301,6 +1305,12 @@ + /* not fatal if it doesn't work */ + if (qib_init_qibfs()) + pr_err("Unable to register ipathfs\n"); ++ ++#ifdef QIB_CONFIG_KNX ++ ret = qib_knx_server_init(); ++ if (ret < 0) ++ pr_err(": Unable to start KNX listen thread\n"); ++#endif + goto bail; /* all OK */ + + bail_dev: +@@ -1325,6 +1335,9 @@ + { + int ret; + ++#ifdef QIB_CONFIG_KNX ++ qib_knx_server_exit(); ++#endif + ret = qib_exit_qibfs(); + if (ret) + pr_err( +@@ -1568,6 +1581,9 @@ + /* unregister from IB core */ + qib_unregister_ib_device(dd); + ++#ifdef QIB_CONFIG_KNX ++ qib_knx_remove_device(dd); ++#endif + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. +diff -urN a9/drivers/infiniband/hw/qib/qib_knx.c a10/drivers/infiniband/hw/qib/qib_knx.c +--- a9/drivers/infiniband/hw/qib/qib_knx.c 1969-12-31 16:00:00.000000000 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_knx.c 2015-01-05 15:10:58.252446692 -0800 +@@ -0,0 +1,1532 @@ ++/* ++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "qib.h" ++#include "qib_knx.h" ++#include "qib_user_sdma.h" ++#include "qib_knx_common.h" ++ ++unsigned int qib_knx_nconns = 5; ++module_param_named(num_conns, qib_knx_nconns, uint, S_IRUGO); ++MODULE_PARM_DESC(num_conns, "Max number of pending connections"); ++ ++#define QIB_KNX_SCIF_PORT SCIF_OFED_PORT_9 ++#define CLIENT_THREAD_NAME(x) "qib/mic" __stringify(x) ++ ++#define knx_sdma_next(sdma) \ ++ (sdma->head = ((sdma->head + 1) % sdma->desc_num)) ++#define per_ctxt(ctxt, sub) ((ctxt * QLOGIC_IB_MAX_SUBCTXT) + sub) ++#define QIB_KNX_SDMA_STATUS(sdma, st) \ ++ QIB_KNX_SDMA_SET(sdma->mflags->status, ((u64)st << 32) | 1) ++ ++struct qib_knx_server { ++ struct task_struct *kthread; ++ struct scif_pollepd epd; ++ spinlock_t client_lock; ++ struct list_head clients; ++ unsigned int nclients; ++}; ++ ++struct qib_knx_rma { ++ /* SCIF registered offset */ ++ off_t offset; ++ /* size of mapped memory (in bytes) */ ++ size_t size; ++ /* kernel virtual address of ioremap'ed memory */ ++ void *kvaddr; ++}; ++ ++struct qib_knx_mem_map { ++ /* physical address is DMA range */ ++ dma_addr_t dma_mapped_addr; ++ /* DMA direction */ ++ enum dma_data_direction dir; ++ /* size of remote memory area */ ++ size_t size; ++ /* SCIF array of physical pages */ ++ struct scif_range *pages; ++}; ++ ++struct qib_knx_mem_map_sg { ++ /* list of pages to map */ ++ struct scatterlist *sglist; ++ /* DMA direction */ ++ enum dma_data_direction dir; ++ /* total size of mapped memory */ ++ size_t size; ++ struct scif_range *pages; ++}; ++ ++struct qib_knx_tidrcv { ++ struct qib_knx_rma tidmem; ++ u64 tidbase; ++ u32 tidcnt; ++}; ++ ++struct qib_knx_ctxt { ++ u16 ctxt; ++ struct qib_knx *knx; ++ struct qib_pportdata *ppd; ++ /* local registered memory for PIO buffers */ ++ struct qib_knx_rma piobufs[QLOGIC_IB_MAX_SUBCTXT]; ++ /* local registered memory for user registers */ ++ struct qib_knx_rma uregs; ++ /* local registered memory for PIO avail registers */ ++ struct qib_knx_rma pioavail; ++ /* remote registered memory for RcvHdr Q */ ++ struct qib_knx_mem_map_sg rcvhdrq; ++ /* remote registered memory for SendBuf status */ ++ struct qib_knx_mem_map sbufstatus; ++ /* remote registered memory for RcvHdrTail register */ ++ struct qib_knx_mem_map rcvhdrqtailaddr; ++ /* remote registered memory for Eager buffers */ ++ struct qib_knx_mem_map_sg eagerbufs; ++ ++ /* Saved offsets for shared context processes */ ++ __u64 uregbase; ++ __u64 pioavailaddr; ++ __u64 status; ++ __u64 piobufbase[QLOGIC_IB_MAX_SUBCTXT]; ++ __u32 runtime_flags; ++ ++ struct qib_user_sdma_queue *pq[QLOGIC_IB_MAX_SUBCTXT]; ++}; ++ ++struct qib_knx_sdma { ++ /* KNX flags page */ ++ struct scif_range *mflag_pages; ++ struct qib_knx_sdma_mflags *mflags; ++ /* KNX descriptor queue */ ++ struct scif_range *queue_pages; ++ struct qib_knx_sdma_desc *queue; ++ u32 desc_num; ++ /* host flags (in host memory) */ ++ struct qib_knx_rma hflags_mem; ++ struct qib_knx_sdma_hflags *hflags; ++ u32 head; /* shadow */ ++ u32 complete; ++}; ++ ++struct qib_knx { ++ struct list_head list; ++ struct scif_pollepd epd; ++ struct scif_portID peer; ++ struct scif_pci_info pci_info; ++ int numa_node; ++ struct qib_devdata *dd; ++ struct qib_knx_ctxt **ctxts; ++ spinlock_t ctxt_lock; ++ resource_size_t bar; ++ u64 barlen; ++ struct qib_knx_sdma *sdma; ++ struct task_struct *sdma_poll; ++ atomic_t tref; ++ char tname[64]; ++ struct qib_knx_rma tidmem; ++}; ++ ++static struct qib_knx_server *server; ++ 
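++/*
++ * A single listener thread (qib_knx_server_listen) accepts SCIF
++ * connections from the coprocessors.  Each accepted connection is
++ * tracked as a struct qib_knx on the server's client list and is
++ * bound to an HCA on the card's NUMA node when one is available,
++ * otherwise to a fallback device.
++ */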
++static int qib_knx_init(struct qib_knx_server *); ++static void qib_knx_free(struct qib_knx *, int); ++static int qib_knx_server_listen(void *); ++static off_t qib_knx_register_memory(struct qib_knx *, struct qib_knx_rma *, ++ void *, size_t, int, const char *); ++static int qib_knx_unregister_memory(struct qib_knx *, struct qib_knx_rma *, ++ const char *); ++static __always_inline void qib_knx_memcpy(void *, void __iomem *, size_t); ++static ssize_t qib_show_knx_node(struct device *, struct device_attribute *, ++ char *); ++static int qib_knx_sdma_init(struct qib_knx *); ++static void qib_knx_sdma_teardown(struct qib_knx *); ++static __always_inline struct page * ++qib_knx_phys_to_page(struct qib_knx *, unsigned long); ++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *, ++ struct qib_knx_sdma_desc *, ++ struct qib_user_sdma_queue *, ++ int *, struct list_head *); ++static int qib_knx_sdma_poll(void *); ++static int qib_knx_tidrcv_init(struct qib_knx *); ++static int qib_knx_tidrcv_teardown(struct qib_knx *); ++ ++inline struct qib_knx *qib_knx_get(u16 nodeid) ++{ ++ struct qib_knx *knx = NULL; ++ ++ spin_lock(&server->client_lock); ++ if (!list_empty(&server->clients)) ++ list_for_each_entry(knx, &server->clients, list) ++ if (knx->peer.node == nodeid) ++ break; ++ spin_unlock(&server->client_lock); ++ return knx; ++} ++ ++inline struct qib_devdata *qib_knx_node_to_dd(u16 node) ++{ ++ struct qib_knx *knx = qib_knx_get(node); ++ return knx ? knx->dd : NULL; ++} ++ ++static int qib_knx_init(struct qib_knx_server *server) ++{ ++ int ret = 0, num_devs = 0, i, seen = 0; ++ unsigned fewest = -1U; ++ struct qib_devdata *dd = NULL, *dd_no_numa = NULL; ++ struct qib_knx *knx; ++ struct qib_device_info info = { -1 }; ++ ++ knx = kzalloc(sizeof(*knx), GFP_KERNEL); ++ if (!knx) { ++ ret = -ENOMEM; ++ goto bail; ++ } ++ ret = scif_accept(server->epd.epd, &knx->peer, &knx->epd.epd, 0); ++ if (ret) { ++ kfree(knx); ++ goto bail; ++ } ++ ++ INIT_LIST_HEAD(&knx->list); ++ spin_lock_init(&knx->ctxt_lock); ++ knx->numa_node = -1; ++ ret = scif_pci_info(knx->peer.node, &knx->pci_info); ++ if (!ret) { ++ knx->numa_node = pcibus_to_node(knx->pci_info.pdev->bus); ++ knx->bar = pci_resource_start(knx->pci_info.pdev, 0); ++ knx->barlen = pci_resource_len(knx->pci_info.pdev, 0); ++ } ++ ++ if (knx->numa_node < 0) ++ knx->numa_node = numa_node_id(); ++ ++ num_devs = qib_count_units(NULL, NULL); ++ if (unlikely(!num_devs)) { ++ ret = -ENODEV; ++ /* we have to send this */ ++ scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ goto done; ++ } ++ ++ /* ++ * Attempt to find an HCA on the same NUMA node as the card. Save ++ * the first HCA that hasn't been associated with a card in case ++ * there is no HCA on the same NUMA node. ++ */ ++ for (i = 0; seen < num_devs; i++) { ++ dd = qib_lookup(i); ++ if (dd) { ++ if (dd->assigned_node_id == knx->numa_node) { ++ knx->dd = dd; ++ break; ++ } else if (dd->num_knx < fewest) ++ dd_no_numa = dd; ++ seen++; ++ } ++ } ++ /* ++ * We didn't find a QIB device on the same NUMA node, ++ * use the "backup". ++ */ ++ if (unlikely(!knx->dd)) { ++ if (!dd_no_numa) { ++ ret = -ENODEV; ++ /* we have to send this */ ++ scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ goto done; ++ } ++ knx->dd = dd_no_numa; ++ } ++ knx->dd->num_knx++; ++ ++ knx->ctxts = kzalloc_node(knx->dd->ctxtcnt * sizeof(*knx->ctxts), ++ GFP_KERNEL, knx->numa_node); ++ if (!knx->ctxts) ++ ret = -ENOMEM; ++ /* Give the KNX the associated device information. 
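++ * (On the failure paths above the structure was already sent with
++ * unit left at -1.)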
*/ ++ info.unit = knx->dd->unit; ++ ret = scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ ++ ret = qib_knx_sdma_init(knx); ++ if (ret) ++ goto done; ++ atomic_set(&knx->tref, 0); ++ ret = qib_knx_tidrcv_init(knx); ++done: ++ spin_lock(&server->client_lock); ++ list_add_tail(&knx->list, &server->clients); ++ server->nclients++; ++ spin_unlock(&server->client_lock); ++ try_module_get(THIS_MODULE); ++bail: ++ return ret; ++} ++ ++static void qib_knx_free(struct qib_knx *knx, int unload) ++{ ++ struct qib_devdata *dd = knx->dd; ++ int i; ++ ++ qib_knx_tidrcv_teardown(knx); ++ qib_knx_sdma_teardown(knx); ++ if (dd) ++ dd->num_knx--; ++ /* ++ * If this function is called with unload set, we can ++ * free the context data. Otherwise, we are here ++ * because the connection between the modules has broken. ++ */ ++ if (knx->ctxts && unload && dd) ++ for (i = dd->first_user_ctxt; i < dd->ctxtcnt; i++) ++ qib_knx_free_ctxtdata(dd, dd->rcd[i]); ++ ++ scif_close(knx->epd.epd); ++ module_put(THIS_MODULE); ++ if (unload) ++ kfree(knx->ctxts); ++} ++ ++static int qib_knx_server_listen(void *data) ++{ ++ struct qib_knx_server *server = ++ (struct qib_knx_server *)data; ++ struct qib_knx *client, *ptr; ++ int ret = 0; ++ ++ server->epd.epd = scif_open(); ++ if (!server->epd.epd) { ++ ret = -EIO; ++ goto done; ++ } ++ server->epd.events = POLLIN; ++ ret = scif_bind(server->epd.epd, QIB_KNX_SCIF_PORT); ++ if (ret < 0) ++ goto err_close; ++ ++ ret = scif_listen(server->epd.epd, qib_knx_nconns); ++ if (ret) ++ goto err_close; ++ ++ while (!kthread_should_stop()) { ++ schedule(); ++ ++ /* poll for one millisecond. Is 50ms good? */ ++ ret = scif_poll(&server->epd, 1, 50); ++ if (ret > 0) ++ ret = qib_knx_init(server); ++ ++ /* ++ * Check for any disconnected clients and clean them up. ++ * Since there is nothing anywhere else that can change the ++ * list, we only lock when we are deleting a client so ++ * querying functions operate on "static" list. 
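++ * A client whose endpoint polls back POLLHUP is unlinked under the
++ * lock and its resources are released.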
++ */ ++ list_for_each_entry_safe(client, ptr, &server->clients, list) { ++ client->epd.events = POLLIN; ++ if (scif_poll(&client->epd, 1, 1)) { ++ if (client->epd.revents & POLLHUP) { ++ spin_lock(&server->client_lock); ++ list_del(&client->list); ++ spin_unlock(&server->client_lock); ++ qib_knx_free(client, 0); ++ kfree(client); ++ } ++ } ++ } ++ } ++err_close: ++ scif_close(server->epd.epd); ++done: ++ return ret; ++} ++ ++ ++static off_t qib_knx_register_memory(struct qib_knx *knx, ++ struct qib_knx_rma *rma, void *kvaddr, ++ size_t size, int prot, const char *what) ++{ ++ int ret = 0; ++ off_t regoffset; ++ ++ if (!kvaddr || ((unsigned long)kvaddr & ~PAGE_MASK)) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ rma->kvaddr = kvaddr; ++ rma->size = size; ++ ++ regoffset = scif_register(knx->epd.epd, rma->kvaddr, rma->size, ++ 0, prot, SCIF_MAP_KERNEL); ++ if (IS_ERR_VALUE(regoffset)) { ++ ret = regoffset; ++ goto bail; ++ } ++ rma->offset = regoffset; ++ return regoffset; ++bail: ++ rma->kvaddr = NULL; ++ rma->size = 0; ++ return ret; ++} ++ ++static int qib_knx_unregister_memory(struct qib_knx *knx, ++ struct qib_knx_rma *rma, const char *what) ++{ ++ int ret = 0; ++ ++ if (!rma) { ++ ret = -EINVAL; ++ goto done; ++ } ++ if (rma->offset) ++ ret = scif_unregister(knx->epd.epd, rma->offset, rma->size); ++ rma->kvaddr = NULL; ++ rma->size = 0; ++ rma->offset = 0; ++done: ++ return ret; ++} ++ ++static __always_inline void qib_knx_memcpy(void *dst, void __iomem *src, ++ size_t size) ++{ ++ memcpy_fromio(dst, src, size); ++} ++ ++int qib_knx_alloc_ctxt(u16 node_id, unsigned ctxt) ++{ ++ struct qib_knx *knx = qib_knx_get(node_id); ++ struct qib_devdata *dd = knx->dd; ++ struct qib_knx_ctxt *ptr; ++ int ret = 0; ++ ++ if (ctxt >= dd->ctxtcnt) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ if (unlikely(!knx->ctxts)) { ++ ret = -ENOMEM; ++ goto bail; ++ } ++ ptr = kzalloc_node(sizeof(*ptr), GFP_KERNEL, knx->numa_node); ++ if (unlikely(!ptr)) { ++ ret = -ENOMEM; ++ goto bail; ++ } ++ ptr->knx = knx; ++ ptr->ctxt = ctxt; ++ ptr->ppd = dd->rcd[ctxt]->ppd; ++ ++ spin_lock(&knx->ctxt_lock); ++ knx->ctxts[ctxt] = ptr; ++ dd->rcd[ctxt]->krcd = ptr; ++ spin_unlock(&knx->ctxt_lock); ++bail: ++ return ret; ++} ++ ++__u64 qib_knx_ctxt_info(struct qib_ctxtdata *rcd, ++ enum qib_knx_ctxtinfo_type type, ++ struct file *fp) ++{ ++ struct qib_knx *knx = rcd->krcd->knx; ++ __u16 subctxt; ++ __u64 ret = 0; ++ ++ spin_lock(&knx->ctxt_lock); ++ if (!knx || !knx->ctxts || !knx->ctxts[rcd->ctxt]) ++ goto done; ++ ++ switch (type) { ++ case QIB_KNX_CTXTINFO_UREG: ++ ret = knx->ctxts[rcd->ctxt]->uregbase; ++ break; ++ case QIB_KNX_CTXTINFO_PIOAVAIL: ++ ret = knx->ctxts[rcd->ctxt]->pioavailaddr; ++ break; ++ case QIB_KNX_CTXTINFO_STATUS: ++ ret = knx->ctxts[rcd->ctxt]->status; ++ break; ++ case QIB_KNX_CTXTINFO_PIOBUFBASE: ++ subctxt = fp ? 
subctxt_fp(fp) : 0; ++ ret = knx->ctxts[rcd->ctxt]->piobufbase[subctxt]; ++ break; ++ case QIB_KNX_CTXTINFO_FLAGS: ++ ret = knx->ctxts[rcd->ctxt]->runtime_flags; ++ break; ++ } ++done: ++ spin_unlock(&knx->ctxt_lock); ++ return ret; ++} ++ ++int qib_knx_setup_piobufs(struct qib_devdata *dd, struct qib_ctxtdata *rcd, ++ __u16 subctxt) ++{ ++ unsigned piobufs, piocnt; ++ char buf[16]; ++ off_t offset; ++ int ret = 0; ++ struct qib_knx *knx = rcd->krcd->knx; ++ ++ if (unlikely(!knx)) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ if (unlikely(!knx->ctxts[rcd->ctxt])) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ /* ++ * We don't calculate piobufs based on the rcd->piobufs like ++ * everywhere else in the driver because rcd->piobufs is based ++ * on the 2K PIO buffer virtual address. We just need an offset. ++ */ ++ piobufs = rcd->pio_base * dd->palign; ++ if (!rcd->subctxt_cnt) ++ piocnt = rcd->piocnt; ++ else if (!subctxt) { ++ piocnt = (rcd->piocnt / rcd->subctxt_cnt) + ++ (rcd->piocnt % rcd->subctxt_cnt); ++ piobufs += dd->palign * (rcd->piocnt - piocnt); ++ } else { ++ piocnt = rcd->piocnt / rcd->subctxt_cnt; ++ piobufs += dd->palign * piocnt * (subctxt - 1); ++ } ++ ++ /* register PIO buffers */ ++ snprintf(buf, sizeof(buf), "PIO bufs %u:%u", rcd->ctxt, subctxt); ++ offset = qib_knx_register_memory( ++ knx, &knx->ctxts[rcd->ctxt]->piobufs[subctxt], ++ dd->piobase + piobufs, piocnt * dd->palign, ++ SCIF_PROT_WRITE, buf); ++ if (IS_ERR_VALUE(offset)) { ++ ret = offset; ++ goto bail; ++ } ++ knx->ctxts[rcd->ctxt]->piobufbase[subctxt] = offset; ++bail: ++ return ret; ++} ++ ++int qib_knx_setup_pioregs(struct qib_devdata *dd, struct qib_ctxtdata *rcd, ++ struct qib_base_info *binfo) ++{ ++ int ret = 0; ++ off_t offset; ++ struct qib_knx *knx = rcd->krcd->knx; ++ ++ if (unlikely(!knx)) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ if (unlikely(!knx->ctxts[rcd->ctxt])) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ /* register the user registers to remote mapping */ ++ offset = qib_knx_register_memory(knx, &knx->ctxts[rcd->ctxt]->uregs, ++ (char *)dd->userbase + ++ (dd->ureg_align * rcd->ctxt), ++ dd->flags & QIB_HAS_HDRSUPP ? ++ 2 * PAGE_SIZE : PAGE_SIZE, ++ SCIF_PROT_READ|SCIF_PROT_WRITE, ++ "UserRegs"); ++ if (IS_ERR_VALUE(offset)) { ++ ret = offset; ++ goto bail; ++ } ++ knx->ctxts[rcd->ctxt]->uregbase = offset; ++ ++ /* ++ * register the PIO availability registers. ++ * user status 64bit values are part of the page containing the ++ * pio availability registers. ++ */ ++ offset = qib_knx_register_memory(knx, &knx->ctxts[rcd->ctxt]->pioavail, ++ (void *)dd->pioavailregs_dma, ++ PAGE_SIZE, SCIF_PROT_READ, ++ "pioavail regs"); ++ if (IS_ERR_VALUE(offset)) { ++ ret = offset; ++ goto bail_uregs; ++ } ++ knx->ctxts[rcd->ctxt]->pioavailaddr = offset; ++ /* ++ * User status bitmask is part of the same mapped page as the PIO ++ * availability bits and user level code should know that. Therefore, ++ * we just need to give it the offset into the mapped page where the ++ * status mask is located. ++ */ ++ knx->ctxts[rcd->ctxt]->status = offset; ++ /* Record the run time flags that were passed in by the user. 
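++ * qib_knx_create_rcvhdrq() may later clear QIB_RUNTIME_NODMA_RTAIL
++ * in this saved copy once the tail register has been DMA-mapped.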
*/ ++ knx->ctxts[rcd->ctxt]->runtime_flags = binfo->spi_runtime_flags; ++ goto bail; ++bail_uregs: ++ qib_knx_unregister_memory(knx, &knx->ctxts[rcd->ctxt]->uregs, ++ "UserRegs"); ++bail: ++ return ret; ++} ++ ++int qib_knx_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd, ++ struct qib_base_info *binfo) ++{ ++ struct qib_knx_mem_map_sg *mapsg; ++ struct qib_knx_mem_map *map; ++ struct qib_knx *knx = rcd->krcd->knx; ++ dma_addr_t offset; ++ struct scatterlist *sg; ++ unsigned num_pages; ++ size_t size; ++ int ret = 0, i; ++ ++ if (unlikely(!knx)) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ if (unlikely(!knx->ctxts[rcd->ctxt])) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ if (unlikely(!binfo->spi_rcvhdr_base)) { ++ ret = -EIO; ++ goto bail; ++ } ++ ++ size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize * ++ sizeof(u32), PAGE_SIZE); ++ mapsg = &knx->ctxts[rcd->ctxt]->rcvhdrq; ++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcvhdr_base, ++ size, &mapsg->pages); ++ if (ret) ++ goto bail; ++ if (!mapsg->pages->nr_pages) { ++ rcd->rcvhdrq = NULL; ++ ret = -ENOMEM; ++ goto bail_rcvq_pages; ++ } ++ num_pages = mapsg->pages->nr_pages; ++ if (num_pages * PAGE_SIZE != size) { ++ ret = -EINVAL; ++ goto bail_rcvq_pages; ++ } ++ rcd->rcvhdrq_size = size; ++ /* verify that rcvhdr q is contiguous */ ++ offset = mapsg->pages->phys_addr[0]; ++ for (i = 1; i < num_pages; i++) { ++ if (offset + PAGE_SIZE != mapsg->pages->phys_addr[i]) { ++ ret = -EFAULT; ++ goto bail_rcvq_pages; ++ } ++ offset += PAGE_SIZE; ++ } ++ memset(mapsg->pages->va[0], 0, size); ++ mapsg->size = size; ++ mapsg->dir = DMA_FROM_DEVICE; ++ /* ++ * Streaming DMa mappings are supposed to be short-lived. ++ * The mappings here are not exactly short-lived and ++ * technically we might not even need them since SusieQ ++ * can use 64bit addresses for DMA but the CPU might not. ++ * (see pci_set_dma_mask() in qib_pcie.c). ++ */ ++ mapsg->sglist = kzalloc_node(num_pages * sizeof(*mapsg->sglist), ++ GFP_KERNEL, knx->numa_node); ++ if (!mapsg->sglist) { ++ ret = -ENOMEM; ++ goto bail_rcvq_pages; ++ } ++ sg_init_table(mapsg->sglist, num_pages); ++ for_each_sg(mapsg->sglist, sg, num_pages, i) ++ sg_set_page(sg, vmalloc_to_page(mapsg->pages->va[i]), PAGE_SIZE, ++ 0); ++ ret = pci_map_sg(dd->pcidev, mapsg->sglist, num_pages, mapsg->dir); ++ if (!ret) { ++ rcd->rcvhdrq_phys = 0; ++ goto bail_free_sgtable; ++ } ++ /* ++ * pci_map_sg() will remap all 128 pages of the ++ * scatterlist separately (without coalescing them). ++ * However, since the buffer is contiguous, as long ++ * as the base address is mapped correctly, everything ++ * should work. In any case, check that the mapped ++ * addresses are contiguous anyway. ++ */ ++ offset = sg_dma_address(mapsg->sglist); ++ for_each_sg(mapsg->sglist, sg, num_pages, i) { ++ dma_addr_t sgaddr; ++ sgaddr = sg_dma_address(sg); ++ if ((offset == sgaddr && i) || ++ (offset != sgaddr && sgaddr != offset + PAGE_SIZE)) { ++ ret = -EINVAL; ++ goto bail_rcvhdrq; ++ } ++ offset = sgaddr; ++ } ++ rcd->rcvhdrq_phys = sg_dma_address(mapsg->sglist); ++ rcd->rcvhdrq = mapsg->pages->va[0]; ++ ++ map = &knx->ctxts[rcd->ctxt]->sbufstatus; ++ ret = scif_get_pages(knx->epd.epd, binfo->spi_sendbuf_status, ++ PAGE_SIZE, &map->pages); ++ if (ret) ++ goto bail_rcvhdrq; ++ ++ map->size = PAGE_SIZE; ++ if (map->pages->nr_pages > 0) { ++ rcd->user_event_mask = map->pages->va[0]; ++ /* ++ * clear the mapped page - this is important as it will cause ++ * user level to request "invalid" updates on every PIO send. 
++ */ ++ memset(rcd->user_event_mask, 0, PAGE_SIZE); ++ } ++ /* ++ * Map the rcvhdrtailaddr page(s) if we are goign to DMA the tail ++ * register to memory, the chip will be prgrammed when ++ * qib_do_user_init() calls f_rcvctrl(). ++ */ ++ if (!(dd->flags & QIB_NODMA_RTAIL) && binfo->spi_rcvhdr_tailaddr) { ++ map = &knx->ctxts[rcd->ctxt]->rcvhdrqtailaddr; ++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcvhdr_tailaddr, ++ PAGE_SIZE, &map->pages); ++ if (ret) ++ goto bail_umask; ++ map->size = PAGE_SIZE; ++ map->dir = DMA_FROM_DEVICE; ++ /* don't reuse num_pages in case there is an error */ ++ if (map->pages->nr_pages > 0) { ++ rcd->rcvhdrqtailaddr_phys = ++ pci_map_page(dd->pcidev, ++ vmalloc_to_page(map->pages->va[0]), ++ 0, map->size, map->dir); ++ if (pci_dma_mapping_error(dd->pcidev, ++ rcd->rcvhdrqtailaddr_phys)) { ++ rcd->rcvhdrqtailaddr_phys = 0; ++ ret = -ENOMEM; ++ goto bail_tail; ++ } ++ rcd->rcvhdrtail_kvaddr = map->pages->va[0]; ++ /* clear, just in case... */ ++ memset(rcd->rcvhdrtail_kvaddr, 0, map->size); ++ map->dma_mapped_addr = ++ rcd->rcvhdrqtailaddr_phys; ++ knx->ctxts[rcd->ctxt]->runtime_flags &= ++ ~QIB_RUNTIME_NODMA_RTAIL; ++ } ++ } ++ ret = 0; ++ goto bail; ++bail_tail: ++ scif_put_pages(knx->ctxts[rcd->ctxt]->rcvhdrqtailaddr.pages); ++bail_umask: ++ rcd->user_event_mask = NULL; ++ scif_put_pages(knx->ctxts[rcd->ctxt]->sbufstatus.pages); ++bail_rcvhdrq: ++ rcd->rcvhdrq = NULL; ++ pci_unmap_sg(dd->pcidev, knx->ctxts[rcd->ctxt]->rcvhdrq.sglist, ++ num_pages, knx->ctxts[rcd->ctxt]->rcvhdrq.dir); ++bail_free_sgtable: ++ kfree(knx->ctxts[rcd->ctxt]->rcvhdrq.sglist); ++bail_rcvq_pages: ++ scif_put_pages(knx->ctxts[rcd->ctxt]->rcvhdrq.pages); ++bail: ++ return ret; ++} ++ ++int qib_knx_setup_eagerbufs(struct qib_ctxtdata *rcd, ++ struct qib_base_info *binfo) ++{ ++ struct qib_knx_mem_map_sg *map; ++ struct scatterlist *sg; ++ struct qib_devdata *dd = rcd->dd; ++ struct qib_knx *knx = rcd->krcd->knx; ++ unsigned size, egrsize, egrcnt, num_pages, bufs_ppage, ++ egrbufcnt; ++ dma_addr_t dma_addr, page; ++ int ret = -ENOMEM, i, bufcnt; ++ ++ if (unlikely(!knx)) { ++ ret = -ENODEV; ++ goto bail; ++ } ++ if (unlikely(!knx->ctxts[rcd->ctxt])) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ if (unlikely(!binfo->spi_rcv_egrbufs)) { ++ ret = -ENOBUFS; ++ goto bail; ++ } ++ size = binfo->spi_rcv_egrbuftotlen; ++ egrsize = dd->rcvegrbufsize; ++ egrcnt = rcd->rcvegrcnt; ++ ++ /* ++ * Check whether the total size of the buffer is enough for all ++ * Eager buffers. ++ */ ++ if (size < egrsize * egrcnt) { ++ ret = -EINVAL; ++ goto bail; ++ } ++ ++ /* number of pages required to fit all the eager buffers */ ++ num_pages = (egrsize * egrcnt) / PAGE_SIZE; ++ /* number of buffers per page (depends on MTU) */ ++ bufs_ppage = PAGE_SIZE / egrsize; ++ map = &knx->ctxts[rcd->ctxt]->eagerbufs; ++ ret = scif_get_pages(knx->epd.epd, binfo->spi_rcv_egrbufs, ++ size, &map->pages); ++ if (ret) ++ goto bail; ++ ++ if (map->pages->nr_pages != num_pages) { ++ ret = -EINVAL; ++ goto bail_free_scif; ++ } ++ ++ /* ++ * Allocate pointer to the pages from the KNX memory. ++ * In the case of KNX eager buffers, we are not dealing with ++ * 32K chunks of locally allocated memory. Therefore, we ++ * allocate num_pages pointers instead of rcd->rcvegrbuf_chunks. 
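++ * Each entry ends up pointing at one mapped KNX page, with the
++ * matching DMA address kept in rcd->rcvegrbuf_phys[].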
++ */ ++ if (likely(!rcd->rcvegrbuf)) { ++ rcd->rcvegrbuf = kzalloc_node(num_pages * ++ sizeof(rcd->rcvegrbuf[0]), ++ GFP_KERNEL, rcd->node_id); ++ if (!rcd->rcvegrbuf) { ++ ret = -ENOMEM; ++ goto bail_free_scif; ++ } ++ } ++ ++ /* ++ * Allocate array of DMA addresses for each of the mapped ++ * pages. ++ */ ++ if (likely(!rcd->rcvegrbuf_phys)) { ++ rcd->rcvegrbuf_phys = ++ kzalloc_node(num_pages * sizeof(rcd->rcvegrbuf_phys[0]), ++ GFP_KERNEL, rcd->node_id); ++ if (!rcd->rcvegrbuf_phys) { ++ ret = -ENOMEM; ++ goto bail_free_rcvegr; ++ } ++ } ++ ++ map->size = size; ++ map->dir = DMA_BIDIRECTIONAL; ++ map->sglist = kzalloc_node(num_pages * sizeof(*map->sglist), GFP_KERNEL, ++ knx->numa_node); ++ if (!map->sglist) { ++ ret = -ENOMEM; ++ goto bail_free_rcvegr_phys; ++ } ++ sg_init_table(map->sglist, num_pages); ++ for_each_sg(map->sglist, sg, num_pages, i) { ++ memset(map->pages->va[i], 0, PAGE_SIZE); ++ sg_set_page(sg, vmalloc_to_page(map->pages->va[i]), ++ PAGE_SIZE, 0); ++ } ++ ret = pci_map_sg(dd->pcidev, map->sglist, num_pages, map->dir); ++ if (!ret) { ++ ret = -ENOMEM; ++ goto bail_free_rcvegr_phys; ++ } ++ for_each_sg(map->sglist, sg, num_pages, i) { ++ rcd->rcvegrbuf_phys[i] = sg_dma_address(sg); ++ rcd->rcvegrbuf[i] = map->pages->va[i]; ++ } ++ ++ for (egrbufcnt = i = 0; i < num_pages; i++) { ++ page = rcd->rcvegrbuf_phys[i]; ++ dma_addr = page; ++ for (bufcnt = 0; egrbufcnt < egrcnt && bufcnt < bufs_ppage; ++ egrbufcnt++, bufcnt++) { ++ dd->f_put_tid(dd, rcd->rcvegr_tid_base + ++ egrbufcnt + ++ (u64 __iomem *)((char __iomem *) ++ dd->kregbase + ++ dd->rcvegrbase), ++ RCVHQ_RCV_TYPE_EAGER, dma_addr); ++ dma_addr += egrsize; ++ } ++ } ++ ret = 0; ++ goto bail; ++bail_free_rcvegr_phys: ++ kfree(map->sglist); ++ kfree(rcd->rcvegrbuf_phys); ++ rcd->rcvegrbuf_phys = NULL; ++bail_free_rcvegr: ++ kfree(rcd->rcvegrbuf); ++ rcd->rcvegrbuf = NULL; ++bail_free_scif: ++ scif_put_pages(map->pages); ++bail: ++ return ret; ++} ++ ++void qib_knx_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd) ++{ ++ struct qib_knx *knx = rcd->krcd->knx; ++ struct qib_knx_ctxt *ctxt; ++ char buf[16]; ++ int i, ret = 0; ++ ++ if (!rcd || !knx || !knx->ctxts) ++ return; ++ ++ spin_lock(&knx->ctxt_lock); ++ ctxt = knx->ctxts[rcd->ctxt]; ++ knx->ctxts[rcd->ctxt] = NULL; ++ spin_unlock(&knx->ctxt_lock); ++ ++ if (!ctxt) ++ return; ++ ++ if (rcd->rcvhdrq) { ++ /* Unmap the RcvHdr Q */ ++ pci_unmap_sg(dd->pcidev, ctxt->rcvhdrq.sglist, ++ ctxt->rcvhdrq.pages->nr_pages, ++ ctxt->rcvhdrq.dir); ++ /* TODO: do something with return value */ ++ ret = scif_put_pages(ctxt->rcvhdrq.pages); ++ kfree(ctxt->rcvhdrq.sglist); ++ } ++ ++ if (rcd->user_event_mask) ++ /* TODO: do something with return value */ ++ ret = scif_put_pages(ctxt->sbufstatus.pages); ++ ++ if (rcd->rcvhdrtail_kvaddr) { ++ pci_unmap_page(dd->pcidev, ++ ctxt->rcvhdrqtailaddr.dma_mapped_addr, ++ ctxt->rcvhdrqtailaddr.size, ++ ctxt->rcvhdrqtailaddr.dir); ++ /* TODO: do something with return value */ ++ ret = scif_put_pages(ctxt->rcvhdrqtailaddr.pages); ++ } ++ ++ if (rcd->rcvegrbuf) { ++ pci_unmap_sg(dd->pcidev, ctxt->eagerbufs.sglist, ++ ctxt->eagerbufs.pages->nr_pages, ++ ctxt->eagerbufs.dir); ++ /* TODO: do something with return value */ ++ ret = scif_put_pages(ctxt->eagerbufs.pages); ++ kfree(ctxt->eagerbufs.sglist); ++ kfree(rcd->rcvegrbuf); ++ kfree(rcd->rcvegrbuf_phys); ++ } ++ ++ /* We are done with all remote memory, handle local */ ++ qib_knx_unregister_memory(knx, &ctxt->pioavail, "pioavail regs"); ++ qib_knx_unregister_memory(knx, 
&ctxt->uregs, "UserRegs"); ++ for (i = 0; i < QLOGIC_IB_MAX_SUBCTXT; i++) { ++ snprintf(buf, sizeof(buf), "PIO bufs %u:%u", rcd->ctxt, i); ++ qib_knx_unregister_memory(knx, &ctxt->piobufs[i], buf); ++ } ++ ++ kfree(ctxt); ++ kfree(rcd); ++} ++ ++/* ++ * TID management for processes on the MIC happens on the MIC. Therefore, ++ * we only register the HW TID array here. ++ * The MIC will calculate TID array offsets using the same algorithm is ++ * the host. Therefore, it is OK that the entire HW TID array is mapped ++ * since neither side should step on the other. ++ */ ++static int qib_knx_tidrcv_init(struct qib_knx *knx) ++{ ++ struct qib_devdata *dd = knx->dd; ++ struct qib_knx_tid_info info; ++ void *tidbase; ++ int ret = 0; ++ off_t offset = 0; ++ size_t len; ++ char buf[64]; ++ ++ memset(&info, 0, sizeof(info)); ++ ++ info.tidcnt = dd->rcvtidcnt; ++ tidbase = ((char *)dd->kregbase + dd->rcvtidbase); ++ info.tidbase_len = dd->ctxtcnt * dd->rcvtidcnt * sizeof(tidbase); ++ info.tidtemplate = dd->tidtemplate; ++ info.invalidtid = dd->tidinvalid; ++ /* information needed to properly calculate DMA address to MIC pages */ ++ info.bar_addr = knx->bar; ++ info.bar_len = knx->barlen; ++ ++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node); ++ offset = qib_knx_register_memory(knx, &knx->tidmem, tidbase, ++ info.tidbase_len, SCIF_PROT_WRITE, ++ buf); ++ info.tidbase_offset = offset; ++ if (IS_ERR_VALUE(offset)) ++ ret = offset; ++ len = scif_send(knx->epd.epd, &info, sizeof(info), ++ SCIF_SEND_BLOCK); ++ if (len < sizeof(info)) ++ ret = -EFAULT; ++ return ret; ++} ++ ++static int qib_knx_tidrcv_teardown(struct qib_knx *knx) ++{ ++ char buf[64]; ++ snprintf(buf, sizeof(buf), "TID array KNx%u", knx->peer.node); ++ return qib_knx_unregister_memory(knx, &knx->tidmem, buf); ++} ++ ++static int qib_knx_sdma_init(struct qib_knx *knx) ++{ ++ struct qib_knx_host_mem flags; ++ struct qib_knx_knc_mem mflags; ++ struct qib_knx_sdma *sdma; ++ char buf[64]; ++ int ret = 0; ++ ++ sdma = kzalloc_node(sizeof(*sdma), GFP_KERNEL, knx->numa_node); ++ if (!sdma) { ++ ret = -ENOMEM; ++ goto done; ++ } ++ sdma->hflags = kzalloc_node(PAGE_SIZE, GFP_KERNEL, knx->numa_node); ++ if (!sdma->hflags) { ++ ret = -ENOMEM; ++ goto done_free; ++ } ++ snprintf(buf, sizeof(buf), "Host SDMA flags KNx%u", knx->peer.node); ++ flags.flags_offset = qib_knx_register_memory(knx, &sdma->hflags_mem, ++ sdma->hflags, ++ PAGE_SIZE, ++ SCIF_PROT_WRITE, ++ buf); ++ if (IS_ERR_VALUE(flags.flags_offset)) { ++ ret = flags.flags_offset; ++ goto free_flags; ++ } ++ sdma->desc_num = knx->dd->pport[0].sdma_descq_cnt; ++ flags.desc_num = sdma->desc_num; ++ ret = scif_send(knx->epd.epd, &flags, sizeof(flags), ++ SCIF_SEND_BLOCK); ++ if (ret < sizeof(flags)) ++ goto unregister; ++ ret = scif_recv(knx->epd.epd, &mflags, sizeof(mflags), ++ SCIF_RECV_BLOCK); ++ if (ret < sizeof(mflags)) { ++ ret = -EINVAL; ++ goto unregister; ++ } ++ ret = scif_get_pages(knx->epd.epd, mflags.flags_offset, ++ PAGE_SIZE, &sdma->mflag_pages); ++ if (ret < 0 || !sdma->mflag_pages->nr_pages) { ++ ret = -EFAULT; ++ goto unregister; ++ } ++ sdma->mflags = sdma->mflag_pages->va[0]; ++ ret = scif_get_pages(knx->epd.epd, mflags.queue_offset, ++ mflags.queue_len, &sdma->queue_pages); ++ if (ret < 0) ++ goto put_flags; ++ if ((sdma->queue_pages->nr_pages * PAGE_SIZE) != ++ mflags.queue_len) { ++ ret = -EFAULT; ++ goto put_queue; ++ } ++ sdma->queue = sdma->queue_pages->va[0]; ++ sdma->complete = -1; ++ sdma->head = -1; ++ /* set the initial trigger value */ ++ 
QIB_KNX_SDMA_SET(sdma->hflags->trigger, -1); ++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete); ++ snprintf(knx->tname, sizeof(knx->tname), "qib/mic%u/poll", ++ knx->peer.node); ++ knx->sdma = sdma; ++ ret = 0; ++ goto done; ++put_queue: ++ scif_put_pages(sdma->queue_pages); ++put_flags: ++ scif_put_pages(sdma->mflag_pages); ++unregister: ++ qib_knx_unregister_memory(knx, &sdma->hflags_mem, buf); ++free_flags: ++ kfree(sdma->hflags); ++done_free: ++ kfree(sdma); ++done: ++ /* ++ * we have to respond to the MIC so it doesn't get stuck ++ * in the scif_recv call ++ */ ++ scif_send(knx->epd.epd, &ret, sizeof(ret), SCIF_SEND_BLOCK); ++ return ret; ++} ++ ++static void qib_knx_sdma_teardown(struct qib_knx *knx) ++{ ++ int ret; ++ if (knx->sdma_poll) ++ ret = kthread_stop(knx->sdma_poll); ++ if (knx->sdma) { ++ if (knx->sdma->queue_pages->nr_pages) { ++ knx->sdma->queue = NULL; ++ scif_put_pages(knx->sdma->queue_pages); ++ } ++ if (knx->sdma->mflag_pages->nr_pages) { ++ knx->sdma->mflags = NULL; ++ scif_put_pages(knx->sdma->mflag_pages); ++ } ++ kfree(knx->sdma->hflags); ++ kfree(knx->sdma); ++ knx->sdma = NULL; ++ } ++} ++ ++int qib_knx_sdma_queue_create(struct file *fd) ++{ ++ struct qib_ctxtdata *rcd = ctxt_fp(fd); ++ struct qib_devdata *dd = rcd->dd; ++ struct qib_knx *knx = rcd->krcd->knx; ++ struct qib_knx_ctxt *ctxt = knx->ctxts[rcd->ctxt]; ++ u8 subctxt = subctxt_fp(fd); ++ int ret = 0; ++ ++ if (!ctxt) { ++ ret = -EINVAL; ++ goto done; ++ } ++ ctxt->pq[subctxt] = qib_user_sdma_queue_create(&dd->pcidev->dev, ++ dd->unit, rcd->ctxt, ++ subctxt); ++ if (!ctxt->pq[subctxt]) ++ ret = -ENOMEM; ++ user_sdma_queue_fp(fd) = ctxt->pq[subctxt]; ++ /* ++ * We start the polling thread the first time a user SDMA ++ * queue is created. There is no reason to take up CPU ++ * cycles before then. ++ */ ++ if (atomic_inc_return(&knx->tref) == 1) { ++ knx->sdma_poll = kthread_run(qib_knx_sdma_poll, knx, ++ knx->tname); ++ if (IS_ERR(knx->sdma_poll)) { ++ ret = -PTR_ERR(knx->sdma_poll); ++ atomic_dec(&knx->tref); ++ goto free_queue; ++ } ++ } ++ goto done; ++free_queue: ++ user_sdma_queue_fp(fd) = NULL; ++ qib_user_sdma_queue_destroy(ctxt->pq[subctxt]); ++ ctxt->pq[subctxt] = NULL; ++done: ++ return ret; ++} ++ ++void qib_knx_sdma_queue_destroy(struct qib_filedata *fd) ++{ ++ struct qib_ctxtdata *rcd = fd->rcd; ++ struct qib_knx *knx; ++ unsigned ctxt = rcd->ctxt, subctxt = fd->subctxt; ++ ++ /* Host processes do not have a KNX rcd pointer. */ ++ if (!rcd->krcd) ++ return; ++ knx = rcd->krcd->knx; ++ /* We still have the memory pointer through fd->pq */ ++ spin_lock(&knx->ctxt_lock); ++ if (knx->ctxts[ctxt]) ++ knx->ctxts[ctxt]->pq[subctxt] = NULL; ++ spin_unlock(&knx->ctxt_lock); ++ if (atomic_dec_and_test(&knx->tref)) { ++ int ret = kthread_stop(knx->sdma_poll); ++ knx->sdma_poll = NULL; ++ } ++} ++ ++/* ++ * Convert a MIC physical address to the corresponding host page. 
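++ * MIC memory is visible to the host through the card's PCI BAR, so the
++ * host physical address is simply bar + addr; addresses that would run
++ * past the end of the BAR yield NULL.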
++ */ ++static __always_inline struct page * ++qib_knx_phys_to_page(struct qib_knx *knx, unsigned long addr) { ++ unsigned long paddr; ++ if ((knx->bar + addr + PAGE_SIZE) > ++ (knx->bar + knx->barlen)) ++ return NULL; ++ paddr = knx->bar + addr; ++ return pfn_to_page(paddr >> PAGE_SHIFT); ++} ++ ++static int qib_knx_sdma_pkts_to_descs(struct qib_knx_ctxt *ctxt, ++ struct qib_knx_sdma_desc *desc, ++ struct qib_user_sdma_queue *pq, ++ int *ndesc, struct list_head *list) ++{ ++ struct qib_knx *knx = ctxt->knx; ++ struct qib_user_sdma_pkt *pkt; ++ dma_addr_t pbc_dma_addr; ++ unsigned pktnw, pbcnw; ++ u32 counter; ++ u16 frag_size; ++ int ret = 0; ++ __le32 *pbc; ++ ++ counter = pq->counter; ++ ++ pbc = qib_user_sdma_alloc_header(pq, desc->pbclen, &pbc_dma_addr); ++ if (!pbc) { ++ ret = -ENOMEM; ++ goto done; ++ } ++ memcpy(pbc, desc->pbc, desc->pbclen); ++ ++ pktnw = (le32_to_cpu(*pbc) & 0xFFFF); ++ /* ++ * This assignment is a bit strange. it's because the ++ * the pbc counts the number of 32 bit words in the full ++ * packet _except_ the first word of the pbc itself... ++ */ ++ pbcnw = (desc->pbclen >> 2) - 1; ++ ++ if (pktnw < pbcnw) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ ++ if (pktnw != ((desc->length >> 2) + pbcnw)) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ ++ frag_size = (le32_to_cpu(*pbc)>>16) & 0xFFFF; ++ if (((frag_size ? frag_size : desc->length) + desc->pbclen) > ++ ctxt->ppd->ibmaxlen) { ++ ret = -EINVAL; ++ goto free_pbc; ++ } ++ if (frag_size) { ++ /* new SDMA "protocol" */ ++ unsigned pktsize, n; ++ ++ n = desc->npages * ((2 * PAGE_SIZE / frag_size) + 1); ++ pktsize = sizeof(*pkt) + sizeof(pkt->addr[0]) * n; ++ ++ pkt = kzalloc(pktsize + desc->tidlen, GFP_KERNEL); ++ if (!pkt) { ++ ret = -ENOMEM; ++ goto free_pbc; ++ } ++ pkt->largepkt = 1; ++ pkt->frag_size = frag_size; ++ pkt->addrlimit = n + ARRAY_SIZE(pkt->addr); ++ ++ if (desc->tidlen) { ++ char *tidsmptr = (char *)pkt + pktsize; ++ memcpy(tidsmptr, desc->tidsm, desc->tidlen); ++ pkt->tidsm = ++ (struct qib_tid_session_member *)tidsmptr; ++ pkt->tidsmcount = desc->tidlen / ++ sizeof(*desc->tidsm); ++ pkt->tidsmidx = 0; ++ } ++ *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF); ++ } else { ++ /* old SDMA */ ++ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); ++ if (!pkt) { ++ ret = -ENOMEM; ++ goto free_pbc; ++ } ++ pkt->largepkt = 0; ++ pkt->frag_size = desc->length; ++ pkt->addrlimit = ARRAY_SIZE(pkt->addr); ++ } ++ pkt->bytes_togo = desc->length; ++ pkt->payload_size = 0; ++ pkt->counter = counter; ++ pkt->tiddma = !!desc->tidlen; ++ /* ++ * The generic user SDMA code will use this as a flag to ++ * decide whether to call the KNx-specific pkt free ++ * function. However, it doesn't know what the value ++ * actually means. ++ */ ++ pkt->remote = (u64)knx; ++ ++ qib_user_sdma_init_frag(pkt, 0, ++ 0, desc->pbclen, ++ 1, 0, ++ 0, 0, ++ NULL, pbc, ++ pbc_dma_addr, desc->pbclen); ++ pkt->index = 0; ++ pkt->naddr = 1; ++ ++ if (desc->npages) { ++ /* we have user data */ ++ int i; ++ struct page *page; ++ unsigned plen = 0, len = desc->length; ++ for (i = 0; i < desc->npages; i++) { ++ unsigned long off = (i == 0 ? desc->offset : 0); ++ plen = (len > PAGE_SIZE ? PAGE_SIZE : len); ++ page = qib_knx_phys_to_page(knx, desc->pages[i]); ++ ret = qib_user_sdma_page_to_frags(knx->dd, pq, ++ pkt, page, 0, off, ++ (off + plen > PAGE_SIZE ? 
++ PAGE_SIZE - off : plen), ++ NULL); ++ if (ret < 0) ++ goto free_sdma; ++ len -= plen - off; ++ } ++ } else { ++ pkt->addr[0].last_desc = 1; ++ if (pbc_dma_addr == 0) { ++ pbc_dma_addr = dma_map_single(&knx->dd->pcidev->dev, ++ pbc, desc->pbclen, ++ DMA_TO_DEVICE); ++ if (dma_mapping_error(&knx->dd->pcidev->dev, ++ pbc_dma_addr)) { ++ ret = -ENOMEM; ++ goto free_sdma; ++ } ++ pkt->addr[0].addr = pbc_dma_addr; ++ pkt->addr[0].dma_mapped = 1; ++ } ++ } ++ counter++; ++ pkt->pq = pq; ++ pkt->index = 0; ++ *ndesc = pkt->naddr; ++ ++ list_add_tail(&pkt->list, list); ++ goto done; ++free_sdma: ++ if (pkt->largepkt) ++ kfree(pkt); ++ else ++ kmem_cache_free(pq->pkt_slab, pkt); ++free_pbc: ++ if (pbc_dma_addr) ++ dma_pool_free(pq->header_cache, pbc, pbc_dma_addr); ++ else ++ kfree(pbc); ++done: ++ return ret; ++} ++ ++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt) ++{ ++ struct qib_knx *knx = (struct qib_knx *)pkt->remote; ++ struct qib_knx_sdma *sdma = knx->sdma; ++ sdma_next(sdma, complete); ++ QIB_KNX_SDMA_SET(sdma->mflags->complete, sdma->complete); ++} ++ ++static int qib_knx_sdma_poll(void *data) ++{ ++ struct qib_knx *knx = (struct qib_knx *)data; ++ struct qib_knx_ctxt *ctxt; ++ struct qib_knx_sdma_desc desc; ++ struct qib_knx_sdma *sdma = knx->sdma; ++ struct qib_user_sdma_queue *pq; ++ struct list_head list; ++ u32 new_head; ++ int ret = 0, ndesc = 0, added; ++ ++ if (!sdma) ++ return -EFAULT; ++ ++ while (!kthread_should_stop()) { ++ added = 0; ++ new_head = QIB_KNX_SDMA_VALUE(sdma->hflags->trigger); ++ while (sdma->head != new_head) { ++ knx_sdma_next(sdma); ++ qib_knx_memcpy(&desc, sdma->queue + sdma->head, ++ sizeof(desc)); ++ if (!desc.ctxt) { ++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL); ++ continue; ++ } ++ spin_lock(&knx->ctxt_lock); ++ ctxt = knx->ctxts[desc.ctxt]; ++ if (!ctxt) { ++ /* we should never get here */ ++ QIB_KNX_SDMA_STATUS(sdma, -EINVAL); ++ goto done_unlock; ++ } ++ pq = ctxt->pq[desc.subctxt]; ++ if (!pq) { ++ QIB_KNX_SDMA_STATUS(sdma, -EFAULT); ++ goto done_unlock; ++ } ++ mutex_lock(&pq->lock); ++ if (pq->added > ctxt->ppd->sdma_descq_removed) ++ qib_user_sdma_hwqueue_clean(ctxt->ppd); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean(ctxt->ppd, pq); ++ ++ INIT_LIST_HEAD(&list); ++ ret = qib_knx_sdma_pkts_to_descs(ctxt, &desc, pq, ++ &ndesc, &list); ++ QIB_KNX_SDMA_STATUS(sdma, ret); ++ if (!list_empty(&list)) { ++ if (qib_sdma_descq_freecnt(ctxt->ppd) < ++ ndesc) { ++ qib_user_sdma_hwqueue_clean( ++ ctxt->ppd); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean( ++ ctxt->ppd, pq); ++ } ++ ret = qib_user_sdma_push_pkts(ctxt->ppd, ++ pq, &list, 1); ++ if (ret < 0) ++ goto free_pkts; ++ else { ++ pq->counter++; ++ added++; ++ } ++ } ++free_pkts: ++ if (!list_empty(&list)) ++ qib_user_sdma_free_pkt_list( ++ &knx->dd->pcidev->dev, pq, &list); ++ mutex_unlock(&pq->lock); ++done_unlock: ++ spin_unlock(&knx->ctxt_lock); ++ } ++ if (!added) { ++ int i; ++ /* ++ * Push the queues along ++ * The polling thread will enter the inner loop only ++ * if the KNX has posted new descriptors to the queue. ++ * However, any packets that have been completed by ++ * the HW need to be cleaned and that won't happen ++ * unless we explicitly check. 
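++ * This idle path therefore walks every context/subcontext queue
++ * and cleans any packets the hardware has finished with.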
++ */ ++ for (i = 0; ++ i < knx->dd->ctxtcnt * QLOGIC_IB_MAX_SUBCTXT; ++ i++) { ++ int c = i / QLOGIC_IB_MAX_SUBCTXT, ++ s = i % QLOGIC_IB_MAX_SUBCTXT; ++ spin_lock(&knx->ctxt_lock); ++ ctxt = knx->ctxts[c]; ++ if (!ctxt) ++ goto loop_unlock; ++ pq = ctxt->pq[s]; ++ if (!pq) ++ goto loop_unlock; ++ mutex_lock(&pq->lock); ++ if (pq->num_sending) ++ qib_user_sdma_queue_clean(ctxt->ppd, ++ pq); ++ mutex_unlock(&pq->lock); ++loop_unlock: ++ spin_unlock(&knx->ctxt_lock); ++ } ++ might_sleep(); ++ } ++ } ++ return ret; ++} ++ ++void qib_knx_remove_device(struct qib_devdata *dd) ++{ ++ if (server && dd->num_knx) { ++ struct qib_knx *knx, *knxp; ++ list_for_each_entry_safe(knx, knxp, &server->clients, list) { ++ if (knx->dd == dd) { ++ spin_lock(&server->client_lock); ++ list_del(&knx->list); ++ server->nclients--; ++ spin_unlock(&server->client_lock); ++ qib_knx_free(knx, 0); ++ kfree(knx); ++ } ++ } ++ } ++ return; ++} ++ ++int __init qib_knx_server_init(void) ++{ ++ server = kzalloc(sizeof(struct qib_knx_server), GFP_KERNEL); ++ if (!server) ++ return -ENOMEM; ++ INIT_LIST_HEAD(&server->clients); ++ spin_lock_init(&server->client_lock); ++ server->kthread = kthread_run(qib_knx_server_listen, ++ server, CLIENT_THREAD_NAME(0)); ++ if (IS_ERR(server->kthread)) ++ return -PTR_ERR(server->kthread); ++ return 0; ++} ++ ++void __exit qib_knx_server_exit(void) ++{ ++ if (server) { ++ struct qib_knx *t, *tt; ++ /* Stop the thread so we don't accept any new connections. */ ++ kthread_stop(server->kthread); ++ list_for_each_entry_safe(t, tt, &server->clients, list) { ++ spin_lock(&server->client_lock); ++ list_del(&t->list); ++ spin_unlock(&server->client_lock); ++ qib_knx_free(t, 1); ++ kfree(t); ++ } ++ kfree(server); ++ } ++} +diff -urN a9/drivers/infiniband/hw/qib/qib_knx_common.h a10/drivers/infiniband/hw/qib/qib_knx_common.h +--- a9/drivers/infiniband/hw/qib/qib_knx_common.h 1969-12-31 16:00:00.000000000 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_knx_common.h 2015-01-05 15:10:58.252446692 -0800 +@@ -0,0 +1,126 @@ ++/* ++ * Copyright (c) 2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. 
++ */ ++#ifndef _QIB_KNX_COMMON_H ++#define _QIB_KNX_COMMON_H ++ ++struct qib_device_info { ++ u16 unit; ++}; ++ ++#define QIB_SDMA_MAX_NPAGES 33 ++#define QIB_KNX_SDMA_VALUE(fld) ((volatile u64)fld) ++#define QIB_KNX_SDMA_SET(fld, val) \ ++ do { \ ++ fld = (u64)(val); \ ++ smp_mb(); \ ++ } while (0) ++ ++struct qib_knx_host_mem { ++ off_t flags_offset; ++ unsigned desc_num; ++}; ++ ++struct qib_knx_knc_mem { ++ off_t flags_offset; ++ off_t queue_offset; ++ size_t queue_len; ++}; ++ ++struct qib_tid_sm { ++ __u16 tid; ++ __u16 offset; ++ __u16 length; ++}; ++ ++/* ++ * SDMA transfer descriptor. This structure communicates the SDMA ++ * transfers from the MIC to the host. It is very important for ++ * performance reasons that its size is multiple of 64B in order ++ * to guarantee proper alignment in the descriptor array. ++ */ ++struct qib_knx_sdma_desc { ++ u16 ctxt; ++ u16 subctxt; ++ u32 pbclen; ++ __le32 pbc[16]; ++ u64 length; ++ u32 npages; ++ unsigned tidlen; ++ off_t offset; ++ unsigned long pages[QIB_SDMA_MAX_NPAGES]; ++ /* This array is 198B so the compiler will pad ++ * it by 2B to make it multiple of 8B. */ ++ struct qib_tid_sm tidsm[QIB_SDMA_MAX_NPAGES]; ++ /* ++ * The two paddings below are included in order to ++ * make the size of the entire struct 576B (multiple ++ * of 64B). The goal is that all elements in an array ++ * of struct qib_knx_sdma_desc are 64B aligned. ++ */ ++ u16 __padding0; ++ u64 __padding1[2]; ++}; ++ ++/* ++ * trigger, status, and complete fields are by 8 to be ++ * cacheline size. ++ */ ++struct qib_knx_sdma_hflags { ++ u64 trigger; ++ u64 __padding[7]; ++}; ++ ++#define sdma_next(s, fld) \ ++ ((s)->fld = (((s)->fld + 1) == (s)->desc_num) ? 0 : ((s)->fld + 1)) ++ ++struct qib_knx_sdma_mflags { ++ u64 status; ++ u64 __padding1[7]; ++ u64 complete; ++ u64 __padding2[7]; ++}; ++ ++struct qib_knx_tid_info { ++ /* this is the entire set of 512 entries (= 4K) so ++ * we can resgister. subctxt devision will be done ++ * in MIC driver. */ ++ off_t tidbase_offset; ++ size_t tidbase_len; ++ u64 tidbase; ++ unsigned tidcnt; ++ u64 tidtemplate; ++ unsigned long invalidtid; ++ u64 bar_addr; ++ u64 bar_len; ++}; ++ ++#endif /* _QIB_KNX_COMMON_H */ +diff -urN a9/drivers/infiniband/hw/qib/qib_knx.h a10/drivers/infiniband/hw/qib/qib_knx.h +--- a9/drivers/infiniband/hw/qib/qib_knx.h 1969-12-31 16:00:00.000000000 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_knx.h 2015-01-05 15:10:58.252446692 -0800 +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. ++ * ++ * This software is available to you under a choice of one of two ++ * licenses. You may choose to be licensed under the terms of the GNU ++ * General Public License (GPL) Version 2, available from the file ++ * COPYING in the main directory of this source tree, or the ++ * OpenIB.org BSD license below: ++ * ++ * Redistribution and use in source and binary forms, with or ++ * without modification, are permitted provided that the following ++ * conditions are met: ++ * ++ * - Redistributions of source code must retain the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer. ++ * ++ * - Redistributions in binary form must reproduce the above ++ * copyright notice, this list of conditions and the following ++ * disclaimer in the documentation and/or other materials ++ * provided with the distribution. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND ++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS ++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN ++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE ++ * SOFTWARE. ++ */ ++#ifndef _QIB_KNX_H ++#define _QIB_KNX_H ++ ++#include "qib.h" ++ ++enum qib_knx_ctxtinfo_type { ++ QIB_KNX_CTXTINFO_UREG, ++ QIB_KNX_CTXTINFO_PIOAVAIL, ++ QIB_KNX_CTXTINFO_STATUS, ++ QIB_KNX_CTXTINFO_PIOBUFBASE, ++ QIB_KNX_CTXTINFO_FLAGS ++}; ++ ++#ifdef QIB_CONFIG_KNX ++int __init qib_knx_server_init(void); ++void __exit qib_knx_server_exit(void); ++ ++void qib_knx_remove_device(struct qib_devdata *); ++ ++inline struct qib_knx *qib_knx_get(uint16_t); ++inline struct qib_devdata *qib_knx_node_to_dd(uint16_t); ++int qib_knx_alloc_ctxt(u16, unsigned); ++int qib_knx_setup_piobufs(struct qib_devdata *, struct qib_ctxtdata *, __u16); ++int qib_knx_setup_pioregs(struct qib_devdata *, struct qib_ctxtdata *, ++ struct qib_base_info *); ++int qib_knx_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *, ++ struct qib_base_info *); ++int qib_knx_setup_eagerbufs(struct qib_ctxtdata *, struct qib_base_info *); ++void qib_knx_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *); ++__u64 qib_knx_ctxt_info(struct qib_ctxtdata *, enum qib_knx_ctxtinfo_type, ++ struct file *); ++int qib_knx_sdma_queue_create(struct file *); ++void qib_knx_sdma_queue_destroy(struct qib_filedata *); ++#else ++static inline u64 qib_knx_ctxt_info( ++ struct qib_ctxtdata *rcd, ++ enum qib_knx_ctxtinfo_type type, ++ struct file *fp) ++{ ++ return 0; ++} ++#endif ++#endif /* _QIB_KNX_H */ +diff -urN a9/drivers/infiniband/hw/qib/qib_user_sdma.c a10/drivers/infiniband/hw/qib/qib_user_sdma.c +--- a9/drivers/infiniband/hw/qib/qib_user_sdma.c 2015-01-05 15:05:04.279461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_user_sdma.c 2015-01-05 15:10:58.252446692 -0800 +@@ -63,80 +63,6 @@ + pid_t pid; + }; + +-struct qib_user_sdma_pkt { +- struct list_head list; /* list element */ +- +- u8 tiddma; /* if this is NEW tid-sdma */ +- u8 largepkt; /* this is large pkt from kmalloc */ +- u16 frag_size; /* frag size used by PSM */ +- u16 index; /* last header index or push index */ +- u16 naddr; /* dimension of addr (1..3) ... */ +- u16 addrlimit; /* addr array size */ +- u16 tidsmidx; /* current tidsm index */ +- u16 tidsmcount; /* tidsm array item count */ +- u16 payload_size; /* payload size so far for header */ +- u32 bytes_togo; /* bytes for processing */ +- u32 counter; /* sdma pkts queued counter for this entry */ +- struct qib_tid_session_member *tidsm; /* tid session member array */ +- struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ +- u64 added; /* global descq number of entries */ +- +- struct { +- u16 offset; /* offset for kvaddr, addr */ +- u16 length; /* length in page */ +- u16 first_desc; /* first desc */ +- u16 last_desc; /* last desc */ +- u16 put_page; /* should we put_page? */ +- u16 dma_mapped; /* is page dma_mapped? 
*/ +- u16 dma_length; /* for dma_unmap_page() */ +- u16 padding; +- struct page *page; /* may be NULL (coherent mem) */ +- void *kvaddr; /* FIXME: only for pio hack */ +- dma_addr_t addr; +- } addr[4]; /* max pages, any more and we coalesce */ +-}; +- +-struct qib_user_sdma_queue { +- /* +- * pkts sent to dma engine are queued on this +- * list head. the type of the elements of this +- * list are struct qib_user_sdma_pkt... +- */ +- struct list_head sent; +- +- /* +- * Because above list will be accessed by both process and +- * signal handler, we need a spinlock for it. +- */ +- spinlock_t sent_lock ____cacheline_aligned_in_smp; +- +- /* headers with expected length are allocated from here... */ +- char header_cache_name[64]; +- struct dma_pool *header_cache; +- +- /* packets are allocated from the slab cache... */ +- char pkt_slab_name[64]; +- struct kmem_cache *pkt_slab; +- +- /* as packets go on the queued queue, they are counted... */ +- u32 counter; +- u32 sent_counter; +- /* pending packets, not sending yet */ +- u32 num_pending; +- /* sending packets, not complete yet */ +- u32 num_sending; +- /* global descq number of entry of last sending packet */ +- u64 added; +- +- /* dma page table */ +- struct rb_root dma_pages_root; +- +- struct qib_user_sdma_rb_node *sdma_rb_node; +- +- /* protect everything above... */ +- struct mutex lock; +-}; +- + static struct qib_user_sdma_rb_node * + qib_user_sdma_rb_search(struct rb_root *root, pid_t pid) + { +@@ -254,12 +180,12 @@ + return pq; + } + +-static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, +- int i, u16 offset, u16 len, +- u16 first_desc, u16 last_desc, +- u16 put_page, u16 dma_mapped, +- struct page *page, void *kvaddr, +- dma_addr_t dma_addr, u16 dma_length) ++void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, ++ int i, u16 offset, u16 len, ++ u16 first_desc, u16 last_desc, ++ u16 put_page, u16 dma_mapped, ++ struct page *page, void *kvaddr, ++ dma_addr_t dma_addr, u16 dma_length) + { + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; +@@ -273,7 +199,7 @@ + pkt->addr[i].dma_length = dma_length; + } + +-static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, ++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, + size_t len, dma_addr_t *dma_addr) + { + void *hdr; +@@ -295,11 +221,11 @@ + return hdr; + } + +-static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, +- struct qib_user_sdma_queue *pq, +- struct qib_user_sdma_pkt *pkt, +- struct page *page, u16 put, +- u16 offset, u16 len, void *kvaddr) ++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, ++ struct qib_user_sdma_queue *pq, ++ struct qib_user_sdma_pkt *pkt, ++ struct page *page, u16 put, ++ u16 offset, u16 len, void *kvaddr) + { + __le16 *pbc16; + void *pbcvaddr; +@@ -314,21 +240,27 @@ + int ret = 0; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { +- /* +- * dma mapping error, pkt has not managed +- * this page yet, return the page here so +- * the caller can ignore this page. +- */ +- if (put) { +- put_page(page); +- } else { +- /* coalesce case */ +- kunmap(page); +- __free_page(page); ++#ifdef QIB_CONFIG_KNX ++ if (!pkt->remote) { ++#endif ++ /* ++ * dma mapping error, pkt has not managed ++ * this page yet, return the page here so ++ * the caller can ignore this page. 
++ */ ++ if (put) { ++ put_page(page); ++ } else { ++ /* coalesce case */ ++ kunmap(page); ++ __free_page(page); ++ } ++ ret = -ENOMEM; ++ goto done; + } +- ret = -ENOMEM; +- goto done; ++#ifdef QIB_CONFIG_KNX + } ++#endif + offset = 0; + dma_mapped = 1; + +@@ -630,13 +562,19 @@ + pkt->addr[i].dma_length, + DMA_TO_DEVICE); + +- if (pkt->addr[i].kvaddr) +- kunmap(pkt->addr[i].page); ++#ifdef QIB_CONFIG_KNX ++ if (!pkt->remote) { ++#endif ++ if (pkt->addr[i].kvaddr) ++ kunmap(pkt->addr[i].page); + +- if (pkt->addr[i].put_page) +- put_page(pkt->addr[i].page); +- else +- __free_page(pkt->addr[i].page); ++ if (pkt->addr[i].put_page) ++ put_page(pkt->addr[i].page); ++ else ++ __free_page(pkt->addr[i].page); ++#ifdef QIB_CONFIG_KNX ++ } ++#endif + } else if (pkt->addr[i].kvaddr) { + /* for headers */ + if (pkt->addr[i].dma_mapped) { +@@ -775,9 +713,9 @@ + } + + /* free a packet list -- return counter value of last packet */ +-static void qib_user_sdma_free_pkt_list(struct device *dev, +- struct qib_user_sdma_queue *pq, +- struct list_head *list) ++void qib_user_sdma_free_pkt_list(struct device *dev, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *list) + { + struct qib_user_sdma_pkt *pkt, *pkt_next; + +@@ -787,6 +725,10 @@ + for (i = 0; i < pkt->naddr; i++) + qib_user_sdma_free_pkt_frag(dev, pq, pkt, i); + ++#ifdef QIB_CONFIG_KNX ++ if (pkt->remote) ++ qib_knx_sdma_free_pkt(pkt); ++#endif + if (pkt->largepkt) + kfree(pkt); + else +@@ -970,6 +912,9 @@ + pkt->payload_size = 0; + pkt->counter = counter; + pkt->tiddma = tiddma; ++#ifdef QIB_CONFIG_KNX ++ pkt->remote = 0; ++#endif + + /* setup the first header */ + qib_user_sdma_init_frag(pkt, 0, /* index */ +@@ -1045,8 +990,8 @@ + } + + /* try to clean out queue -- needs pq->lock */ +-static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, +- struct qib_user_sdma_queue *pq) ++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq) + { + struct qib_devdata *dd = ppd->dd; + struct list_head free_list; +@@ -1110,7 +1055,7 @@ + } + + /* clean descriptor queue, returns > 0 if some elements cleaned */ +-static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) ++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd) + { + int ret; + unsigned long flags; +@@ -1321,9 +1266,9 @@ + } + + /* pq->lock must be held, get packets on the wire... */ +-static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, +- struct qib_user_sdma_queue *pq, +- struct list_head *pktlist, int count) ++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *pktlist, int count) + { + unsigned long flags; + +diff -urN a9/drivers/infiniband/hw/qib/qib_user_sdma.h a10/drivers/infiniband/hw/qib/qib_user_sdma.h +--- a9/drivers/infiniband/hw/qib/qib_user_sdma.h 2015-01-05 15:05:04.280461602 -0800 ++++ a10/drivers/infiniband/hw/qib/qib_user_sdma.h 2015-01-05 15:10:58.253446692 -0800 +@@ -31,12 +31,108 @@ + */ + #include + +-struct qib_user_sdma_queue; ++struct qib_user_sdma_pkt { ++ struct list_head list; /* list element */ ++ ++ u8 tiddma; /* if this is NEW tid-sdma */ ++ u8 largepkt; /* this is large pkt from kmalloc */ ++ u16 frag_size; /* frag size used by PSM */ ++ u16 index; /* last header index or push index */ ++ u16 naddr; /* dimension of addr (1..3) ... 
*/ ++ u16 addrlimit; /* addr array size */ ++ u16 tidsmidx; /* current tidsm index */ ++ u16 tidsmcount; /* tidsm array item count */ ++ u16 payload_size; /* payload size so far for header */ ++ u32 bytes_togo; /* bytes for processing */ ++ u32 counter; /* sdma pkts queued counter for this entry */ ++ struct qib_tid_session_member *tidsm; /* tid session member array */ ++ struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */ ++ u64 added; /* global descq number of entries */ ++#ifdef QIB_CONFIG_KNX ++ u64 remote; /* does the packet originate on the host */ ++#endif ++ ++ struct { ++ u16 offset; /* offset for kvaddr, addr */ ++ u16 length; /* length in page */ ++ u16 first_desc; /* first desc */ ++ u16 last_desc; /* last desc */ ++ u16 put_page; /* should we put_page? */ ++ u16 dma_mapped; /* is page dma_mapped? */ ++ u16 dma_length; /* for dma_unmap_page() */ ++ u16 padding; ++ struct page *page; /* may be NULL (coherent mem) */ ++ void *kvaddr; /* FIXME: only for pio hack */ ++ dma_addr_t addr; ++ } addr[4]; /* max pages, any more and we coalesce */ ++}; ++ ++struct qib_user_sdma_queue { ++ /* ++ * pkts sent to dma engine are queued on this ++ * list head. the type of the elements of this ++ * list are struct qib_user_sdma_pkt... ++ */ ++ struct list_head sent; ++ ++ /* ++ * Because above list will be accessed by both process and ++ * signal handler, we need a spinlock for it. ++ */ ++ spinlock_t sent_lock ____cacheline_aligned_in_smp; ++ ++ /* headers with expected length are allocated from here... */ ++ char header_cache_name[64]; ++ struct dma_pool *header_cache; ++ ++ /* packets are allocated from the slab cache... */ ++ char pkt_slab_name[64]; ++ struct kmem_cache *pkt_slab; ++ ++ /* as packets go on the queued queue, they are counted... */ ++ u32 counter; ++ u32 sent_counter; ++ /* pending packets, not sending yet */ ++ u32 num_pending; ++ /* sending packets, not complete yet */ ++ u32 num_sending; ++ /* global descq number of entry of last sending packet */ ++ u64 added; ++ ++ /* dma page table */ ++ struct rb_root dma_pages_root; ++ ++ struct qib_user_sdma_rb_node *sdma_rb_node; ++ ++ /* protect everything above... 
*/ ++ struct mutex lock; ++}; + + struct qib_user_sdma_queue * + qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); + void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq); +- ++void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq, ++ size_t len, dma_addr_t *dma_addr); ++void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt, ++ int i, u16 offset, u16 len, ++ u16 first_desc, u16 last_desc, ++ u16 put_page, u16 dma_mapped, ++ struct page *page, void *kvaddr, ++ dma_addr_t dma_addr, u16 dma_length); ++int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, ++ struct qib_user_sdma_queue *pq, ++ struct qib_user_sdma_pkt *pkt, ++ struct page *page, u16 put, ++ u16 offset, u16 len, void *kvaddr); ++int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd); ++int qib_user_sdma_queue_clean(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq); ++void qib_user_sdma_free_pkt_list(struct device *dev, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *list); ++int qib_user_sdma_push_pkts(struct qib_pportdata *ppd, ++ struct qib_user_sdma_queue *pq, ++ struct list_head *pktlist, int count); + int qib_user_sdma_writev(struct qib_ctxtdata *pd, + struct qib_user_sdma_queue *pq, + const struct iovec *iov, +@@ -50,3 +146,9 @@ + + u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq); + u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq); ++ ++/* ++ * This function prototype somewhat polutes this header file ++ * but I don't want to create a new header file just for it. ++ */ ++void qib_knx_sdma_free_pkt(struct qib_user_sdma_pkt *pkt); diff --git a/tech-preview/xeon-phi/0011-correct-ib_addr.h-for-older-kernels.patch b/tech-preview/xeon-phi/0011-correct-ib_addr.h-for-older-kernels.patch new file mode 100644 index 0000000..bce5ecd --- /dev/null +++ b/tech-preview/xeon-phi/0011-correct-ib_addr.h-for-older-kernels.patch @@ -0,0 +1,46 @@ +From 536a8d5b5c68ecd2ca73446f25443fe8bb234a46 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Thu, 29 May 2014 14:35:13 -0700 +Subject: [PATCH 11/12] correct ib_addr.h for older kernels + +Signed-off-by: Phil Cayton +--- +diff -urN a10/include/rdma/ib_addr.h a11/include/rdma/ib_addr.h +--- a10/include/rdma/ib_addr.h 2015-01-05 15:10:42.263447365 -0800 ++++ a11/include/rdma/ib_addr.h 2015-01-05 15:12:36.058442572 -0800 +@@ -239,6 +239,27 @@ + return 0; + } + ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) ++static inline int iboe_get_rate(struct net_device *dev) ++{ ++ struct ethtool_cmd cmd; ++ ++ if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings || ++ dev->ethtool_ops->get_settings(dev, &cmd)) ++ return IB_RATE_PORT_CURRENT; ++ ++ if (cmd.speed >= 40000) ++ return IB_RATE_40_GBPS; ++ else if (cmd.speed >= 30000) ++ return IB_RATE_30_GBPS; ++ else if (cmd.speed >= 20000) ++ return IB_RATE_20_GBPS; ++ else if (cmd.speed >= 10000) ++ return IB_RATE_10_GBPS; ++ else ++ return IB_RATE_PORT_CURRENT; ++} ++#else + static inline int iboe_get_rate(struct net_device *dev) + { + struct ethtool_cmd cmd; +@@ -263,6 +284,7 @@ + else + return IB_RATE_PORT_CURRENT; + } ++#endif + + static inline int rdma_link_local_addr(struct in6_addr *addr) + { diff --git a/tech-preview/xeon-phi/0012-add-mlx4-cq_comp-locking-already-done-in-event-handl.patch b/tech-preview/xeon-phi/0012-add-mlx4-cq_comp-locking-already-done-in-event-handl.patch new file mode 100644 index 0000000..498b1c8 --- /dev/null +++ b/tech-preview/xeon-phi/0012-add-mlx4-cq_comp-locking-already-done-in-event-handl.patch 
@@ -0,0 +1,39 @@ +From 6d88a748ca017a22c08d25e29144dd392c988eb9 Mon Sep 17 00:00:00 2001 +From: Phil Cayton +Date: Thu, 5 Jun 2014 09:44:42 -0700 +Subject: [PATCH 12/12] add mlx4 cq_comp locking already done in event handler + +--- +diff -urN a11/drivers/net/ethernet/mellanox/mlx4/cq.c a12/drivers/net/ethernet/mellanox/mlx4/cq.c +--- a11/drivers/net/ethernet/mellanox/mlx4/cq.c 2015-01-05 15:12:24.028443079 -0800 ++++ a12/drivers/net/ethernet/mellanox/mlx4/cq.c 2015-01-05 15:14:27.994437857 -0800 +@@ -54,10 +54,17 @@ + + void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) + { ++ struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table; + struct mlx4_cq *cq; + +- cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, +- cqn & (dev->caps.num_cqs - 1)); ++ spin_lock(&cq_table->lock); ++ ++ cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1)); ++ if (cq) ++ atomic_inc(&cq->refcount); ++ ++ spin_unlock(&cq_table->lock); ++ + if (!cq) { + mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); + return; +@@ -66,6 +73,9 @@ + ++cq->arm_sn; + + cq->comp(cq); ++ ++ if (atomic_dec_and_test(&cq->refcount)) ++ complete(&cq->free); + } + + void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
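
A note on the mlx4_cq_completion() change in patch 12/12 above: it is the usual lookup-plus-reference pattern. The radix-tree lookup and the refcount increment both happen under cq_table->lock, the completion callback then runs with the lock dropped, and the final reference drop (complete(&cq->free)) signals whoever is tearing the CQ down. Below is a minimal userspace C analogue of that pattern, not mlx4 driver code; every identifier in it (cq_get, cq_put, cq_destroy, cq_table, ...) is invented for illustration.

/*
 * Minimal userspace analogue of the pattern above -- NOT mlx4 driver code.
 * All identifiers (cq_get, cq_put, cq_destroy, cq_table, ...) are invented.
 * Build with: cc -std=c11 -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct cq {
	atomic_int refcount;
	void (*comp)(struct cq *cq);	/* completion callback */
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cq *cq_table[1];		/* stand-in for the radix tree */

/* Look up a CQ and take a reference while still holding the table lock,
 * so a concurrent cq_destroy() cannot free it between lookup and use. */
static struct cq *cq_get(unsigned int cqn)
{
	struct cq *cq;

	pthread_mutex_lock(&table_lock);
	cq = cq_table[cqn];
	if (cq)
		atomic_fetch_add(&cq->refcount, 1);
	pthread_mutex_unlock(&table_lock);
	return cq;
}

/* Drop a reference; the last put frees the object.  In the kernel patch
 * the last put is complete(&cq->free), which wakes up the destroy path. */
static void cq_put(struct cq *cq)
{
	if (atomic_fetch_sub(&cq->refcount, 1) == 1) {
		printf("cq %p freed\n", (void *)cq);
		free(cq);
	}
}

static void cq_completion(unsigned int cqn)
{
	struct cq *cq = cq_get(cqn);

	if (!cq) {
		fprintf(stderr, "completion event for bogus CQ %u\n", cqn);
		return;
	}
	cq->comp(cq);			/* callback runs without the lock held */
	cq_put(cq);
}

static void cq_destroy(unsigned int cqn)
{
	struct cq *cq;

	pthread_mutex_lock(&table_lock);
	cq = cq_table[cqn];
	cq_table[cqn] = NULL;
	pthread_mutex_unlock(&table_lock);

	if (cq)
		cq_put(cq);		/* drop the table's own reference */
}

static void handler(struct cq *cq)
{
	printf("completion handled for cq %p\n", (void *)cq);
}

int main(void)
{
	struct cq *cq = calloc(1, sizeof(*cq));

	if (!cq)
		return 1;
	atomic_init(&cq->refcount, 1);	/* reference owned by the table */
	cq->comp = handler;
	cq_table[0] = cq;

	cq_completion(0);
	cq_destroy(0);
	return 0;
}

The reason the reference has to be taken inside the locked region is that cq_destroy() may clear the table entry and drop the table's reference at any moment; once cq_get() has bumped the count under the lock, the object is guaranteed to survive the unlocked callback.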
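
Similarly, the qib_knx.h header added earlier in this series uses the common compile-time stub idiom: real prototypes when QIB_CONFIG_KNX is defined, and trivial static inline stand-ins otherwise (for example qib_knx_ctxt_info() returning 0), so callers compile unchanged whether or not the coprocessor support is built in. A standalone sketch of the idiom, with invented names (WITH_FEATURE, feature_query), just to show the shape:

/*
 * Invented names; a standalone sketch of the compile-time stub idiom used
 * by qib_knx.h.  Build with -DWITH_FEATURE for the real implementation,
 * without it for the inline no-op stub.
 */
#include <stdio.h>

#ifdef WITH_FEATURE
int feature_query(int id);			/* real version, defined below */
#else
static inline int feature_query(int id)		/* disabled build: no-op stub */
{
	(void)id;
	return 0;
}
#endif

#ifdef WITH_FEATURE
int feature_query(int id)
{
	return id * 2;				/* pretend hardware answer */
}
#endif

int main(void)
{
	/* Callers never need #ifdef around the call itself. */
	printf("feature_query(21) = %d\n", feature_query(21));
	return 0;
}

In a disabled build the compiler inlines the stub and removes the call entirely, which is presumably what the qib_knx_ctxt_info() stub achieves for its callers in the qib driver.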