git.openfabrics.org - ~emulex/for-vlad/old/compat-rdma.git/commitdiff
NFSoRDMA: fixes for 3.12 and RHEL7, RHEL6.5, SLES11SP3 backports
author Jeffrey C. Becker <Jeffrey.C.Becker@nasa.gov>
Sat, 16 Aug 2014 00:10:44 +0000 (17:10 -0700)
committer Jeffrey C. Becker <Jeffrey.C.Becker@nasa.gov>
Sat, 16 Aug 2014 00:10:44 +0000 (17:10 -0700)
Signed-off-by: Jeff Becker <Jeffrey.C.Becker@nasa.gov>
37 files changed:
linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch [new file with mode: 0644]
linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch [new file with mode: 0644]
linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch [new file with mode: 0644]
linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch [new file with mode: 0644]
linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch [new file with mode: 0644]
linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch [new file with mode: 0644]
linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch [new file with mode: 0644]
linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch [new file with mode: 0644]
linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch [new file with mode: 0644]
linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch [new file with mode: 0644]
linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch [new file with mode: 0644]
linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch [new file with mode: 0644]
linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch [new file with mode: 0644]
linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch [new file with mode: 0644]
linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch [new file with mode: 0644]
linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch [new file with mode: 0644]
linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch [new file with mode: 0644]
linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch [new file with mode: 0644]
linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch [new file with mode: 0644]
linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch [new file with mode: 0644]
linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch [new file with mode: 0644]
linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch [new file with mode: 0644]
linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch [new file with mode: 0644]
linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch [new file with mode: 0644]
linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch [new file with mode: 0644]
linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch [new file with mode: 0644]
linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch [new file with mode: 0644]
linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch [new file with mode: 0644]
linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch [new file with mode: 0644]
linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch [new file with mode: 0644]
linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch [new file with mode: 0644]
linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch [new file with mode: 0644]
linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch [new file with mode: 0644]
linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch [deleted file]
linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch [deleted file]
patches/0023-nfsrdma-Backport-for-rhel6.5.patch
patches/0026-nfsrdma-Backport-for-sles11sp3.patch

diff --git a/linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch b/linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch
new file mode 100644 (file)
index 0000000..20a476d
--- /dev/null
@@ -0,0 +1,51 @@
+From 2b7bbc963da8d076f263574af4138b5df2e1581f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 12 Mar 2014 12:51:30 -0400
+Subject: [PATCH 101/132] SUNRPC: Fix large reads on NFS/RDMA
+
+After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves
+in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a
+few hundred bytes via NFS/RDMA no longer work.  This commit exposed
+a long-standing bug in rpcrdma_inline_fixup().
+
+I reproduce this with an rsize=4096 mount using the cthon04 basic
+tests.  Test 5 fails with an EIO error.
+
+For my reproducer, kernel log shows:
+
+  NFS: server cheating in read reply: count 4096 > recvd 0
+
+rpcrdma_inline_fixup() is zeroing the xdr_stream::page_len field,
+and xdr_align_pages() is now returning that value to the READ XDR
+decoder function.
+
+That field is set up by xdr_inline_pages() by the READ XDR encoder
+function.  As far as I can tell, it is supposed to be left alone
+after that, as it describes the dimensions of the reply xdr_stream,
+not the contents of that stream.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c |    4 +---
+ 1 files changed, 1 insertions(+), 3 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index e03725b..96ead52 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+                               break;
+                       page_base = 0;
+               }
+-              rqst->rq_rcv_buf.page_len = olen - copy_len;
+-      } else
+-              rqst->rq_rcv_buf.page_len = 0;
++      }
+       if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+               curlen = copy_len;
+-- 
+1.7.1
+
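Context for the fix above (sketch, not part of the patch): rq_rcv_buf.page_len is established once by the READ XDR encoder and describes the reply's page vector. Assuming the mainline xdr_inline_pages() helper, the encoder-side setup looks roughly like:

    /* READ encoder side, simplified: reserve the page vector for the
     * reply and record its dimensions; page_len is set here and is not
     * meant to be rewritten by the transport when the reply arrives.
     * hdrlen stands in for the size of the non-page reply header. */
    xdr_inline_pages(&req->rq_rcv_buf, hdrlen,
                     args->pages, args->pgbase, args->count);

With the two removed lines, rpcrdma_inline_fixup() no longer clobbers that value, so xdr_align_pages() reports the correct length to the READ decoder.
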
diff --git a/linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch b/linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch
new file mode 100644 (file)
index 0000000..54a2345
--- /dev/null
@@ -0,0 +1,49 @@
+From 3a0799a94c0384a3b275a73267aaa10517b1bf7d Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 12 Mar 2014 12:51:39 -0400
+Subject: [PATCH 102/132] SUNRPC: remove KERN_INFO from dprintk() call sites
+
+The use of KERN_INFO causes garbage characters to appear when
+debugging is enabled.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+---
+ net/sunrpc/xprtrdma/transport.c |   10 +++++-----
+ 1 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 285dc08..1eb9c46 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void)
+ {
+       int rc;
+-      dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
++      dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+ #ifdef RPC_DEBUG
+       if (sunrpc_table_header) {
+               unregister_sysctl_table(sunrpc_table_header);
+@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void)
+       if (rc)
+               return rc;
+-      dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
++      dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
+-      dprintk(KERN_INFO "Defaults:\n");
+-      dprintk(KERN_INFO "\tSlots %d\n"
++      dprintk("Defaults:\n");
++      dprintk("\tSlots %d\n"
+               "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+               xprt_rdma_slot_table_entries,
+               xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+-      dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
++      dprintk("\tPadding %d\n\tMemreg %d\n",
+               xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+ #ifdef RPC_DEBUG
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch b/linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch
new file mode 100644 (file)
index 0000000..9efe81c
--- /dev/null
@@ -0,0 +1,30 @@
+From c42a01eee74dfd9ba8f8abb7cb81dd9a8839dc7b Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@redhat.com>
+Date: Mon, 10 Mar 2014 11:33:48 -0400
+Subject: [PATCH 103/132] svcrdma: fix printk when memory allocation fails
+
+It retries in 1s, not 1000 jiffies.
+
+Signed-off-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_transport.c |    3 +--
+ 1 files changed, 1 insertions(+), 2 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+index 62e4f9b..25688fa 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void)
+       while ((page = alloc_page(GFP_KERNEL)) == NULL) {
+               /* If we can't get memory, wait a bit and try again */
+-              printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
+-                     "jiffies.\n");
++              printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
+               schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+       }
+       return page;
+-- 
+1.7.1
+
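Context (illustrative, not part of the patch): the delay really is one second because it is specified in milliseconds and converted, so the old message was only accurate on HZ=1000 kernels:

    /* 1000 jiffies is one second only when HZ=1000; converting from
     * milliseconds gives one second at any HZ. */
    schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); /* ~1 s */
    schedule_timeout_uninterruptible(1000);  /* 1000 jiffies: 10 s at HZ=100 */
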
diff --git a/linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch b/linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch
new file mode 100644 (file)
index 0000000..172f9a2
--- /dev/null
@@ -0,0 +1,74 @@
+From 7e4359e2611f95a97037e2b6905eab52f28afbeb Mon Sep 17 00:00:00 2001
+From: Tom Tucker <tom@ogc.us>
+Date: Tue, 25 Mar 2014 15:14:57 -0500
+Subject: [PATCH 104/132] Fix regression in NFSRDMA server
+
+The server regression was caused by the addition of rq_next_page
+(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). There were a few places that
+were missed with the update of the rq_respages array.
+
+Signed-off-by: Tom Tucker <tom@ogc.us>
+Tested-by: Steve Wise <swise@ogc.us>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |   12 ++++--------
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c   |    1 +
+ 2 files changed, 5 insertions(+), 8 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 0ce7552..8d904e4 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+               sge_no++;
+       }
+       rqstp->rq_respages = &rqstp->rq_pages[sge_no];
++      rqstp->rq_next_page = rqstp->rq_respages + 1;
+       /* We should never run out of SGE because the limit is defined to
+        * support the max allowed RPC data length
+@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt,
+                */
+               head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+               rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
++              rqstp->rq_next_page = rqstp->rq_respages + 1;
+               byte_count -= sge_bytes;
+               ch_bytes -= sge_bytes;
+@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+       /* rq_respages points one past arg pages */
+       rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
++      rqstp->rq_next_page = rqstp->rq_respages + 1;
+       /* Create the reply and chunk maps */
+       offset = 0;
+@@ -520,13 +523,6 @@ next_sge:
+       for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+               rqstp->rq_pages[ch_no] = NULL;
+-      /*
+-       * Detach res pages. If svc_release sees any it will attempt to
+-       * put them.
+-       */
+-      while (rqstp->rq_next_page != rqstp->rq_respages)
+-              *(--rqstp->rq_next_page) = NULL;
+-
+       return err;
+ }
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index c1d124d..11e90f8 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -625,6 +625,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
+               if (page_no+1 >= sge_no)
+                       ctxt->sge[page_no+1].length = 0;
+       }
++      rqstp->rq_next_page = rqstp->rq_respages + 1;
+       BUG_ON(sge_no > rdma->sc_max_sge);
+       memset(&send_wr, 0, sizeof send_wr);
+       ctxt->wr_op = IB_WR_SEND;
+-- 
+1.7.1
+
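Context (sketch, not part of the patch): since commit afc59400d6c6 the generic server code walks reply pages with rq_next_page, so every site that repositions rq_respages must also maintain the invariant below; the hunks above restore it at each site the original conversion missed:

    /* rq_respages points one past the RPC argument pages; rq_next_page
     * must track it, otherwise svc_release puts or keeps the wrong
     * reply pages. */
    rqstp->rq_respages = &rqstp->rq_pages[sge_no];
    rqstp->rq_next_page = rqstp->rq_respages + 1;
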
diff --git a/linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch b/linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch
new file mode 100644 (file)
index 0000000..8e3e81f
--- /dev/null
@@ -0,0 +1,33 @@
+From 3cbe01a94c7b369f943f8a9d40394198d757cdd4 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@redhat.com>
+Date: Mon, 17 Mar 2014 13:10:05 -0400
+Subject: [PATCH 106/132] svcrdma: fix offset calculation for non-page aligned sge entries
+
+The xdr_off value in dma_map_xdr gets passed to ib_dma_map_page as the
+offset into the page to be mapped. This calculation does not correctly
+take into account the case where the data starts at some offset into
+the page. Increment the xdr_off by the page_base to ensure that it is
+respected.
+
+Cc: Tom Tucker <tom@opengridcomputing.com>
+Signed-off-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c |    1 +
+ 1 files changed, 1 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 11e90f8..7e024a5 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+               xdr_off -= xdr->head[0].iov_len;
+               if (xdr_off < xdr->page_len) {
+                       /* This offset is in the page list */
++                      xdr_off += xdr->page_base;
+                       page = xdr->pages[xdr_off >> PAGE_SHIFT];
+                       xdr_off &= ~PAGE_MASK;
+               } else {
+-- 
+1.7.1
+
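A worked example for the one-line fix above (illustrative numbers): with 4 KiB pages, xdr->page_base = 0x800 and xdr_off = 0x900 after subtracting the head:

    xdr_off += xdr->page_base;                /* 0x900 + 0x800 = 0x1100      */
    page = xdr->pages[xdr_off >> PAGE_SHIFT]; /* pages[1], not pages[0]      */
    xdr_off &= ~PAGE_MASK;                    /* offset 0x100 into that page */
    /* without the fix, ib_dma_map_page() would be handed pages[0] at
     * offset 0x900 -- the wrong data for a non-page-aligned sge entry. */
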
diff --git a/linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch b/linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch
new file mode 100644 (file)
index 0000000..11d70f2
--- /dev/null
@@ -0,0 +1,12 @@
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cc1445d..f1cd3d3 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -53,6 +53,7 @@
+ #define RDMA_RESOLVE_TIMEOUT  (5000)  /* 5 seconds */
+ #define RDMA_CONNECT_RETRY_MAX        (2)     /* retries if no listener backlog */
++#define RPC_CWNDSHIFT         (8U)    /* backported from linux/sunrpc/xprt.h */
+ /*
+  * Interface Adapter -- one per transport instance
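Note: this small addition is a backport shim rather than an upstream cherry-pick; per its comment the value is copied from linux/sunrpc/xprt.h so that the later credit-window patches build on kernels whose headers do not expose it. It is presumably consumed along these lines (sketch, assumed usage):

    /* one RPC credit corresponds to one congestion-window unit, i.e. the
     * granted credit count scaled by 2^RPC_CWNDSHIFT */
    xprt->cwnd = credits << RPC_CWNDSHIFT;
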
diff --git a/linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch b/linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch
new file mode 100644 (file)
index 0000000..935efdb
--- /dev/null
@@ -0,0 +1,146 @@
+From 0fc6c4e7bb287148eb5e949efd89327929d4841d Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Wed, 28 May 2014 10:32:00 -0400
+Subject: [PATCH 108/132] xprtrdma: mind the device's max fast register page list depth
+
+Some rdma devices don't support a fast register page list depth of
+at least RPCRDMA_MAX_DATA_SEGS.  So xprtrdma needs to chunk its fast
+register regions according to the minimum of the device max supported
+depth or RPCRDMA_MAX_DATA_SEGS.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |    4 ---
+ net/sunrpc/xprtrdma/verbs.c     |   47 +++++++++++++++++++++++++++++----------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
+ 3 files changed, 36 insertions(+), 16 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 96ead52..400aa1b 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+       /* success. all failures return above */
+       req->rl_nchunks = nchunks;
+-      BUG_ON(nchunks == 0);
+-      BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+-             && (nchunks > 3));
+-
+       /*
+        * finish off header. If write, marshal discrim and nchunks.
+        */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 9372656..55fb09a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+                               __func__);
+                       memreg = RPCRDMA_REGISTER;
+ #endif
++              } else {
++                      /* Mind the ia limit on FRMR page list depth */
++                      ia->ri_max_frmr_depth = min_t(unsigned int,
++                              RPCRDMA_MAX_DATA_SEGS,
++                              devattr.max_fast_reg_page_list_len);
+               }
+               break;
+       }
+@@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       ep->rep_attr.srq = NULL;
+       ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+       switch (ia->ri_memreg_strategy) {
+-      case RPCRDMA_FRMR:
++      case RPCRDMA_FRMR: {
++              int depth = 7;
++
+               /* Add room for frmr register and invalidate WRs.
+                * 1. FRMR reg WR for head
+                * 2. FRMR invalidate WR for head
+-               * 3. FRMR reg WR for pagelist
+-               * 4. FRMR invalidate WR for pagelist
++               * 3. N FRMR reg WRs for pagelist
++               * 4. N FRMR invalidate WRs for pagelist
+                * 5. FRMR reg WR for tail
+                * 6. FRMR invalidate WR for tail
+                * 7. The RDMA_SEND WR
+                */
+-              ep->rep_attr.cap.max_send_wr *= 7;
++
++              /* Calculate N if the device max FRMR depth is smaller than
++               * RPCRDMA_MAX_DATA_SEGS.
++               */
++              if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
++                      int delta = RPCRDMA_MAX_DATA_SEGS -
++                                  ia->ri_max_frmr_depth;
++
++                      do {
++                              depth += 2; /* FRMR reg + invalidate */
++                              delta -= ia->ri_max_frmr_depth;
++                      } while (delta > 0);
++
++              }
++              ep->rep_attr.cap.max_send_wr *= depth;
+               if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+-                      cdata->max_requests = devattr.max_qp_wr / 7;
++                      cdata->max_requests = devattr.max_qp_wr / depth;
+                       if (!cdata->max_requests)
+                               return -EINVAL;
+-                      ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
++                      ep->rep_attr.cap.max_send_wr = cdata->max_requests *
++                                                     depth;
+               }
+               break;
++      }
+       case RPCRDMA_MEMWINDOWS_ASYNC:
+       case RPCRDMA_MEMWINDOWS:
+               /* Add room for mw_binds+unbinds - overkill! */
+@@ -1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+       case RPCRDMA_FRMR:
+               for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+                       r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+-                                                       RPCRDMA_MAX_SEGS);
++                                              ia->ri_max_frmr_depth);
+                       if (IS_ERR(r->r.frmr.fr_mr)) {
+                               rc = PTR_ERR(r->r.frmr.fr_mr);
+                               dprintk("RPC:       %s: ib_alloc_fast_reg_mr"
+                                       " failed %i\n", __func__, rc);
+                               goto out;
+                       }
+-                      r->r.frmr.fr_pgl =
+-                              ib_alloc_fast_reg_page_list(ia->ri_id->device,
+-                                                          RPCRDMA_MAX_SEGS);
++                      r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
++                                              ia->ri_id->device,
++                                              ia->ri_max_frmr_depth);
+                       if (IS_ERR(r->r.frmr.fr_pgl)) {
+                               rc = PTR_ERR(r->r.frmr.fr_pgl);
+                               dprintk("RPC:       %s: "
+@@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+       seg1->mr_offset -= pageoff;     /* start of page */
+       seg1->mr_len += pageoff;
+       len = -pageoff;
+-      if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+-              *nsegs = RPCRDMA_MAX_DATA_SEGS;
++      if (*nsegs > ia->ri_max_frmr_depth)
++              *nsegs = ia->ri_max_frmr_depth;
+       for (page_no = i = 0; i < *nsegs;) {
+               rpcrdma_map_one(ia, seg, writing);
+               pa = seg->mr_dma;
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cc1445d..98340a3 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -66,6 +66,7 @@ struct rpcrdma_ia {
+       struct completion       ri_done;
+       int                     ri_async_rc;
+       enum rpcrdma_memreg     ri_memreg_strategy;
++      unsigned int            ri_max_frmr_depth;
+ };
+ /*
+-- 
+1.7.1
+
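A worked example of the new send-queue sizing (illustrative numbers only): if RPCRDMA_MAX_DATA_SEGS is 64 and the device reports max_fast_reg_page_list_len = 16:

    /* the pagelist needs 4 FRMRs instead of 1, and each extra FRMR costs
     * one reg WR plus one invalidate WR:
     *
     *   delta = 64 - 16 = 48
     *   depth = 7 -> 9 (delta 32) -> 11 (delta 16) -> 13 (delta 0)
     *
     * max_send_wr is then scaled by 13 instead of the old fixed 7, and
     * cdata->max_requests shrinks if the QP cannot hold that many WRs. */
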
diff --git a/linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch b/linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch
new file mode 100644 (file)
index 0000000..5e01e25
--- /dev/null
@@ -0,0 +1,140 @@
+From 4034ba04231f554abb97ad8900a4c1af03f8e21d Mon Sep 17 00:00:00 2001
+From: Allen Andrews <allen.andrews@emulex.com>
+Date: Wed, 28 May 2014 10:32:09 -0400
+Subject: [PATCH 109/132] nfs-rdma: Fix for FMR leaks
+
+Two memory region leaks were found during testing:
+
+1. rpcrdma_buffer_create: While allocating RPCRDMA_FRMR's
+ib_alloc_fast_reg_mr is called and then ib_alloc_fast_reg_page_list is
+called.  If ib_alloc_fast_reg_page_list returns an error it bails out of
+the routine dropping the last ib_alloc_fast_reg_mr frmr region creating a
+memory leak.  Added code to dereg the last frmr if
+ib_alloc_fast_reg_page_list fails.
+
+2. rpcrdma_buffer_destroy: While cleaning up, the routine will only free
+the MR's on the rb_mws list if there are rb_send_bufs present.  However, in
+rpcrdma_buffer_create while the rb_mws list is being built if one of the MR
+allocation requests fail after some MR's have been allocated on the rb_mws
+list the routine never gets to create any rb_send_bufs but instead jumps to
+the rpcrdma_buffer_destroy routine which will never free the MR's on rb_mws
+list because the rb_send_bufs were never created.   This leaks all the MR's
+on the rb_mws list that were created prior to one of the MR allocations
+failing.
+
+Issue(2) was seen during testing. Our adapter had a finite number of MR's
+available and we created enough connections to where we saw an MR
+allocation failure on our Nth NFS connection request. After the kernel
+cleaned up the resources it had allocated for the Nth connection we noticed
+that FMR's had been leaked due to the coding error described above.
+
+Issue(1) was seen during a code review while debugging issue(2).
+
+Signed-off-by: Allen Andrews <allen.andrews@emulex.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   73 ++++++++++++++++++++++--------------------
+ 1 files changed, 38 insertions(+), 35 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 55fb09a..8f9704e 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1081,6 +1081,8 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+                               dprintk("RPC:       %s: "
+                                       "ib_alloc_fast_reg_page_list "
+                                       "failed %i\n", __func__, rc);
++
++                              ib_dereg_mr(r->r.frmr.fr_mr);
+                               goto out;
+                       }
+                       list_add(&r->mw_list, &buf->rb_mws);
+@@ -1217,41 +1219,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+                       kfree(buf->rb_recv_bufs[i]);
+               }
+               if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+-                      while (!list_empty(&buf->rb_mws)) {
+-                              r = list_entry(buf->rb_mws.next,
+-                                      struct rpcrdma_mw, mw_list);
+-                              list_del(&r->mw_list);
+-                              switch (ia->ri_memreg_strategy) {
+-                              case RPCRDMA_FRMR:
+-                                      rc = ib_dereg_mr(r->r.frmr.fr_mr);
+-                                      if (rc)
+-                                              dprintk("RPC:       %s:"
+-                                                      " ib_dereg_mr"
+-                                                      " failed %i\n",
+-                                                      __func__, rc);
+-                                      ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+-                                      break;
+-                              case RPCRDMA_MTHCAFMR:
+-                                      rc = ib_dealloc_fmr(r->r.fmr);
+-                                      if (rc)
+-                                              dprintk("RPC:       %s:"
+-                                                      " ib_dealloc_fmr"
+-                                                      " failed %i\n",
+-                                                      __func__, rc);
+-                                      break;
+-                              case RPCRDMA_MEMWINDOWS_ASYNC:
+-                              case RPCRDMA_MEMWINDOWS:
+-                                      rc = ib_dealloc_mw(r->r.mw);
+-                                      if (rc)
+-                                              dprintk("RPC:       %s:"
+-                                                      " ib_dealloc_mw"
+-                                                      " failed %i\n",
+-                                                      __func__, rc);
+-                                      break;
+-                              default:
+-                                      break;
+-                              }
+-                      }
+                       rpcrdma_deregister_internal(ia,
+                                       buf->rb_send_bufs[i]->rl_handle,
+                                       &buf->rb_send_bufs[i]->rl_iov);
+@@ -1259,6 +1226,42 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+               }
+       }
++      while (!list_empty(&buf->rb_mws)) {
++              r = list_entry(buf->rb_mws.next,
++                      struct rpcrdma_mw, mw_list);
++              list_del(&r->mw_list);
++              switch (ia->ri_memreg_strategy) {
++              case RPCRDMA_FRMR:
++                      rc = ib_dereg_mr(r->r.frmr.fr_mr);
++                      if (rc)
++                              dprintk("RPC:       %s:"
++                                      " ib_dereg_mr"
++                                      " failed %i\n",
++                                      __func__, rc);
++                      ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
++                      break;
++              case RPCRDMA_MTHCAFMR:
++                      rc = ib_dealloc_fmr(r->r.fmr);
++                      if (rc)
++                              dprintk("RPC:       %s:"
++                                      " ib_dealloc_fmr"
++                                      " failed %i\n",
++                                      __func__, rc);
++                      break;
++              case RPCRDMA_MEMWINDOWS_ASYNC:
++              case RPCRDMA_MEMWINDOWS:
++                      rc = ib_dealloc_mw(r->r.mw);
++                      if (rc)
++                              dprintk("RPC:       %s:"
++                                      " ib_dealloc_mw"
++                                      " failed %i\n",
++                                      __func__, rc);
++                      break;
++              default:
++                      break;
++              }
++      }
++
+       kfree(buf->rb_pool);
+ }
+-- 
+1.7.1
+
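Context (sketch of the corrected allocation error path, condensed from the hunks above):

    r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, ia->ri_max_frmr_depth);
    if (IS_ERR(r->r.frmr.fr_mr))
            goto out;
    r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device,
                                                   ia->ri_max_frmr_depth);
    if (IS_ERR(r->r.frmr.fr_pgl)) {
            ib_dereg_mr(r->r.frmr.fr_mr);   /* fixes leak (1) */
            goto out;
    }
    /* leak (2) is fixed by draining rb_mws in rpcrdma_buffer_destroy()
     * unconditionally, not only when rb_send_bufs were allocated. */
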
diff --git a/linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch b/linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch
new file mode 100644 (file)
index 0000000..53ca2a6
--- /dev/null
@@ -0,0 +1,112 @@
+From 254f91e2fa1f4cc18fd2eb9d5481888ffe126d5b Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:17 -0400
+Subject: [PATCH 110/132] xprtrdma: RPC/RDMA must invoke xprt_wake_pending_tasks() in process context
+
+An IB provider can invoke rpcrdma_conn_func() in an IRQ context,
+thus rpcrdma_conn_func() cannot be allowed to directly invoke
+generic RPC functions like xprt_wake_pending_tasks().
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |   22 +++++++++++++++-------
+ net/sunrpc/xprtrdma/verbs.c     |    3 +++
+ net/sunrpc/xprtrdma/xprt_rdma.h |    3 +++
+ 3 files changed, 21 insertions(+), 7 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 400aa1b..c296468 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -676,15 +676,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+       rqst->rq_private_buf = rqst->rq_rcv_buf;
+ }
+-/*
+- * This function is called when an async event is posted to
+- * the connection which changes the connection state. All it
+- * does at this point is mark the connection up/down, the rpc
+- * timers do the rest.
+- */
+ void
+-rpcrdma_conn_func(struct rpcrdma_ep *ep)
++rpcrdma_connect_worker(struct work_struct *work)
+ {
++      struct rpcrdma_ep *ep =
++              container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+       struct rpc_xprt *xprt = ep->rep_xprt;
+       spin_lock_bh(&xprt->transport_lock);
+@@ -701,6 +697,18 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
+ }
+ /*
++ * This function is called when an async event is posted to
++ * the connection which changes the connection state. All it
++ * does at this point is mark the connection up/down, the rpc
++ * timers do the rest.
++ */
++void
++rpcrdma_conn_func(struct rpcrdma_ep *ep)
++{
++      schedule_delayed_work(&ep->rep_connect_worker, 0);
++}
++
++/*
+  * This function is called when memory window unbind which we are waiting
+  * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+  */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 8f9704e..9cb88f3 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -742,6 +742,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       INIT_CQCOUNT(ep);
+       ep->rep_ia = ia;
+       init_waitqueue_head(&ep->rep_connect_wait);
++      INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+       /*
+        * Create a single cq for receive dto and mw_bind (only ever
+@@ -817,6 +818,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+       dprintk("RPC:       %s: entering, connected is %d\n",
+               __func__, ep->rep_connected);
++      cancel_delayed_work_sync(&ep->rep_connect_worker);
++
+       if (ia->ri_id->qp) {
+               rc = rpcrdma_ep_disconnect(ep, ia);
+               if (rc)
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 98340a3..c620d13 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -43,6 +43,7 @@
+ #include <linux/wait.h>               /* wait_queue_head_t, etc */
+ #include <linux/spinlock.h>           /* spinlock_t, etc */
+ #include <linux/atomic.h>                     /* atomic_t, etc */
++#include <linux/workqueue.h>          /* struct work_struct */
+ #include <rdma/rdma_cm.h>             /* RDMA connection api */
+ #include <rdma/ib_verbs.h>            /* RDMA verbs api */
+@@ -87,6 +88,7 @@ struct rpcrdma_ep {
+       struct rpc_xprt         *rep_xprt;      /* for rep_func */
+       struct rdma_conn_param  rep_remote_cma;
+       struct sockaddr_storage rep_remote_addr;
++      struct delayed_work     rep_connect_worker;
+ };
+ #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+@@ -336,6 +338,7 @@ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+ /*
+  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+  */
++void rpcrdma_connect_worker(struct work_struct *);
+ void rpcrdma_conn_func(struct rpcrdma_ep *);
+ void rpcrdma_reply_handler(struct rpcrdma_rep *);
+-- 
+1.7.1
+
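Context (general pattern, simplified from the hunks above): the connection upcall may arrive in IRQ context, so it only queues work; taking transport_lock and waking pending RPC tasks happen in the worker, which runs in process context:

    void rpcrdma_conn_func(struct rpcrdma_ep *ep)
    {
            schedule_delayed_work(&ep->rep_connect_worker, 0);
    }

    void rpcrdma_connect_worker(struct work_struct *work)
    {
            struct rpcrdma_ep *ep =
                    container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
            /* safe here: spin_lock_bh(&xprt->transport_lock),
             * xprt_wake_pending_tasks(), etc. */
    }
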
diff --git a/linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch b/linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch
new file mode 100644 (file)
index 0000000..30307fd
--- /dev/null
@@ -0,0 +1,104 @@
+From 03ff8821eb5ed168792667cfc3ddff903e97af99 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:26 -0400
+Subject: [PATCH 111/132] xprtrdma: Remove BOUNCEBUFFERS memory registration mode
+
+Clean up: This memory registration mode is slow and was never
+meant for use in production environments. Remove it to reduce
+implementation complexity.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |   11 -----------
+ net/sunrpc/xprtrdma/transport.c |   13 -------------
+ net/sunrpc/xprtrdma/verbs.c     |    5 +----
+ 3 files changed, 1 insertions(+), 28 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index c296468..02b2941 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -77,9 +77,6 @@ static const char transfertypes[][12] = {
+  * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+  * elements. Segments are then coalesced when registered, if possible
+  * within the selected memreg mode.
+- *
+- * Note, this routine is never called if the connection's memory
+- * registration strategy is 0 (bounce buffers).
+  */
+ static int
+@@ -439,14 +436,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+               wtype = rpcrdma_noch;
+       BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+-      if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+-          (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+-              /* forced to "pure inline"? */
+-              dprintk("RPC:       %s: too much data (%d/%d) for inline\n",
+-                      __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+-              return -1;
+-      }
+-
+       hdrlen = 28; /*sizeof *headerp;*/
+       padlen = 0;
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 1eb9c46..8c5035a 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -503,18 +503,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+                * If the allocation or registration fails, the RPC framework
+                * will (doggedly) retry.
+                */
+-              if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
+-                              RPCRDMA_BOUNCEBUFFERS) {
+-                      /* forced to "pure inline" */
+-                      dprintk("RPC:       %s: too much data (%zd) for inline "
+-                                      "(r/w max %d/%d)\n", __func__, size,
+-                                      rpcx_to_rdmad(xprt).inline_rsize,
+-                                      rpcx_to_rdmad(xprt).inline_wsize);
+-                      size = req->rl_size;
+-                      rpc_exit(task, -EIO);           /* fail the operation */
+-                      rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+-                      goto out;
+-              }
+               if (task->tk_flags & RPC_TASK_SWAPPER)
+                       nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+               else
+@@ -543,7 +531,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+               req = nreq;
+       }
+       dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
+-out:
+       req->rl_connect_cookie = 0;     /* our reserved value */
+       return req->rl_xdr_buf;
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 9cb88f3..4a4e4ea 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -557,7 +557,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+        * adapter.
+        */
+       switch (memreg) {
+-      case RPCRDMA_BOUNCEBUFFERS:
+       case RPCRDMA_REGISTER:
+       case RPCRDMA_FRMR:
+               break;
+@@ -778,9 +777,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       /* Client offers RDMA Read but does not initiate */
+       ep->rep_remote_cma.initiator_depth = 0;
+-      if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
+-              ep->rep_remote_cma.responder_resources = 0;
+-      else if (devattr.max_qp_rd_atom > 32)   /* arbitrary but <= 255 */
++      if (devattr.max_qp_rd_atom > 32)        /* arbitrary but <= 255 */
+               ep->rep_remote_cma.responder_resources = 32;
+       else
+               ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch b/linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch
new file mode 100644 (file)
index 0000000..fb7158d
--- /dev/null
@@ -0,0 +1,455 @@
+From b45ccfd25d506e83d9ecf93d0ac7edf031d35d2f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:34 -0400
+Subject: [PATCH 112/132] xprtrdma: Remove MEMWINDOWS registration modes
+
+The MEMWINDOWS and MEMWINDOWS_ASYNC memory registration modes were
+intended as stop-gap modes before the introduction of FRMR. They
+are now considered obsolete.
+
+MEMWINDOWS_ASYNC is also considered unsafe because it can leave
+client memory registered and exposed for an indeterminant time after
+each I/O.
+
+At this point, the MEMWINDOWS modes add needless complexity, so
+remove them.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |   34 +--------
+ net/sunrpc/xprtrdma/transport.c |    9 +--
+ net/sunrpc/xprtrdma/verbs.c     |  165 +-------------------------------------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    2 -
+ 4 files changed, 7 insertions(+), 203 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 02b2941..46b5172 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -199,7 +199,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+               return 0;
+       do {
+-              /* bind/register the memory, then build chunk from result. */
+               int n = rpcrdma_register_external(seg, nsegs,
+                                               cur_wchunk != NULL, r_xprt);
+               if (n <= 0)
+@@ -698,16 +697,6 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
+ }
+ /*
+- * This function is called when memory window unbind which we are waiting
+- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+- */
+-static void
+-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+-{
+-      wake_up(&rep->rr_unbind);
+-}
+-
+-/*
+  * Called as a tasklet to do req/reply match and complete a request
+  * Errors must result in the RPC task either being awakened, or
+  * allowed to timeout, to discover the errors at that time.
+@@ -721,7 +710,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+       struct rpc_xprt *xprt = rep->rr_xprt;
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       __be32 *iptr;
+-      int i, rdmalen, status;
++      int rdmalen, status;
+       /* Check status. If bad, signal disconnect and return rep to pool */
+       if (rep->rr_len == ~0U) {
+@@ -850,27 +839,6 @@ badheader:
+               break;
+       }
+-      /* If using mw bind, start the deregister process now. */
+-      /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+-      if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+-      case RPCRDMA_MEMWINDOWS:
+-              for (i = 0; req->rl_nchunks-- > 1;)
+-                      i += rpcrdma_deregister_external(
+-                              &req->rl_segments[i], r_xprt, NULL);
+-              /* Optionally wait (not here) for unbinds to complete */
+-              rep->rr_func = rpcrdma_unbind_func;
+-              (void) rpcrdma_deregister_external(&req->rl_segments[i],
+-                                                 r_xprt, rep);
+-              break;
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-              for (i = 0; req->rl_nchunks--;)
+-                      i += rpcrdma_deregister_external(&req->rl_segments[i],
+-                                                       r_xprt, NULL);
+-              break;
+-      default:
+-              break;
+-      }
+-
+       dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+                       __func__, xprt, rqst, status);
+       xprt_complete_rqst(rqst->rq_task, status);
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 8c5035a..c23b0c1 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -566,9 +566,7 @@ xprt_rdma_free(void *buffer)
+               __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+       /*
+-       * Finish the deregistration. When using mw bind, this was
+-       * begun in rpcrdma_reply_handler(). In all other modes, we
+-       * do it here, in thread context. The process is considered
++       * Finish the deregistration.  The process is considered
+        * complete when the rr_func vector becomes NULL - this
+        * was put in place during rpcrdma_reply_handler() - the wait
+        * call below will not block if the dereg is "done". If
+@@ -580,11 +578,6 @@ xprt_rdma_free(void *buffer)
+                       &req->rl_segments[i], r_xprt, NULL);
+       }
+-      if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
+-              rep->rr_func = NULL;    /* abandon the callback */
+-              req->rl_reply = NULL;
+-      }
+-
+       if (req->rl_iov.length == 0) {  /* see allocate above */
+               struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+               oreq->rl_reply = req->rl_reply;
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 4a4e4ea..304c7ad 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -152,7 +152,7 @@ void rpcrdma_event_process(struct ib_wc *wc)
+       dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
+               __func__, rep, wc->status, wc->opcode, wc->byte_len);
+-      if (!rep) /* send or bind completion that we don't care about */
++      if (!rep) /* send completion that we don't care about */
+               return;
+       if (IB_WC_SUCCESS != wc->status) {
+@@ -197,8 +197,6 @@ void rpcrdma_event_process(struct ib_wc *wc)
+                       }
+                       atomic_set(&rep->rr_buffer->rb_credits, credits);
+               }
+-              /* fall through */
+-      case IB_WC_BIND_MW:
+               rpcrdma_schedule_tasklet(rep);
+               break;
+       default:
+@@ -233,7 +231,7 @@ rpcrdma_cq_poll(struct ib_cq *cq)
+ /*
+  * rpcrdma_cq_event_upcall
+  *
+- * This upcall handles recv, send, bind and unbind events.
++ * This upcall handles recv and send events.
+  * It is reentrant but processes single events in order to maintain
+  * ordering of receives to keep server credits.
+  *
+@@ -494,16 +492,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+       }
+       switch (memreg) {
+-      case RPCRDMA_MEMWINDOWS:
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-              if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+-                      dprintk("RPC:       %s: MEMWINDOWS registration "
+-                              "specified but not supported by adapter, "
+-                              "using slower RPCRDMA_REGISTER\n",
+-                              __func__);
+-                      memreg = RPCRDMA_REGISTER;
+-              }
+-              break;
+       case RPCRDMA_MTHCAFMR:
+               if (!ia->ri_id->device->alloc_fmr) {
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+@@ -567,16 +555,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+                               IB_ACCESS_REMOTE_READ;
+               goto register_setup;
+ #endif
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              mem_priv = IB_ACCESS_LOCAL_WRITE |
+-                              IB_ACCESS_MW_BIND;
+-              goto register_setup;
+       case RPCRDMA_MTHCAFMR:
+               if (ia->ri_have_dma_lkey)
+                       break;
+               mem_priv = IB_ACCESS_LOCAL_WRITE;
++#if RPCRDMA_PERSISTENT_REGISTRATION
+       register_setup:
++#endif
+               ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+               if (IS_ERR(ia->ri_bind_mem)) {
+                       printk(KERN_ALERT "%s: ib_get_dma_mr for "
+@@ -699,14 +684,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+               }
+               break;
+       }
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              /* Add room for mw_binds+unbinds - overkill! */
+-              ep->rep_attr.cap.max_send_wr++;
+-              ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
+-              if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+-                      return -EINVAL;
+-              break;
+       default:
+               break;
+       }
+@@ -728,14 +705,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       /* set trigger for requesting send completion */
+       ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
+-      switch (ia->ri_memreg_strategy) {
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
+-              break;
+-      default:
+-              break;
+-      }
+       if (ep->rep_cqinit <= 2)
+               ep->rep_cqinit = 0;
+       INIT_CQCOUNT(ep);
+@@ -743,11 +712,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       init_waitqueue_head(&ep->rep_connect_wait);
+       INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+-      /*
+-       * Create a single cq for receive dto and mw_bind (only ever
+-       * care about unbind, really). Send completions are suppressed.
+-       * Use single threaded tasklet upcalls to maintain ordering.
+-       */
+       ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+                                 rpcrdma_cq_async_error_upcall, NULL,
+                                 ep->rep_attr.cap.max_recv_wr +
+@@ -1020,11 +984,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+               len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+                               sizeof(struct rpcrdma_mw);
+               break;
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+-                              sizeof(struct rpcrdma_mw);
+-              break;
+       default:
+               break;
+       }
+@@ -1055,11 +1014,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+       }
+       p += cdata->padding;
+-      /*
+-       * Allocate the fmr's, or mw's for mw_bind chunk registration.
+-       * We "cycle" the mw's in order to minimize rkey reuse,
+-       * and also reduce unbind-to-bind collision.
+-       */
+       INIT_LIST_HEAD(&buf->rb_mws);
+       r = (struct rpcrdma_mw *)p;
+       switch (ia->ri_memreg_strategy) {
+@@ -1107,21 +1061,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+                       ++r;
+               }
+               break;
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              /* Allocate one extra request's worth, for full cycling */
+-              for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+-                      r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
+-                      if (IS_ERR(r->r.mw)) {
+-                              rc = PTR_ERR(r->r.mw);
+-                              dprintk("RPC:       %s: ib_alloc_mw"
+-                                      " failed %i\n", __func__, rc);
+-                              goto out;
+-                      }
+-                      list_add(&r->mw_list, &buf->rb_mws);
+-                      ++r;
+-              }
+-              break;
+       default:
+               break;
+       }
+@@ -1170,7 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+               memset(rep, 0, sizeof(struct rpcrdma_rep));
+               buf->rb_recv_bufs[i] = rep;
+               buf->rb_recv_bufs[i]->rr_buffer = buf;
+-              init_waitqueue_head(&rep->rr_unbind);
+               rc = rpcrdma_register_internal(ia, rep->rr_base,
+                               len - offsetof(struct rpcrdma_rep, rr_base),
+@@ -1204,7 +1142,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+       /* clean up in reverse order from create
+        *   1.  recv mr memory (mr free, then kfree)
+-       *   1a. bind mw memory
+        *   2.  send mr memory (mr free, then kfree)
+        *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
+        *   4.  arrays
+@@ -1248,15 +1185,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+                                       " failed %i\n",
+                                       __func__, rc);
+                       break;
+-              case RPCRDMA_MEMWINDOWS_ASYNC:
+-              case RPCRDMA_MEMWINDOWS:
+-                      rc = ib_dealloc_mw(r->r.mw);
+-                      if (rc)
+-                              dprintk("RPC:       %s:"
+-                                      " ib_dealloc_mw"
+-                                      " failed %i\n",
+-                                      __func__, rc);
+-                      break;
+               default:
+                       break;
+               }
+@@ -1331,15 +1259,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
+       req->rl_niovs = 0;
+       if (req->rl_reply) {
+               buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
+-              init_waitqueue_head(&req->rl_reply->rr_unbind);
+               req->rl_reply->rr_func = NULL;
+               req->rl_reply = NULL;
+       }
+       switch (ia->ri_memreg_strategy) {
+       case RPCRDMA_FRMR:
+       case RPCRDMA_MTHCAFMR:
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+               /*
+                * Cycle mw's back in reverse order, and "spin" them.
+                * This delays and scrambles reuse as much as possible.
+@@ -1384,8 +1309,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+ /*
+  * Put reply buffers back into pool when not attached to
+- * request. This happens in error conditions, and when
+- * aborting unbinds. Pre-decrement counter/array index.
++ * request. This happens in error conditions.
+  */
+ void
+ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+@@ -1688,74 +1612,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+ }
+ static int
+-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+-                      int *nsegs, int writing, struct rpcrdma_ia *ia,
+-                      struct rpcrdma_xprt *r_xprt)
+-{
+-      int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+-                                IB_ACCESS_REMOTE_READ);
+-      struct ib_mw_bind param;
+-      int rc;
+-
+-      *nsegs = 1;
+-      rpcrdma_map_one(ia, seg, writing);
+-      param.bind_info.mr = ia->ri_bind_mem;
+-      param.wr_id = 0ULL;     /* no send cookie */
+-      param.bind_info.addr = seg->mr_dma;
+-      param.bind_info.length = seg->mr_len;
+-      param.send_flags = 0;
+-      param.bind_info.mw_access_flags = mem_priv;
+-
+-      DECR_CQCOUNT(&r_xprt->rx_ep);
+-      rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+-      if (rc) {
+-              dprintk("RPC:       %s: failed ib_bind_mw "
+-                      "%u@0x%llx status %i\n",
+-                      __func__, seg->mr_len,
+-                      (unsigned long long)seg->mr_dma, rc);
+-              rpcrdma_unmap_one(ia, seg);
+-      } else {
+-              seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+-              seg->mr_base = param.bind_info.addr;
+-              seg->mr_nsegs = 1;
+-      }
+-      return rc;
+-}
+-
+-static int
+-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+-                      struct rpcrdma_ia *ia,
+-                      struct rpcrdma_xprt *r_xprt, void **r)
+-{
+-      struct ib_mw_bind param;
+-      LIST_HEAD(l);
+-      int rc;
+-
+-      BUG_ON(seg->mr_nsegs != 1);
+-      param.bind_info.mr = ia->ri_bind_mem;
+-      param.bind_info.addr = 0ULL;    /* unbind */
+-      param.bind_info.length = 0;
+-      param.bind_info.mw_access_flags = 0;
+-      if (*r) {
+-              param.wr_id = (u64) (unsigned long) *r;
+-              param.send_flags = IB_SEND_SIGNALED;
+-              INIT_CQCOUNT(&r_xprt->rx_ep);
+-      } else {
+-              param.wr_id = 0ULL;
+-              param.send_flags = 0;
+-              DECR_CQCOUNT(&r_xprt->rx_ep);
+-      }
+-      rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+-      rpcrdma_unmap_one(ia, seg);
+-      if (rc)
+-              dprintk("RPC:       %s: failed ib_(un)bind_mw,"
+-                      " status %i\n", __func__, rc);
+-      else
+-              *r = NULL;      /* will upcall on completion */
+-      return rc;
+-}
+-
+-static int
+ rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+                       int *nsegs, int writing, struct rpcrdma_ia *ia)
+ {
+@@ -1845,12 +1701,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+               rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+               break;
+-      /* Registration using memory windows */
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
+-              break;
+-
+       /* Default registration each time */
+       default:
+               rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
+@@ -1887,11 +1737,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+               rc = rpcrdma_deregister_fmr_external(seg, ia);
+               break;
+-      case RPCRDMA_MEMWINDOWS_ASYNC:
+-      case RPCRDMA_MEMWINDOWS:
+-              rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
+-              break;
+-
+       default:
+               rc = rpcrdma_deregister_default_external(seg, ia);
+               break;
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index c620d13..bf08ee0 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -127,7 +127,6 @@ struct rpcrdma_rep {
+       struct rpc_xprt *rr_xprt;       /* needed for request/reply matching */
+       void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+       struct list_head rr_list;       /* tasklet list */
+-      wait_queue_head_t rr_unbind;    /* optional unbind wait */
+       struct ib_sge   rr_iov;         /* for posting */
+       struct ib_mr    *rr_handle;     /* handle for mem in rr_iov */
+       char    rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+@@ -162,7 +161,6 @@ struct rpcrdma_mr_seg {            /* chunk descriptors */
+               struct ib_mr    *rl_mr;         /* if registered directly */
+               struct rpcrdma_mw {             /* if registered from region */
+                       union {
+-                              struct ib_mw    *mw;
+                               struct ib_fmr   *fmr;
+                               struct {
+                                       struct ib_fast_reg_page_list *fr_pgl;
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch b/linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch
new file mode 100644 (file)
index 0000000..06c8809
--- /dev/null
@@ -0,0 +1,191 @@
+From 0ac531c1832318efa3dc3d723e356a7e09330e80 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:43 -0400
+Subject: [PATCH 113/132] xprtrdma: Remove REGISTER memory registration mode
+
+All kernel RDMA providers except amso1100 support either MTHCAFMR
+or FRMR, both of which are faster than REGISTER.  amso1100 can
+continue to use ALLPHYSICAL.
+
+The only other ULP consumer in the kernel that uses the reg_phys_mr
+verb is Lustre.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c |    3 +-
+ net/sunrpc/xprtrdma/verbs.c    |   90 ++--------------------------------------
+ 2 files changed, 5 insertions(+), 88 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 46b5172..aae1726 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -476,8 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+                        * on receive. Therefore, we request a reply chunk
+                        * for non-writes wherever feasible and efficient.
+                        */
+-                      if (wtype == rpcrdma_noch &&
+-                          r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
++                      if (wtype == rpcrdma_noch)
+                               wtype = rpcrdma_replych;
+               }
+       }
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 304c7ad..6bb9a07 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -494,19 +494,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+       switch (memreg) {
+       case RPCRDMA_MTHCAFMR:
+               if (!ia->ri_id->device->alloc_fmr) {
+-#if RPCRDMA_PERSISTENT_REGISTRATION
+                       dprintk("RPC:       %s: MTHCAFMR registration "
+                               "specified but not supported by adapter, "
+                               "using riskier RPCRDMA_ALLPHYSICAL\n",
+                               __func__);
+                       memreg = RPCRDMA_ALLPHYSICAL;
+-#else
+-                      dprintk("RPC:       %s: MTHCAFMR registration "
+-                              "specified but not supported by adapter, "
+-                              "using slower RPCRDMA_REGISTER\n",
+-                              __func__);
+-                      memreg = RPCRDMA_REGISTER;
+-#endif
+               }
+               break;
+       case RPCRDMA_FRMR:
+@@ -514,19 +506,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+               if ((devattr.device_cap_flags &
+                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+-#if RPCRDMA_PERSISTENT_REGISTRATION
+                       dprintk("RPC:       %s: FRMR registration "
+                               "specified but not supported by adapter, "
+                               "using riskier RPCRDMA_ALLPHYSICAL\n",
+                               __func__);
+                       memreg = RPCRDMA_ALLPHYSICAL;
+-#else
+-                      dprintk("RPC:       %s: FRMR registration "
+-                              "specified but not supported by adapter, "
+-                              "using slower RPCRDMA_REGISTER\n",
+-                              __func__);
+-                      memreg = RPCRDMA_REGISTER;
+-#endif
+               } else {
+                       /* Mind the ia limit on FRMR page list depth */
+                       ia->ri_max_frmr_depth = min_t(unsigned int,
+@@ -545,7 +529,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+        * adapter.
+        */
+       switch (memreg) {
+-      case RPCRDMA_REGISTER:
+       case RPCRDMA_FRMR:
+               break;
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+@@ -565,11 +548,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+               ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+               if (IS_ERR(ia->ri_bind_mem)) {
+                       printk(KERN_ALERT "%s: ib_get_dma_mr for "
+-                              "phys register failed with %lX\n\t"
+-                              "Will continue with degraded performance\n",
++                              "phys register failed with %lX\n",
+                               __func__, PTR_ERR(ia->ri_bind_mem));
+-                      memreg = RPCRDMA_REGISTER;
+-                      ia->ri_bind_mem = NULL;
++                      rc = -ENOMEM;
++                      goto out2;
+               }
+               break;
+       default:
+@@ -1611,67 +1593,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+       return rc;
+ }
+-static int
+-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+-                      int *nsegs, int writing, struct rpcrdma_ia *ia)
+-{
+-      int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+-                                IB_ACCESS_REMOTE_READ);
+-      struct rpcrdma_mr_seg *seg1 = seg;
+-      struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+-      int len, i, rc = 0;
+-
+-      if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+-              *nsegs = RPCRDMA_MAX_DATA_SEGS;
+-      for (len = 0, i = 0; i < *nsegs;) {
+-              rpcrdma_map_one(ia, seg, writing);
+-              ipb[i].addr = seg->mr_dma;
+-              ipb[i].size = seg->mr_len;
+-              len += seg->mr_len;
+-              ++seg;
+-              ++i;
+-              /* Check for holes */
+-              if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+-                  offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+-                      break;
+-      }
+-      seg1->mr_base = seg1->mr_dma;
+-      seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+-                              ipb, i, mem_priv, &seg1->mr_base);
+-      if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+-              rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+-              dprintk("RPC:       %s: failed ib_reg_phys_mr "
+-                      "%u@0x%llx (%d)... status %i\n",
+-                      __func__, len,
+-                      (unsigned long long)seg1->mr_dma, i, rc);
+-              while (i--)
+-                      rpcrdma_unmap_one(ia, --seg);
+-      } else {
+-              seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+-              seg1->mr_nsegs = i;
+-              seg1->mr_len = len;
+-      }
+-      *nsegs = i;
+-      return rc;
+-}
+-
+-static int
+-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+-                      struct rpcrdma_ia *ia)
+-{
+-      struct rpcrdma_mr_seg *seg1 = seg;
+-      int rc;
+-
+-      rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+-      seg1->mr_chunk.rl_mr = NULL;
+-      while (seg1->mr_nsegs--)
+-              rpcrdma_unmap_one(ia, seg++);
+-      if (rc)
+-              dprintk("RPC:       %s: failed ib_dereg_mr,"
+-                      " status %i\n", __func__, rc);
+-      return rc;
+-}
+-
+ int
+ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+                       int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+@@ -1701,10 +1622,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+               rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+               break;
+-      /* Default registration each time */
+       default:
+-              rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
+-              break;
++              return -1;
+       }
+       if (rc)
+               return -1;
+@@ -1738,7 +1657,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+               break;
+       default:
+-              rc = rpcrdma_deregister_default_external(seg, ia);
+               break;
+       }
+       if (r) {
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch b/linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch
new file mode 100644 (file)
index 0000000..0743d58
--- /dev/null
@@ -0,0 +1,73 @@
+From f10eafd3a6ce9da7e96999c124b643ea6c4921f3 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:51 -0400
+Subject: [PATCH 114/132] xprtrdma: Fall back to MTHCAFMR when FRMR is not supported
+
+An audit of in-kernel RDMA providers that do not support the FRMR
+memory registration shows that several of them support MTHCAFMR.
+Prefer MTHCAFMR when FRMR is not supported.
+
+If MTHCAFMR is not supported, only then choose ALLPHYSICAL.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   31 +++++++++++++++----------------
+ 1 files changed, 15 insertions(+), 16 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 6bb9a07..a352798 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -491,33 +491,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+               ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+       }
+-      switch (memreg) {
+-      case RPCRDMA_MTHCAFMR:
+-              if (!ia->ri_id->device->alloc_fmr) {
+-                      dprintk("RPC:       %s: MTHCAFMR registration "
+-                              "specified but not supported by adapter, "
+-                              "using riskier RPCRDMA_ALLPHYSICAL\n",
+-                              __func__);
+-                      memreg = RPCRDMA_ALLPHYSICAL;
+-              }
+-              break;
+-      case RPCRDMA_FRMR:
++      if (memreg == RPCRDMA_FRMR) {
+               /* Requires both frmr reg and local dma lkey */
+               if ((devattr.device_cap_flags &
+                    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+                   (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+                       dprintk("RPC:       %s: FRMR registration "
+-                              "specified but not supported by adapter, "
+-                              "using riskier RPCRDMA_ALLPHYSICAL\n",
+-                              __func__);
+-                      memreg = RPCRDMA_ALLPHYSICAL;
++                              "not supported by HCA\n", __func__);
++                      memreg = RPCRDMA_MTHCAFMR;
+               } else {
+                       /* Mind the ia limit on FRMR page list depth */
+                       ia->ri_max_frmr_depth = min_t(unsigned int,
+                               RPCRDMA_MAX_DATA_SEGS,
+                               devattr.max_fast_reg_page_list_len);
+               }
+-              break;
++      }
++      if (memreg == RPCRDMA_MTHCAFMR) {
++              if (!ia->ri_id->device->alloc_fmr) {
++                      dprintk("RPC:       %s: MTHCAFMR registration "
++                              "not supported by HCA\n", __func__);
++#if RPCRDMA_PERSISTENT_REGISTRATION
++                      memreg = RPCRDMA_ALLPHYSICAL;
++#else
++                      rc = -EINVAL;
++                      goto out2;
++#endif
++              }
+       }
+       /*
+-- 
+1.7.1
+
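In short, after this patch the registration mode is chosen by a cascade: try FRMR, drop to MTHCAFMR if the HCA lacks the fast-registration capabilities, and fall back to ALLPHYSICAL only when persistent registration is compiled in. A minimal sketch of that cascade, condensed from the verbs.c hunks above; the helper name pick_memreg_strategy() and the pared-down error handling are illustrative, not part of the patch:

/* Illustrative sketch of the fallback order after this patch:
 * FRMR -> MTHCAFMR -> ALLPHYSICAL (the last only when persistent
 * registration is compiled in). Condensed from rpcrdma_ia_open().
 */
static int pick_memreg_strategy(struct rpcrdma_ia *ia,
                                struct ib_device_attr *devattr, int memreg)
{
        const int frmr_caps = IB_DEVICE_MEM_MGT_EXTENSIONS |
                              IB_DEVICE_LOCAL_DMA_LKEY;

        if (memreg == RPCRDMA_FRMR &&
            (devattr->device_cap_flags & frmr_caps) != frmr_caps)
                memreg = RPCRDMA_MTHCAFMR;      /* HCA lacks FRMR support */

        if (memreg == RPCRDMA_MTHCAFMR && !ia->ri_id->device->alloc_fmr) {
#if RPCRDMA_PERSISTENT_REGISTRATION
                memreg = RPCRDMA_ALLPHYSICAL;   /* last resort */
#else
                return -EINVAL;                 /* no usable mode */
#endif
        }
        return memreg;
}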
diff --git a/linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch b/linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch
new file mode 100644 (file)
index 0000000..8052dfa
--- /dev/null
@@ -0,0 +1,46 @@
+From cdd9ade711599e7672a635add0406080856f8b92 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:00 -0400
+Subject: [PATCH 115/132] xprtrdma: mount reports "Invalid mount option" if memreg mode not supported
+
+If the selected memory registration mode is not supported by the
+underlying provider/HCA, the NFS mount command reports that there was
+an invalid mount option, and fails. This is misleading.
+
+Reporting a problem allocating memory is a lot closer to the truth.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |    8 ++++----
+ 1 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index a352798..7c7e9b4 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -513,7 +513,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+                       memreg = RPCRDMA_ALLPHYSICAL;
+ #else
+-                      rc = -EINVAL;
++                      rc = -ENOMEM;
+                       goto out2;
+ #endif
+               }
+@@ -554,9 +554,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+               }
+               break;
+       default:
+-              printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+-                              __func__, memreg);
+-              rc = -EINVAL;
++              printk(KERN_ERR "RPC: Unsupported memory "
++                              "registration mode: %d\n", memreg);
++              rc = -ENOMEM;
+               goto out2;
+       }
+       dprintk("RPC:       %s: memory registration strategy is %d\n",
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch b/linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch
new file mode 100644 (file)
index 0000000..883ab79
--- /dev/null
@@ -0,0 +1,86 @@
+From 13c9ff8f673862b69e795ea99a237b461c557eb3 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:08 -0400
+Subject: [PATCH 116/132] xprtrdma: Simplify rpcrdma_deregister_external() synopsis
+
+Clean up: All remaining callers of rpcrdma_deregister_external()
+pass NULL as the last argument, so remove that argument.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |    2 +-
+ net/sunrpc/xprtrdma/transport.c |    2 +-
+ net/sunrpc/xprtrdma/verbs.c     |    8 +-------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    2 +-
+ 4 files changed, 4 insertions(+), 10 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index aae1726..436d229 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -270,7 +270,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ out:
+       for (pos = 0; nchunks--;)
+               pos += rpcrdma_deregister_external(
+-                              &req->rl_segments[pos], r_xprt, NULL);
++                              &req->rl_segments[pos], r_xprt);
+       return 0;
+ }
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index c23b0c1..430cabb 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -575,7 +575,7 @@ xprt_rdma_free(void *buffer)
+       for (i = 0; req->rl_nchunks;) {
+               --req->rl_nchunks;
+               i += rpcrdma_deregister_external(
+-                      &req->rl_segments[i], r_xprt, NULL);
++                      &req->rl_segments[i], r_xprt);
+       }
+       if (req->rl_iov.length == 0) {  /* see allocate above */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 7c7e9b4..0cbc83c 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1632,7 +1632,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+ int
+ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+-              struct rpcrdma_xprt *r_xprt, void *r)
++              struct rpcrdma_xprt *r_xprt)
+ {
+       struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+       int nsegs = seg->mr_nsegs, rc;
+@@ -1658,12 +1658,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+       default:
+               break;
+       }
+-      if (r) {
+-              struct rpcrdma_rep *rep = r;
+-              void (*func)(struct rpcrdma_rep *) = rep->rr_func;
+-              rep->rr_func = NULL;
+-              func(rep);      /* dereg done, callback now */
+-      }
+       return nsegs;
+ }
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index bf08ee0..3f44d6a 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -331,7 +331,7 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+ int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+                               int, int, struct rpcrdma_xprt *);
+ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+-                              struct rpcrdma_xprt *, void *);
++                              struct rpcrdma_xprt *);
+ /*
+  * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch b/linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch
new file mode 100644 (file)
index 0000000..bf5a979
--- /dev/null
@@ -0,0 +1,95 @@
+From 7f1d54191ed6fa0f79f584fe3ebf6519738e817f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:16 -0400
+Subject: [PATCH 117/132] xprtrdma: Make rpcrdma_ep_destroy() return void
+
+Clean up: rpcrdma_ep_destroy() returns a value that is used
+only to print a debugging message. rpcrdma_ep_destroy() already
+prints debugging messages in all error cases.
+
+Make rpcrdma_ep_destroy() return void instead.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c |    8 ++------
+ net/sunrpc/xprtrdma/verbs.c     |    7 +------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    2 +-
+ 3 files changed, 4 insertions(+), 13 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 430cabb..d18b2a3 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -229,7 +229,6 @@ static void
+ xprt_rdma_destroy(struct rpc_xprt *xprt)
+ {
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+-      int rc;
+       dprintk("RPC:       %s: called\n", __func__);
+@@ -238,10 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
+       xprt_clear_connected(xprt);
+       rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+-      rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+-      if (rc)
+-              dprintk("RPC:       %s: rpcrdma_ep_destroy returned %i\n",
+-                      __func__, rc);
++      rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+       rpcrdma_ia_close(&r_xprt->rx_ia);
+       xprt_rdma_free_addresses(xprt);
+@@ -391,7 +387,7 @@ out4:
+       xprt_rdma_free_addresses(xprt);
+       rc = -EINVAL;
+ out3:
+-      (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
++      rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+ out2:
+       rpcrdma_ia_close(&new_xprt->rx_ia);
+ out1:
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 0cbc83c..edc951e 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -748,11 +748,8 @@ out1:
+  * Disconnect and destroy endpoint. After this, the only
+  * valid operations on the ep are to free it (if dynamically
+  * allocated) or re-create it.
+- *
+- * The caller's error handling must be sure to not leak the endpoint
+- * if this function fails.
+  */
+-int
++void
+ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ {
+       int rc;
+@@ -782,8 +779,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+       if (rc)
+               dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+                       __func__, rc);
+-
+-      return rc;
+ }
+ /*
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 3f44d6a..362a19d 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -301,7 +301,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
+  */
+ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+                               struct rpcrdma_create_data_internal *);
+-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
++void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+ int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+ int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch b/linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch
new file mode 100644 (file)
index 0000000..8c813af
--- /dev/null
@@ -0,0 +1,395 @@
+From fc66448549bbb77f2f1a38b270ab2d6b6a22da33 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:25 -0400
+Subject: [PATCH 118/132] xprtrdma: Split the completion queue
+
+The current CQ handler uses the ib_wc.opcode field to distinguish
+between event types. However, the contents of that field are not
+reliable if the completion status is not IB_WC_SUCCESS.
+
+When an error completion occurs on a send event, the CQ handler
+schedules a tasklet with something that is not a struct rpcrdma_rep.
+This is never correct behavior, and sometimes it results in a panic.
+
+To resolve this issue, split the completion queue into a send CQ and
+a receive CQ. The send CQ handler now handles only struct rpcrdma_mw
+wr_id's, and the receive CQ handler now handles only struct
+rpcrdma_rep wr_id's.
+
+Fix suggested by Shirley Ma <shirley.ma@oracle.com>
+
+Reported-by: Rafael Reiter <rafael.reiter@ims.co.at>
+Fixes: 5c635e09cec0feeeb310968e51dad01040244851
+BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Klemens Senn <klemens.senn@ims.co.at>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c     |  228 +++++++++++++++++++++++----------------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
+ 2 files changed, 137 insertions(+), 92 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index edc951e..af2d097 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+       }
+ }
+-static inline
+-void rpcrdma_event_process(struct ib_wc *wc)
++static void
++rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+ {
+-      struct rpcrdma_mw *frmr;
+-      struct rpcrdma_rep *rep =
+-                      (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
++      struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+-      dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
+-              __func__, rep, wc->status, wc->opcode, wc->byte_len);
++      dprintk("RPC:       %s: frmr %p status %X opcode %d\n",
++              __func__, frmr, wc->status, wc->opcode);
+-      if (!rep) /* send completion that we don't care about */
++      if (wc->wr_id == 0ULL)
+               return;
+-
+-      if (IB_WC_SUCCESS != wc->status) {
+-              dprintk("RPC:       %s: WC opcode %d status %X, connection lost\n",
+-                      __func__, wc->opcode, wc->status);
+-              rep->rr_len = ~0U;
+-              if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
+-                      rpcrdma_schedule_tasklet(rep);
++      if (wc->status != IB_WC_SUCCESS)
+               return;
+-      }
+-      switch (wc->opcode) {
+-      case IB_WC_FAST_REG_MR:
+-              frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
++      if (wc->opcode == IB_WC_FAST_REG_MR)
+               frmr->r.frmr.state = FRMR_IS_VALID;
+-              break;
+-      case IB_WC_LOCAL_INV:
+-              frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
++      else if (wc->opcode == IB_WC_LOCAL_INV)
+               frmr->r.frmr.state = FRMR_IS_INVALID;
+-              break;
+-      case IB_WC_RECV:
+-              rep->rr_len = wc->byte_len;
+-              ib_dma_sync_single_for_cpu(
+-                      rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+-                      rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+-              /* Keep (only) the most recent credits, after check validity */
+-              if (rep->rr_len >= 16) {
+-                      struct rpcrdma_msg *p =
+-                                      (struct rpcrdma_msg *) rep->rr_base;
+-                      unsigned int credits = ntohl(p->rm_credit);
+-                      if (credits == 0) {
+-                              dprintk("RPC:       %s: server"
+-                                      " dropped credits to 0!\n", __func__);
+-                              /* don't deadlock */
+-                              credits = 1;
+-                      } else if (credits > rep->rr_buffer->rb_max_requests) {
+-                              dprintk("RPC:       %s: server"
+-                                      " over-crediting: %d (%d)\n",
+-                                      __func__, credits,
+-                                      rep->rr_buffer->rb_max_requests);
+-                              credits = rep->rr_buffer->rb_max_requests;
+-                      }
+-                      atomic_set(&rep->rr_buffer->rb_credits, credits);
+-              }
+-              rpcrdma_schedule_tasklet(rep);
+-              break;
+-      default:
+-              dprintk("RPC:       %s: unexpected WC event %X\n",
+-                      __func__, wc->opcode);
+-              break;
+-      }
+ }
+-static inline int
+-rpcrdma_cq_poll(struct ib_cq *cq)
++static int
++rpcrdma_sendcq_poll(struct ib_cq *cq)
+ {
+       struct ib_wc wc;
+       int rc;
+-      for (;;) {
+-              rc = ib_poll_cq(cq, 1, &wc);
+-              if (rc < 0) {
+-                      dprintk("RPC:       %s: ib_poll_cq failed %i\n",
+-                              __func__, rc);
+-                      return rc;
+-              }
+-              if (rc == 0)
+-                      break;
++      while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
++              rpcrdma_sendcq_process_wc(&wc);
++      return rc;
++}
+-              rpcrdma_event_process(&wc);
++/*
++ * Handle send, fast_reg_mr, and local_inv completions.
++ *
++ * Send events are typically suppressed and thus do not result
++ * in an upcall. Occasionally one is signaled, however. This
++ * prevents the provider's completion queue from wrapping and
++ * losing a completion.
++ */
++static void
++rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
++{
++      int rc;
++
++      rc = rpcrdma_sendcq_poll(cq);
++      if (rc) {
++              dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
++                      __func__, rc);
++              return;
+       }
+-      return 0;
++      rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
++      if (rc) {
++              dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
++                      __func__, rc);
++              return;
++      }
++
++      rpcrdma_sendcq_poll(cq);
++}
++
++static void
++rpcrdma_recvcq_process_wc(struct ib_wc *wc)
++{
++      struct rpcrdma_rep *rep =
++                      (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
++
++      dprintk("RPC:       %s: rep %p status %X opcode %X length %u\n",
++              __func__, rep, wc->status, wc->opcode, wc->byte_len);
++
++      if (wc->status != IB_WC_SUCCESS) {
++              rep->rr_len = ~0U;
++              goto out_schedule;
++      }
++      if (wc->opcode != IB_WC_RECV)
++              return;
++
++      rep->rr_len = wc->byte_len;
++      ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
++                      rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
++
++      if (rep->rr_len >= 16) {
++              struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
++              unsigned int credits = ntohl(p->rm_credit);
++
++              if (credits == 0)
++                      credits = 1;    /* don't deadlock */
++              else if (credits > rep->rr_buffer->rb_max_requests)
++                      credits = rep->rr_buffer->rb_max_requests;
++              atomic_set(&rep->rr_buffer->rb_credits, credits);
++      }
++
++out_schedule:
++      rpcrdma_schedule_tasklet(rep);
++}
++
++static int
++rpcrdma_recvcq_poll(struct ib_cq *cq)
++{
++      struct ib_wc wc;
++      int rc;
++
++      while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
++              rpcrdma_recvcq_process_wc(&wc);
++      return rc;
+ }
+ /*
+- * rpcrdma_cq_event_upcall
++ * Handle receive completions.
+  *
+- * This upcall handles recv and send events.
+  * It is reentrant but processes single events in order to maintain
+  * ordering of receives to keep server credits.
+  *
+@@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq)
+  * connection shutdown. That is, the structures required for
+  * the completion of the reply handler must remain intact until
+  * all memory has been reclaimed.
+- *
+- * Note that send events are suppressed and do not result in an upcall.
+  */
+ static void
+-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
++rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
+       int rc;
+-      rc = rpcrdma_cq_poll(cq);
+-      if (rc)
++      rc = rpcrdma_recvcq_poll(cq);
++      if (rc) {
++              dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
++                      __func__, rc);
+               return;
++      }
+       rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       if (rc) {
+-              dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
++              dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               return;
+       }
+-      rpcrdma_cq_poll(cq);
++      rpcrdma_recvcq_poll(cq);
+ }
+ #ifdef RPC_DEBUG
+@@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+                               struct rpcrdma_create_data_internal *cdata)
+ {
+       struct ib_device_attr devattr;
++      struct ib_cq *sendcq, *recvcq;
+       int rc, err;
+       rc = ib_query_device(ia->ri_id->device, &devattr);
+@@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+               ep->rep_attr.cap.max_recv_sge);
+       /* set trigger for requesting send completion */
+-      ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
++      ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
+       if (ep->rep_cqinit <= 2)
+               ep->rep_cqinit = 0;
+       INIT_CQCOUNT(ep);
+@@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       init_waitqueue_head(&ep->rep_connect_wait);
+       INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+-      ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
++      sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+                                 rpcrdma_cq_async_error_upcall, NULL,
+-                                ep->rep_attr.cap.max_recv_wr +
+                                 ep->rep_attr.cap.max_send_wr + 1, 0);
+-      if (IS_ERR(ep->rep_cq)) {
+-              rc = PTR_ERR(ep->rep_cq);
+-              dprintk("RPC:       %s: ib_create_cq failed: %i\n",
++      if (IS_ERR(sendcq)) {
++              rc = PTR_ERR(sendcq);
++              dprintk("RPC:       %s: failed to create send CQ: %i\n",
+                       __func__, rc);
+               goto out1;
+       }
+-      rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
++      rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
+       if (rc) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               goto out2;
+       }
+-      ep->rep_attr.send_cq = ep->rep_cq;
+-      ep->rep_attr.recv_cq = ep->rep_cq;
++      recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
++                                rpcrdma_cq_async_error_upcall, NULL,
++                                ep->rep_attr.cap.max_recv_wr + 1, 0);
++      if (IS_ERR(recvcq)) {
++              rc = PTR_ERR(recvcq);
++              dprintk("RPC:       %s: failed to create recv CQ: %i\n",
++                      __func__, rc);
++              goto out2;
++      }
++
++      rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
++      if (rc) {
++              dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
++                      __func__, rc);
++              ib_destroy_cq(recvcq);
++              goto out2;
++      }
++
++      ep->rep_attr.send_cq = sendcq;
++      ep->rep_attr.recv_cq = recvcq;
+       /* Initialize cma parameters */
+@@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       return 0;
+ out2:
+-      err = ib_destroy_cq(ep->rep_cq);
++      err = ib_destroy_cq(sendcq);
+       if (err)
+               dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+                       __func__, err);
+@@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+               ep->rep_pad_mr = NULL;
+       }
+-      rpcrdma_clean_cq(ep->rep_cq);
+-      rc = ib_destroy_cq(ep->rep_cq);
++      rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++      rc = ib_destroy_cq(ep->rep_attr.recv_cq);
++      if (rc)
++              dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
++                      __func__, rc);
++
++      rpcrdma_clean_cq(ep->rep_attr.send_cq);
++      rc = ib_destroy_cq(ep->rep_attr.send_cq);
+       if (rc)
+               dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
+                       __func__, rc);
+@@ -798,7 +842,9 @@ retry:
+               if (rc && rc != -ENOTCONN)
+                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+                               " status %i\n", __func__, rc);
+-              rpcrdma_clean_cq(ep->rep_cq);
++
++              rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++              rpcrdma_clean_cq(ep->rep_attr.send_cq);
+               xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+               id = rpcrdma_create_id(xprt, ia,
+@@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ {
+       int rc;
+-      rpcrdma_clean_cq(ep->rep_cq);
++      rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++      rpcrdma_clean_cq(ep->rep_attr.send_cq);
+       rc = rdma_disconnect(ia->ri_id);
+       if (!rc) {
+               /* returns without wait if not connected */
+@@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+       ib_dma_sync_single_for_cpu(ia->ri_id->device,
+               rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+-      DECR_CQCOUNT(ep);
+       rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+       if (rc)
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 362a19d..334ab6e 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -79,7 +79,6 @@ struct rpcrdma_ep {
+       int                     rep_cqinit;
+       int                     rep_connected;
+       struct rpcrdma_ia       *rep_ia;
+-      struct ib_cq            *rep_cq;
+       struct ib_qp_init_attr  rep_attr;
+       wait_queue_head_t       rep_connect_wait;
+       struct ib_sge           rep_pad;        /* holds zeroed pad */
+-- 
+1.7.1
+
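The gist of the split is that each completion queue now carries exactly one kind of wr_id, so a handler can trust the pointer it unpacks even when wc->status reports an error and wc->opcode is meaningless. A condensed sketch of the two handlers; the names send_wc()/recv_wc() are shortened for illustration, and the DMA sync and credit accounting from the patch are omitted:

/* Sketch: with split CQs the send handler only ever sees rpcrdma_mw
 * wr_id's and the receive handler only rpcrdma_rep wr_id's, so error
 * completions no longer require trusting wc->opcode.
 */
static void send_wc(struct ib_wc *wc)
{
        struct rpcrdma_mw *mw = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;

        if (!mw || wc->status != IB_WC_SUCCESS)
                return;                         /* plain sends carry wr_id 0 */
        if (wc->opcode == IB_WC_FAST_REG_MR)
                mw->r.frmr.state = FRMR_IS_VALID;
        else if (wc->opcode == IB_WC_LOCAL_INV)
                mw->r.frmr.state = FRMR_IS_INVALID;
}

static void recv_wc(struct ib_wc *wc)
{
        struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id;

        if (wc->status != IB_WC_SUCCESS) {
                rep->rr_len = ~0U;              /* flushed: mark and hand off */
                rpcrdma_schedule_tasklet(rep);
                return;
        }
        if (wc->opcode != IB_WC_RECV)
                return;
        rep->rr_len = wc->byte_len;             /* normal receive */
        rpcrdma_schedule_tasklet(rep);
}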
diff --git a/linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch b/linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch
new file mode 100644 (file)
index 0000000..115ab68
--- /dev/null
@@ -0,0 +1,50 @@
+From 7f23f6f6e388d2003c4ecf5d558f3c2191e12530 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:34 -0400
+Subject: [PATCH 119/132] xprtrmda: Reduce lock contention in completion handlers
+
+Skip the ib_poll_cq() after re-arming, if the provider knows there
+are no additional items waiting. (Have a look at commit ed23a727 for
+more details).
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   14 ++++++++++----
+ 1 files changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index af2d097..c7d5281 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -192,8 +192,11 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+               return;
+       }
+-      rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+-      if (rc) {
++      rc = ib_req_notify_cq(cq,
++                      IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
++      if (rc == 0)
++              return;
++      if (rc < 0) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               return;
+@@ -272,8 +275,11 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+               return;
+       }
+-      rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+-      if (rc) {
++      rc = ib_req_notify_cq(cq,
++                      IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
++      if (rc == 0)
++              return;
++      if (rc < 0) {
+               dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
+                       __func__, rc);
+               return;
+-- 
+1.7.1
+
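The pattern adopted here is the usual poll / re-arm / poll-again sequence, with IB_CQ_REPORT_MISSED_EVENTS used to skip the second poll: ib_req_notify_cq() then returns a positive value if completions may have slipped in before the CQ was re-armed, zero if not, and a negative errno on failure. A minimal sketch, with drain_cq() standing in for the real per-CQ poll loops:

/* Sketch of the re-arm pattern this patch adopts. Only the positive
 * return from ib_req_notify_cq() requires the extra poll that closes
 * the poll/re-arm race.
 */
static int drain_cq(struct ib_cq *cq)
{
        struct ib_wc wc;
        int rc;

        while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) {
                /* dispatch wc to the send or receive handler here */
        }
        return rc;      /* 0 when the CQ is empty, negative on error */
}

static void cq_upcall(struct ib_cq *cq, void *cq_context)
{
        int rc;

        if (drain_cq(cq))
                return;                         /* poll error: give up */
        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
        if (rc == 0)
                return;                         /* nothing slipped in: done */
        if (rc < 0)
                return;                         /* notify failed */
        drain_cq(cq);                           /* missed events: poll once more */
}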
diff --git a/linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch b/linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch
new file mode 100644 (file)
index 0000000..49703e6
--- /dev/null
@@ -0,0 +1,165 @@
+From 1c00dd0776543608e13c74a527660cb8cd28a74f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:42 -0400
+Subject: [PATCH 120/132] xprtrmda: Reduce calls to ib_poll_cq() in completion handlers
+
+Change the completion handlers to grab up to 16 items per
+ib_poll_cq() call. No extra ib_poll_cq() is needed if fewer than 16
+items are returned.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c     |   56 ++++++++++++++++++++++++++------------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    4 +++
+ 2 files changed, 42 insertions(+), 18 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index c7d5281..b8caee9 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -162,14 +162,23 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+ }
+ static int
+-rpcrdma_sendcq_poll(struct ib_cq *cq)
++rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+-      struct ib_wc wc;
+-      int rc;
++      struct ib_wc *wcs;
++      int count, rc;
+-      while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+-              rpcrdma_sendcq_process_wc(&wc);
+-      return rc;
++      do {
++              wcs = ep->rep_send_wcs;
++
++              rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
++              if (rc <= 0)
++                      return rc;
++
++              count = rc;
++              while (count-- > 0)
++                      rpcrdma_sendcq_process_wc(wcs++);
++      } while (rc == RPCRDMA_POLLSIZE);
++      return 0;
+ }
+ /*
+@@ -183,9 +192,10 @@ rpcrdma_sendcq_poll(struct ib_cq *cq)
+ static void
+ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
++      struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+       int rc;
+-      rc = rpcrdma_sendcq_poll(cq);
++      rc = rpcrdma_sendcq_poll(cq, ep);
+       if (rc) {
+               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+                       __func__, rc);
+@@ -202,7 +212,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+               return;
+       }
+-      rpcrdma_sendcq_poll(cq);
++      rpcrdma_sendcq_poll(cq, ep);
+ }
+ static void
+@@ -241,14 +251,23 @@ out_schedule:
+ }
+ static int
+-rpcrdma_recvcq_poll(struct ib_cq *cq)
++rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+-      struct ib_wc wc;
+-      int rc;
++      struct ib_wc *wcs;
++      int count, rc;
+-      while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+-              rpcrdma_recvcq_process_wc(&wc);
+-      return rc;
++      do {
++              wcs = ep->rep_recv_wcs;
++
++              rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
++              if (rc <= 0)
++                      return rc;
++
++              count = rc;
++              while (count-- > 0)
++                      rpcrdma_recvcq_process_wc(wcs++);
++      } while (rc == RPCRDMA_POLLSIZE);
++      return 0;
+ }
+ /*
+@@ -266,9 +285,10 @@ rpcrdma_recvcq_poll(struct ib_cq *cq)
+ static void
+ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
++      struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+       int rc;
+-      rc = rpcrdma_recvcq_poll(cq);
++      rc = rpcrdma_recvcq_poll(cq, ep);
+       if (rc) {
+               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
+                       __func__, rc);
+@@ -285,7 +305,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+               return;
+       }
+-      rpcrdma_recvcq_poll(cq);
++      rpcrdma_recvcq_poll(cq, ep);
+ }
+ #ifdef RPC_DEBUG
+@@ -721,7 +741,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+       sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+-                                rpcrdma_cq_async_error_upcall, NULL,
++                                rpcrdma_cq_async_error_upcall, ep,
+                                 ep->rep_attr.cap.max_send_wr + 1, 0);
+       if (IS_ERR(sendcq)) {
+               rc = PTR_ERR(sendcq);
+@@ -738,7 +758,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+       }
+       recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+-                                rpcrdma_cq_async_error_upcall, NULL,
++                                rpcrdma_cq_async_error_upcall, ep,
+                                 ep->rep_attr.cap.max_recv_wr + 1, 0);
+       if (IS_ERR(recvcq)) {
+               rc = PTR_ERR(recvcq);
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 334ab6e..cb4c882 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -74,6 +74,8 @@ struct rpcrdma_ia {
+  * RDMA Endpoint -- one per transport instance
+  */
++#define RPCRDMA_POLLSIZE      (16)
++
+ struct rpcrdma_ep {
+       atomic_t                rep_cqcount;
+       int                     rep_cqinit;
+@@ -88,6 +90,8 @@ struct rpcrdma_ep {
+       struct rdma_conn_param  rep_remote_cma;
+       struct sockaddr_storage rep_remote_addr;
+       struct delayed_work     rep_connect_worker;
++      struct ib_wc            rep_send_wcs[RPCRDMA_POLLSIZE];
++      struct ib_wc            rep_recv_wcs[RPCRDMA_POLLSIZE];
+ };
+ #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+-- 
+1.7.1
+
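With this change each pass over the CQ hands back up to RPCRDMA_POLLSIZE work completions in one call, and the handler stops as soon as a partial batch comes back, since a short batch proves the CQ is now empty. A sketch of the batched loop; process_wc() is an illustrative stand-in for the real per-completion handlers:

/* Sketch of the 16-at-a-time poll loop introduced by this patch. */
static int batched_poll(struct ib_cq *cq, struct ib_wc *wcs /* [16] */)
{
        int count, rc;

        do {
                rc = ib_poll_cq(cq, 16, wcs);   /* RPCRDMA_POLLSIZE */
                if (rc <= 0)
                        return rc;              /* empty (0) or error (<0) */
                for (count = 0; count < rc; count++) {
                        /* process_wc(&wcs[count]) would run here */
                }
        } while (rc == 16);                     /* full batch: CQ may hold more */
        return 0;
}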
diff --git a/linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch b/linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch
new file mode 100644 (file)
index 0000000..9b7c86e
--- /dev/null
@@ -0,0 +1,79 @@
+From 8301a2c047cc25dabd645e5590c1db0ead4c5af4 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:51 -0400
+Subject: [PATCH 121/132] xprtrdma: Limit work done by completion handler
+
+Sagi Grimberg <sagig@dev.mellanox.co.il> points out that a steady
+stream of CQ events could starve other work because of the boundless
+loop pooling in rpcrdma_{send,recv}_poll().
+
+Instead of a (potentially infinite) while loop, return after
+collecting a budgeted number of completions.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Acked-by: Sagi Grimberg <sagig@dev.mellanox.co.il>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c     |   10 ++++++----
+ net/sunrpc/xprtrdma/xprt_rdma.h |    1 +
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index b8caee9..1d08366 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -165,8 +165,9 @@ static int
+ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+       struct ib_wc *wcs;
+-      int count, rc;
++      int budget, count, rc;
++      budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+       do {
+               wcs = ep->rep_send_wcs;
+@@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+               count = rc;
+               while (count-- > 0)
+                       rpcrdma_sendcq_process_wc(wcs++);
+-      } while (rc == RPCRDMA_POLLSIZE);
++      } while (rc == RPCRDMA_POLLSIZE && --budget);
+       return 0;
+ }
+@@ -254,8 +255,9 @@ static int
+ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+       struct ib_wc *wcs;
+-      int count, rc;
++      int budget, count, rc;
++      budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+       do {
+               wcs = ep->rep_recv_wcs;
+@@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+               count = rc;
+               while (count-- > 0)
+                       rpcrdma_recvcq_process_wc(wcs++);
+-      } while (rc == RPCRDMA_POLLSIZE);
++      } while (rc == RPCRDMA_POLLSIZE && --budget);
+       return 0;
+ }
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cb4c882..0c3b88e 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -74,6 +74,7 @@ struct rpcrdma_ia {
+  * RDMA Endpoint -- one per transport instance
+  */
++#define RPCRDMA_WC_BUDGET     (128)
+ #define RPCRDMA_POLLSIZE      (16)
+ struct rpcrdma_ep {
+-- 
+1.7.1
+
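Taken together with the previous patch, a handler now drains at most RPCRDMA_WC_BUDGET (128) completions per upcall, RPCRDMA_POLLSIZE (16) at a time, i.e. at most 128/16 = 8 ib_poll_cq() calls before it yields. A sketch of the bounded loop; bounded_poll() and the local POLLSIZE/WC_BUDGET macros are illustrative stand-ins for the real functions:

#define POLLSIZE        16              /* RPCRDMA_POLLSIZE */
#define WC_BUDGET       128             /* RPCRDMA_WC_BUDGET */

/* Sketch: the poll loop stops either when the CQ drains or when the
 * budget of POLLSIZE-sized batches is spent, so one busy CQ cannot
 * monopolize the completion handler.
 */
static int bounded_poll(struct ib_cq *cq, struct ib_wc *wcs /* [POLLSIZE] */)
{
        int budget = WC_BUDGET / POLLSIZE;      /* 128 / 16 = 8 batches */
        int count, rc;

        do {
                rc = ib_poll_cq(cq, POLLSIZE, wcs);
                if (rc <= 0)
                        return rc;              /* empty (0) or error (<0) */
                for (count = 0; count < rc; count++) {
                        /* handle wcs[count]: send or receive completion */
                }
        } while (rc == POLLSIZE && --budget);
        return 0;
}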
diff --git a/linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch b/linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch
new file mode 100644 (file)
index 0000000..cb74e73
--- /dev/null
@@ -0,0 +1,128 @@
+From 65866f8259851cea5e356d2fd46fc37a4e26330e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:59 -0400
+Subject: [PATCH 122/132] xprtrdma: Reduce the number of hardway buffer allocations
+
+While marshaling an RPC/RDMA request, the inline_{rsize,wsize}
+settings determine whether an inline request is used, or whether
+read or write chunks lists are built. The current default value of
+these settings is 1024. Any RPC request smaller than 1024 bytes is
+sent to the NFS server completely inline.
+
+rpcrdma_buffer_create() allocates and pre-registers a set of RPC
+buffers for each transport instance, also based on the inline rsize
+and wsize settings.
+
+RPC/RDMA requests and replies are built in these buffers. However,
+if an RPC/RDMA request is expected to be larger than 1024, a buffer
+has to be allocated and registered for that RPC, and deregistered

+and released when the RPC is complete. This is known as a
+"hardway allocation."
+
+Since the introduction of NFSv4, the size of RPC requests has become
+larger, and hardway allocations are thus more frequent. Hardway
+allocations are significant overhead, and they waste the existing
+RPC buffers pre-allocated by rpcrdma_buffer_create().
+
+We'd like fewer hardway allocations.
+
+Increasing the size of the pre-registered buffers is the most direct
+way to do this. However, a blanket increase of the inline thresholds
+has interoperability consequences.
+
+On my 64-bit system, rpcrdma_buffer_create() requests roughly 7000
+bytes for each RPC request buffer, using kmalloc(). Due to internal
+fragmentation, this wastes nearly 1200 bytes because kmalloc()
+already returns an 8192-byte piece of memory for a 7000-byte
+allocation request, though the extra space remains unused.
+
+So let's round up the size of the pre-allocated buffers, and make
+use of the unused space in the kmalloc'd memory.
+
+This change reduces the amount of hardway allocated memory for an
+NFSv4 general connectathon run from 1322092 to 9472 bytes (99%).
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   25 +++++++++++++------------
+ 1 files changed, 13 insertions(+), 12 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 1d08366..c80995a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -50,6 +50,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/pci.h>        /* for Tavor hack below */
+ #include <linux/slab.h>
++#include <asm/bitops.h>
+ #include "xprt_rdma.h"
+@@ -1005,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+       struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+ {
+       char *p;
+-      size_t len;
++      size_t len, rlen, wlen;
+       int i, rc;
+       struct rpcrdma_mw *r;
+@@ -1120,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+        * Allocate/init the request/reply buffers. Doing this
+        * using kmalloc for now -- one for each buf.
+        */
++      wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
++      rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
++      dprintk("RPC:       %s: wlen = %zu, rlen = %zu\n",
++              __func__, wlen, rlen);
++
+       for (i = 0; i < buf->rb_max_requests; i++) {
+               struct rpcrdma_req *req;
+               struct rpcrdma_rep *rep;
+-              len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
+-              /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
+-              /* Typical ~2400b, so rounding up saves work later */
+-              if (len < 4096)
+-                      len = 4096;
+-              req = kmalloc(len, GFP_KERNEL);
++              req = kmalloc(wlen, GFP_KERNEL);
+               if (req == NULL) {
+                       dprintk("RPC:       %s: request buffer %d alloc"
+                               " failed\n", __func__, i);
+@@ -1141,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+               buf->rb_send_bufs[i]->rl_buffer = buf;
+               rc = rpcrdma_register_internal(ia, req->rl_base,
+-                              len - offsetof(struct rpcrdma_req, rl_base),
++                              wlen - offsetof(struct rpcrdma_req, rl_base),
+                               &buf->rb_send_bufs[i]->rl_handle,
+                               &buf->rb_send_bufs[i]->rl_iov);
+               if (rc)
+                       goto out;
+-              buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
++              buf->rb_send_bufs[i]->rl_size = wlen -
++                                              sizeof(struct rpcrdma_req);
+-              len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
+-              rep = kmalloc(len, GFP_KERNEL);
++              rep = kmalloc(rlen, GFP_KERNEL);
+               if (rep == NULL) {
+                       dprintk("RPC:       %s: reply buffer %d alloc failed\n",
+                               __func__, i);
+@@ -1162,7 +1163,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+               buf->rb_recv_bufs[i]->rr_buffer = buf;
+               rc = rpcrdma_register_internal(ia, rep->rr_base,
+-                              len - offsetof(struct rpcrdma_rep, rr_base),
++                              rlen - offsetof(struct rpcrdma_rep, rr_base),
+                               &buf->rb_recv_bufs[i]->rr_handle,
+                               &buf->rb_recv_bufs[i]->rr_iov);
+               if (rc)
+-- 
+1.7.1
+
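To make the arithmetic above concrete: fls() (from <asm/bitops.h>, which this patch adds) returns the 1-based index of the highest set bit, so fls(7000) == 13 and 1 << 13 == 8192, exactly the slab size kmalloc() was already handing back for a 7000-byte request. A small sketch; alloc_rounded_buffer() and the 7000-byte figure (taken from the description above) are illustrative:

/* Worked example of the power-of-two rounding this patch introduces. */
static void *alloc_rounded_buffer(unsigned int need)
{
        unsigned int wlen = 1U << fls(need);    /* need = 7000 -> wlen = 8192 */

        /* kmalloc(7000) already comes out of the 8192-byte slab, so
         * requesting wlen bytes costs nothing extra and the ~1200 bytes
         * of former internal fragmentation become usable buffer space. */
        return kmalloc(wlen, GFP_KERNEL);
}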
diff --git a/linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch b/linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch
new file mode 100644 (file)
index 0000000..3581bee
--- /dev/null
@@ -0,0 +1,94 @@
+From ec62f40d3505a643497d105c297093bb90afd44e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:07 -0400
+Subject: [PATCH 123/132] xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
+
+Devesh Sharma <Devesh.Sharma@Emulex.Com> reports that after a
+disconnect, his HCA is failing to create a fresh QP, leaving
+ia_ri->ri_id->qp set to NULL. But xprtrdma still allows RPCs to
+wake up and post LOCAL_INV as they exit, causing an oops.
+
+rpcrdma_ep_connect() is allowing the wake-up by leaking the QP
+creation error code (-EPERM in this case) to the RPC client's
+generic layer. xprt_connect_status() does not recognize -EPERM, so
+it kills pending RPC tasks immediately rather than retrying the
+connect.
+
+Re-arrange the QP creation logic so that when it fails on reconnect,
+it leaves ->qp with the old QP rather than NULL.  If pending RPC
+tasks wake and exit, LOCAL_INV work requests will flush rather than
+oops.
+
+On initial connect, leaving ->qp == NULL is OK, since there are no
+pending RPCs that might use ->qp. But be sure not to try to destroy
+a NULL QP when rpcrdma_ep_connect() is retried.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   29 ++++++++++++++++++++---------
+ 1 files changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index c80995a..54edf2a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -867,6 +867,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+       if (ep->rep_connected != 0) {
+               struct rpcrdma_xprt *xprt;
+ retry:
++              dprintk("RPC:       %s: reconnecting...\n", __func__);
+               rc = rpcrdma_ep_disconnect(ep, ia);
+               if (rc && rc != -ENOTCONN)
+                       dprintk("RPC:       %s: rpcrdma_ep_disconnect"
+@@ -879,7 +880,7 @@ retry:
+               id = rpcrdma_create_id(xprt, ia,
+                               (struct sockaddr *)&xprt->rx_data.addr);
+               if (IS_ERR(id)) {
+-                      rc = PTR_ERR(id);
++                      rc = -EHOSTUNREACH;
+                       goto out;
+               }
+               /* TEMP TEMP TEMP - fail if new device:
+@@ -893,20 +894,30 @@ retry:
+                       printk("RPC:       %s: can't reconnect on "
+                               "different device!\n", __func__);
+                       rdma_destroy_id(id);
+-                      rc = -ENETDOWN;
++                      rc = -ENETUNREACH;
+                       goto out;
+               }
+               /* END TEMP */
++              rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
++              if (rc) {
++                      dprintk("RPC:       %s: rdma_create_qp failed %i\n",
++                              __func__, rc);
++                      rdma_destroy_id(id);
++                      rc = -ENETUNREACH;
++                      goto out;
++              }
+               rdma_destroy_qp(ia->ri_id);
+               rdma_destroy_id(ia->ri_id);
+               ia->ri_id = id;
+-      }
+-
+-      rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+-      if (rc) {
+-              dprintk("RPC:       %s: rdma_create_qp failed %i\n",
+-                      __func__, rc);
+-              goto out;
++      } else {
++              dprintk("RPC:       %s: connecting...\n", __func__);
++              rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
++              if (rc) {
++                      dprintk("RPC:       %s: rdma_create_qp failed %i\n",
++                              __func__, rc);
++                      /* do not update ep->rep_connected */
++                      return -ENETUNREACH;
++              }
+       }
+ /* XXX Tavor device performs badly with 2K MTU! */
+-- 
+1.7.1
+
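The fix above follows a create-before-destroy ordering: the replacement QP is
created first, and only if that succeeds are the old QP and CM ID torn down and
the new ID swapped in, so a pending RPC never observes a NULL ->qp. Below is a
minimal standalone sketch of that ordering; the resource type and the
create/destroy/swap helpers are hypothetical stand-ins, not the RDMA CM API.

    /* Illustrative sketch only: create-before-destroy swap, mirroring the
     * rpcrdma_ep_connect() reordering above. All names are hypothetical. */
    #include <stdio.h>
    #include <stdlib.h>

    struct resource { int id; };

    static struct resource *create_resource(int id)
    {
            struct resource *r = malloc(sizeof(*r));
            if (!r)
                    return NULL;
            r->id = id;
            return r;
    }

    static void destroy_resource(struct resource *r)
    {
            free(r);
    }

    /* Replace *slot with a fresh resource. On failure the old resource is
     * left in place, so concurrent users never see a NULL slot. */
    static int swap_resource(struct resource **slot, int new_id)
    {
            struct resource *fresh = create_resource(new_id);

            if (!fresh)
                    return -1;              /* keep the old resource */
            destroy_resource(*slot);
            *slot = fresh;
            return 0;
    }

    int main(void)
    {
            struct resource *qp = create_resource(1);

            if (swap_resource(&qp, 2))
                    fprintf(stderr, "reconnect failed, old resource kept\n");
            printf("active id %d\n", qp->id);
            destroy_resource(qp);
            return 0;
    }

The same shape applies to any handle that concurrent users may still
dereference while a reconnect is in flight.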
diff --git a/linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch b/linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch
new file mode 100644 (file)
index 0000000..4fde4bf
--- /dev/null
@@ -0,0 +1,55 @@
+From 5bc4bc729275a0bfc2bfd04466e8ab7c85af2f6e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:16 -0400
+Subject: [PATCH 124/132] xprtrdma: Remove Tavor MTU setting
+
+Clean up.  Remove HCA-specific clutter in xprtrdma, which is
+supposed to be device-independent.
+
+Hal Rosenstock <hal@dev.mellanox.co.il> observes:
+> Note that there is OpenSM option (enable_quirks) to return 1K MTU
+> in SA PathRecord responses for Tavor so that can be used for this.
+> The default setting for enable_quirks is FALSE so that would need
+> changing.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c |   14 --------------
+ 1 files changed, 0 insertions(+), 14 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 54edf2a..515dfc1 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -48,7 +48,6 @@
+  */
+ #include <linux/interrupt.h>
+-#include <linux/pci.h>        /* for Tavor hack below */
+ #include <linux/slab.h>
+ #include <asm/bitops.h>
+@@ -920,19 +919,6 @@ retry:
+               }
+       }
+-/* XXX Tavor device performs badly with 2K MTU! */
+-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
+-      struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
+-      if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
+-          (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
+-           pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
+-              struct ib_qp_attr attr = {
+-                      .path_mtu = IB_MTU_1024
+-              };
+-              rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
+-      }
+-}
+-
+       ep->rep_connected = 0;
+       rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch b/linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch
new file mode 100644 (file)
index 0000000..cdf6409
--- /dev/null
@@ -0,0 +1,38 @@
+From 196c69989d84ab902bbe545f7bd8ce78ee74dac4 Mon Sep 17 00:00:00 2001
+From: Shirley Ma <shirley.ma@oracle.com>
+Date: Wed, 28 May 2014 10:34:24 -0400
+Subject: [PATCH 125/132] xprtrdma: Allocate missing pagelist
+
+GETACL relies on the transport layer to allocate memory for the reply
+buffer. However, xprtrdma assumes that the reply buffer (pagelist) has
+been pre-allocated by the upper layer. This problem was reported by an
+IOL OFA lab test on PPC.
+
+Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Edward Mossman <emossman@iol.unh.edu>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c |    6 ++++++
+ 1 files changed, 6 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 436d229..dc4a826 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -99,6 +99,12 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+       page_base = xdrbuf->page_base & ~PAGE_MASK;
+       p = 0;
+       while (len && n < nsegs) {
++              if (!ppages[p]) {
++                      /* alloc the pagelist for receiving buffer */
++                      ppages[p] = alloc_page(GFP_ATOMIC);
++                      if (!ppages[p])
++                              return 0;
++              }
+               seg[n].mr_page = ppages[p];
+               seg[n].mr_offset = (void *)(unsigned long) page_base;
+               seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+-- 
+1.7.1
+
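The one-hunk fix above allocates a missing reply page on the spot and fails the
conversion if the allocation does not succeed. A small userspace sketch of the
same fill-in-missing-slots idea, assuming hypothetical names (PAGE_SZ,
fill_missing_pages) and plain calloc in place of alloc_page(GFP_ATOMIC):

    /* Illustrative sketch: fill in missing pagelist entries on demand. */
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SZ 4096

    static int fill_missing_pages(void **pages, int npages)
    {
            for (int p = 0; p < npages; p++) {
                    if (pages[p])
                            continue;       /* caller already provided this page */
                    pages[p] = calloc(1, PAGE_SZ);
                    if (!pages[p])
                            return -1;      /* out of memory: fail the request */
            }
            return 0;
    }

    int main(void)
    {
            void *pages[4] = { NULL, malloc(PAGE_SZ), NULL, NULL };

            if (fill_missing_pages(pages, 4))
                    fprintf(stderr, "allocation failed\n");
            for (int p = 0; p < 4; p++)
                    printf("page[%d] %s\n", p, pages[p] ? "present" : "missing");
            for (int p = 0; p < 4; p++)
                    free(pages[p]);
            return 0;
    }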
diff --git a/linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch b/linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch
new file mode 100644 (file)
index 0000000..79de6d2
--- /dev/null
@@ -0,0 +1,61 @@
+From bfaee096deaa680195df5491eb650f81051c145d Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:32 -0400
+Subject: [PATCH 126/132] xprtrdma: Use macros for reconnection timeout constants
+
+Clean up: Ensure the same max and min constant values are used
+everywhere when setting reconnect timeouts.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c |   19 ++++++++++++-------
+ 1 files changed, 12 insertions(+), 7 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index d18b2a3..6b84d7d 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = {
+ #endif
++#define RPCRDMA_BIND_TO               (60U * HZ)
++#define RPCRDMA_INIT_REEST_TO (5U * HZ)
++#define RPCRDMA_MAX_REEST_TO  (30U * HZ)
++#define RPCRDMA_IDLE_DISC_TO  (5U * 60 * HZ)
++
+ static struct rpc_xprt_ops xprt_rdma_procs;   /* forward reference */
+ static void
+@@ -285,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
+       /* 60 second timeout, no retries */
+       xprt->timeout = &xprt_rdma_default_timeout;
+-      xprt->bind_timeout = (60U * HZ);
+-      xprt->reestablish_timeout = (5U * HZ);
+-      xprt->idle_timeout = (5U * 60 * HZ);
++      xprt->bind_timeout = RPCRDMA_BIND_TO;
++      xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
++      xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+       xprt->resvport = 0;             /* privileged port not needed */
+       xprt->tsh_size = 0;             /* RPC-RDMA handles framing */
+@@ -432,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+               schedule_delayed_work(&r_xprt->rdma_connect,
+                       xprt->reestablish_timeout);
+               xprt->reestablish_timeout <<= 1;
+-              if (xprt->reestablish_timeout > (30 * HZ))
+-                      xprt->reestablish_timeout = (30 * HZ);
+-              else if (xprt->reestablish_timeout < (5 * HZ))
+-                      xprt->reestablish_timeout = (5 * HZ);
++              if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
++                      xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
++              else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
++                      xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+       } else {
+               schedule_delayed_work(&r_xprt->rdma_connect, 0);
+               if (!RPC_IS_ASYNC(task))
+-- 
+1.7.1
+
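The reconnect path that uses these macros doubles the re-establish timeout
after each attempt and clamps it between RPCRDMA_INIT_REEST_TO and
RPCRDMA_MAX_REEST_TO. A standalone sketch of that clamped exponential backoff
follows; the two macro values are copied from the patch, but HZ,
next_reestablish_timeout() and the driver loop are assumptions for
illustration only.

    /* Illustrative sketch: clamped exponential reconnect backoff. */
    #include <stdio.h>

    #define HZ                      1000U
    #define RPCRDMA_INIT_REEST_TO   (5U * HZ)
    #define RPCRDMA_MAX_REEST_TO    (30U * HZ)

    static unsigned int next_reestablish_timeout(unsigned int cur)
    {
            cur <<= 1;                      /* double after every failed attempt */
            if (cur > RPCRDMA_MAX_REEST_TO)
                    cur = RPCRDMA_MAX_REEST_TO;
            else if (cur < RPCRDMA_INIT_REEST_TO)
                    cur = RPCRDMA_INIT_REEST_TO;
            return cur;
    }

    int main(void)
    {
            unsigned int to = RPCRDMA_INIT_REEST_TO;

            for (int attempt = 1; attempt <= 5; attempt++) {
                    printf("attempt %d: wait %u ticks\n", attempt, to);
                    to = next_reestablish_timeout(to);
            }
            return 0;
    }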
diff --git a/linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch b/linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch
new file mode 100644 (file)
index 0000000..8a678dc
--- /dev/null
@@ -0,0 +1,33 @@
+From 18906972aa1103c07869c9b43860a52e0e27e8e5 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:41 -0400
+Subject: [PATCH 127/132] xprtrdma: Reset connection timeout after successful reconnect
+
+If the new connection is able to make forward progress, reset the
+re-establish timeout. Otherwise it keeps growing even if disconnect
+events are rare.
+
+The same behavior as TCP is adopted: reconnect immediately if the
+transport instance has been able to make some forward progress.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c |    1 +
+ 1 files changed, 1 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index dc4a826..ac65b0c 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -770,6 +770,7 @@ repost:
+       /* from here on, the reply is no longer an orphan */
+       req->rl_reply = rep;
++      xprt->reestablish_timeout = 0;
+       /* check for expected message types */
+       /* The order of some of these tests is important. */
+-- 
+1.7.1
+
diff --git a/linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch b/linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch
new file mode 100644 (file)
index 0000000..1578dbb
--- /dev/null
@@ -0,0 +1,104 @@
+From e7ce710a8802351bd4118c5d6136c1d850f67cf9 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:57 -0400
+Subject: [PATCH 128/132] xprtrdma: Avoid deadlock when credit window is reset
+
+Update the cwnd while processing the server's reply.  Otherwise the
+next task on the xprt_sending queue is still subject to the old
+credit window. Currently, no task is awoken if the old congestion
+window is still exceeded, even if the new window is larger, and a
+deadlock results.
+
+This is an issue during a transport reconnect. Servers don't
+normally shrink the credit window, but the client does reset it to
+1 when reconnecting so the server can safely grow it again.
+
+As a minor optimization, remove the hack of grabbing the initial
+cwnd size (which happens to be RPC_CWNDSCALE) and using that value
+as the congestion scaling factor. The scaling value is invariant,
+and we are better off without the multiplication operation.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |    6 ++++++
+ net/sunrpc/xprtrdma/transport.c |   19 +------------------
+ net/sunrpc/xprtrdma/xprt_rdma.h |    1 -
+ 3 files changed, 7 insertions(+), 19 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index ac65b0c..77b84cf 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -716,6 +716,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       __be32 *iptr;
+       int rdmalen, status;
++      unsigned long cwnd;
+       /* Check status. If bad, signal disconnect and return rep to pool */
+       if (rep->rr_len == ~0U) {
+@@ -845,6 +846,11 @@ badheader:
+               break;
+       }
++      cwnd = xprt->cwnd;
++      xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
++      if (xprt->cwnd > cwnd)
++              xprt_release_rqst_cong(rqst->rq_task);
++
+       dprintk("RPC:       %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+                       __func__, xprt, rqst, status);
+       xprt_complete_rqst(rqst->rq_task, status);
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 6b84d7d..187894b 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -448,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+       }
+ }
+-static int
+-xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+-{
+-      struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+-      int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
+-
+-      /* == RPC_CWNDSCALE @ init, but *after* setup */
+-      if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
+-              r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
+-              dprintk("RPC:       %s: cwndscale %lu\n", __func__,
+-                      r_xprt->rx_buf.rb_cwndscale);
+-              BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
+-      }
+-      xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
+-      return xprt_reserve_xprt_cong(xprt, task);
+-}
+-
+ /*
+  * The RDMA allocate/free functions need the task structure as a place
+  * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+@@ -686,7 +669,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+  */
+ static struct rpc_xprt_ops xprt_rdma_procs = {
+-      .reserve_xprt           = xprt_rdma_reserve_xprt,
++      .reserve_xprt           = xprt_reserve_xprt_cong,
+       .release_xprt           = xprt_release_xprt_cong, /* sunrpc/xprt.c */
+       .alloc_slot             = xprt_alloc_slot,
+       .release_request        = xprt_release_rqst_cong,       /* ditto */
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 0c3b88e..89e7cd4 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -212,7 +212,6 @@ struct rpcrdma_req {
+ struct rpcrdma_buffer {
+       spinlock_t      rb_lock;        /* protects indexes */
+       atomic_t        rb_credits;     /* most recent server credits */
+-      unsigned long   rb_cwndscale;   /* cached framework rpc_cwndscale */
+       int             rb_max_requests;/* client max requests */
+       struct list_head rb_mws;        /* optional memory windows/fmrs/frmrs */
+       int             rb_send_index;
+-- 
+1.7.1
+
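The reply-handler hunk above derives the congestion window directly from the
credits the server just advertised (credits << RPC_CWNDSHIFT) and releases the
current request's congestion slot only when the window grew, which is what
breaks the deadlock after a reconnect resets credits to 1. A minimal sketch of
that update rule; the shift value and wake_next_task() are assumptions here,
not values taken from the kernel headers.

    /* Illustrative sketch: credits -> congestion window, wake on growth. */
    #include <stdio.h>

    #define CWNDSHIFT 10U   /* assumed scaling, analogous to RPC_CWNDSHIFT */

    struct xprt_state {
            unsigned long cwnd;
            unsigned int  credits;
    };

    static void wake_next_task(void)
    {
            printf("window grew: waking next queued task\n");
    }

    static void update_cwnd(struct xprt_state *x, unsigned int credits)
    {
            unsigned long old = x->cwnd;

            x->credits = credits;
            x->cwnd = (unsigned long)credits << CWNDSHIFT;
            if (x->cwnd > old)
                    wake_next_task();
    }

    int main(void)
    {
            struct xprt_state x = { .cwnd = 1UL << CWNDSHIFT, .credits = 1 };

            update_cwnd(&x, 32);    /* server grants more credits after reconnect */
            printf("cwnd now %lu\n", x.cwnd);
            return 0;
    }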
diff --git a/linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch b/linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch
new file mode 100644 (file)
index 0000000..809584e
--- /dev/null
@@ -0,0 +1,83 @@
+From c977dea22708688eae31774f70126c97aa4dfe83 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:35:06 -0400
+Subject: [PATCH 129/132] xprtrdma: Remove BUG_ON() call sites
+
+If an error occurs in the marshaling logic, fail the RPC request
+being processed, but leave the client running.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c |    3 ++-
+ net/sunrpc/xprtrdma/verbs.c     |   18 ++++++++++--------
+ 2 files changed, 12 insertions(+), 9 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 187894b..93fe775 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -463,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+       struct rpcrdma_req *req, *nreq;
+       req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+-      BUG_ON(NULL == req);
++      if (req == NULL)
++              return NULL;
+       if (size > req->rl_size) {
+               dprintk("RPC:       %s: size %zd too large for buffer[%zd]: "
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 515dfc1..13dbd1c 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1302,7 +1302,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
+       int i;
+       unsigned long flags;
+-      BUG_ON(req->rl_nchunks != 0);
+       spin_lock_irqsave(&buffers->rb_lock, flags);
+       buffers->rb_send_bufs[--buffers->rb_send_index] = req;
+       req->rl_niovs = 0;
+@@ -1535,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+       } else
+               post_wr = &frmr_wr;
+-      /* Bump the key */
+-      key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+-      ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+-
+       /* Prepare FRMR WR */
+       memset(&frmr_wr, 0, sizeof frmr_wr);
+       frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+@@ -1549,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+       frmr_wr.wr.fast_reg.page_list_len = page_no;
+       frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+       frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+-      BUG_ON(frmr_wr.wr.fast_reg.length < len);
++      if (frmr_wr.wr.fast_reg.length < len) {
++              while (seg1->mr_nsegs--)
++                      rpcrdma_unmap_one(ia, seg++);
++              return -EIO;
++      }
++
++      /* Bump the key */
++      key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
++      ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
++
+       frmr_wr.wr.fast_reg.access_flags = (writing ?
+                               IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+                               IB_ACCESS_REMOTE_READ);
+@@ -1709,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+       case RPCRDMA_ALLPHYSICAL:
+-              BUG_ON(nsegs != 1);
+               rpcrdma_unmap_one(ia, seg);
+-              rc = 0;
+               break;
+ #endif
+-- 
+1.7.1
+
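The replacements above turn fatal assertions into per-request failures: when
the fast-register length check fails, the segments mapped so far are unwound
and -EIO is returned, so only the current RPC dies while the client keeps
running. A tiny sketch of that unwind-and-return shape, with purely
hypothetical helpers:

    /* Illustrative sketch: fail one request instead of crashing, unwinding
     * partial work first. EIO comes from errno.h; everything else is made up. */
    #include <errno.h>
    #include <stdio.h>

    #define NSEGS 4

    static void unmap_segment(int i)
    {
            printf("unmapped segment %d\n", i);
    }

    static int register_segments(size_t mapped_len, size_t needed_len)
    {
            if (mapped_len < needed_len) {
                    /* previously a BUG_ON(); now unwind and report the error */
                    for (int i = NSEGS - 1; i >= 0; i--)
                            unmap_segment(i);
                    return -EIO;
            }
            return 0;
    }

    int main(void)
    {
            int rc = register_segments(2048, 4096);

            if (rc)
                    fprintf(stderr, "registration failed: %d (request fails, client keeps running)\n", rc);
            return 0;
    }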
diff --git a/linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch b/linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch
new file mode 100644 (file)
index 0000000..e943208
--- /dev/null
@@ -0,0 +1,215 @@
+From c93c62231cf55df4a26bd08937efeea97e6fc5e8 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:35:14 -0400
+Subject: [PATCH 130/132] xprtrdma: Disconnect on registration failure
+
+If rpcrdma_register_external() fails during request marshaling, the
+current RPC request is killed. Instead, this RPC should be retried
+after reconnecting the transport instance.
+
+The most likely reason for registration failure with FRMR is a
+failed post_send, which would be due to a remote transport
+disconnect or memory exhaustion. Retrying the request recovers
+from these failures.
+
+Problems encountered in the marshaling logic itself will not be
+corrected by trying again, so these should still kill a request.
+
+Now that we've added a clean exit for marshaling errors, take the
+opportunity to defang some BUG_ON's.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c  |   48 +++++++++++++++++++++++++-------------
+ net/sunrpc/xprtrdma/transport.c |   17 +++++++++-----
+ 2 files changed, 42 insertions(+), 23 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 77b84cf..693966d 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -77,6 +77,8 @@ static const char transfertypes[][12] = {
+  * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+  * elements. Segments are then coalesced when registered, if possible
+  * within the selected memreg mode.
++ *
++ * Returns positive number of segments converted, or a negative errno.
+  */
+ static int
+@@ -103,12 +105,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+                       /* alloc the pagelist for receiving buffer */
+                       ppages[p] = alloc_page(GFP_ATOMIC);
+                       if (!ppages[p])
+-                              return 0;
++                              return -ENOMEM;
+               }
+               seg[n].mr_page = ppages[p];
+               seg[n].mr_offset = (void *)(unsigned long) page_base;
+               seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+-              BUG_ON(seg[n].mr_len > PAGE_SIZE);
++              if (seg[n].mr_len > PAGE_SIZE)
++                      return -EIO;
+               len -= seg[n].mr_len;
+               ++n;
+               ++p;
+@@ -117,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+       /* Message overflows the seg array */
+       if (len && n == nsegs)
+-              return 0;
++              return -EIO;
+       if (xdrbuf->tail[0].iov_len) {
+               /* the rpcrdma protocol allows us to omit any trailing
+@@ -126,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+                       return n;
+               if (n == nsegs)
+                       /* Tail remains, but we're out of segments */
+-                      return 0;
++                      return -EIO;
+               seg[n].mr_page = NULL;
+               seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+               seg[n].mr_len = xdrbuf->tail[0].iov_len;
+@@ -167,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+  *  Reply chunk (a counted array):
+  *   N elements:
+  *    1 - N - HLOO - HLOO - ... - HLOO
++ *
++ * Returns positive RPC/RDMA header size, or negative errno.
+  */
+-static unsigned int
++static ssize_t
+ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+               struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+ {
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+-      int nsegs, nchunks = 0;
++      int n, nsegs, nchunks = 0;
+       unsigned int pos;
+       struct rpcrdma_mr_seg *seg = req->rl_segments;
+       struct rpcrdma_read_chunk *cur_rchunk = NULL;
+@@ -201,11 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+               pos = target->head[0].iov_len;
+       nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+-      if (nsegs == 0)
+-              return 0;
++      if (nsegs < 0)
++              return nsegs;
+       do {
+-              int n = rpcrdma_register_external(seg, nsegs,
++              n = rpcrdma_register_external(seg, nsegs,
+                                               cur_wchunk != NULL, r_xprt);
+               if (n <= 0)
+                       goto out;
+@@ -277,7 +282,7 @@ out:
+       for (pos = 0; nchunks--;)
+               pos += rpcrdma_deregister_external(
+                               &req->rl_segments[pos], r_xprt);
+-      return 0;
++      return n;
+ }
+ /*
+@@ -359,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+  *  [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+  *  [2] -- optional padding.
+  *  [3] -- if padded, header only in [1] and data here.
++ *
++ * Returns zero on success, otherwise a negative errno.
+  */
+ int
+@@ -368,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       char *base;
+-      size_t hdrlen, rpclen, padlen;
++      size_t rpclen, padlen;
++      ssize_t hdrlen;
+       enum rpcrdma_chunktype rtype, wtype;
+       struct rpcrdma_msg *headerp;
+@@ -439,7 +447,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+       /* The following simplification is not true forever */
+       if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+               wtype = rpcrdma_noch;
+-      BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
++      if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
++              dprintk("RPC:       %s: cannot marshal multiple chunk lists\n",
++                      __func__);
++              return -EIO;
++      }
+       hdrlen = 28; /*sizeof *headerp;*/
+       padlen = 0;
+@@ -464,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+                       headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+                       headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+                       hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+-                      BUG_ON(wtype != rpcrdma_noch);
+-
++                      if (wtype != rpcrdma_noch) {
++                              dprintk("RPC:       %s: invalid chunk list\n",
++                                      __func__);
++                              return -EIO;
++                      }
+               } else {
+                       headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+                       headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+@@ -500,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+               hdrlen = rpcrdma_create_chunks(rqst,
+                                       &rqst->rq_rcv_buf, headerp, wtype);
+       }
+-
+-      if (hdrlen == 0)
+-              return -1;
++      if (hdrlen < 0)
++              return hdrlen;
+       dprintk("RPC:       %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+               " headerp 0x%p base 0x%p lkey 0x%x\n",
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 93fe775..66f91f0 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -595,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task)
+       struct rpc_xprt *xprt = rqst->rq_xprt;
+       struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
++      int rc;
+-      /* marshal the send itself */
+-      if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
+-              r_xprt->rx_stats.failed_marshal_count++;
+-              dprintk("RPC:       %s: rpcrdma_marshal_req failed\n",
+-                      __func__);
+-              return -EIO;
++      if (req->rl_niovs == 0) {
++              rc = rpcrdma_marshal_req(rqst);
++              if (rc < 0)
++                      goto failed_marshal;
+       }
+       if (req->rl_reply == NULL)              /* e.g. reconnection */
+@@ -625,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
+       rqst->rq_bytes_sent = 0;
+       return 0;
++failed_marshal:
++      r_xprt->rx_stats.failed_marshal_count++;
++      dprintk("RPC:       %s: rpcrdma_marshal_req failed, status %i\n",
++              __func__, rc);
++      if (rc == -EIO)
++              return -EIO;
+ drop_connection:
+       xprt_disconnect_done(xprt);
+       return -ENOTCONN;       /* implies disconnect */
+-- 
+1.7.1
+
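The send path above now separates two failure classes: a marshaling error
(-EIO) fails only the current RPC, while any other error falls through to
xprt_disconnect_done() and returns -ENOTCONN so the request is retried once
the transport reconnects. A compact sketch of that dispatch, using
hypothetical marshal_request()/post_request() helpers:

    /* Illustrative sketch of the error dispatch: -EIO kills only this RPC,
     * other errors force a disconnect and a retry. All helpers are made up. */
    #include <errno.h>
    #include <stdio.h>

    static int marshal_request(int fail_mode)
    {
            return fail_mode == 1 ? -EIO : 0;       /* pretend marshaling failed */
    }

    static int post_request(int fail_mode)
    {
            return fail_mode == 2 ? -ENOTCONN : 0;  /* pretend the post failed */
    }

    static int send_request(int fail_mode)
    {
            int rc = marshal_request(fail_mode);

            if (rc < 0) {
                    fprintf(stderr, "marshal failed %d\n", rc);
                    if (rc == -EIO)
                            return -EIO;            /* fail just this request */
                    goto drop_connection;
            }
            rc = post_request(fail_mode);
            if (rc)
                    goto drop_connection;
            return 0;

    drop_connection:
            fprintf(stderr, "disconnecting; request will be retried\n");
            return -ENOTCONN;
    }

    int main(void)
    {
            printf("ok=%d eio=%d drop=%d\n",
                   send_request(0), send_request(1), send_request(2));
            return 0;
    }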
diff --git a/linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch b/linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch
new file mode 100644 (file)
index 0000000..aac5be8
--- /dev/null
@@ -0,0 +1,1243 @@
+From 0bf4828983dff062cd502f27ab8644b32774e72e Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Wed, 28 May 2014 15:12:01 -0500
+Subject: [PATCH 131/132] svcrdma: refactor marshalling logic
+
+This patch refactors the NFSRDMA server marshalling logic to
+remove the intermediary map structures.  It also fixes an existing bug
+where the NFSRDMA server was not minding the device fast register page
+list length limitations.
+
+Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  |  643 +++++++++++++-----------------
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c    |  230 +----------
+ net/sunrpc/xprtrdma/svc_rdma_transport.c |   62 ++--
+ 3 files changed, 331 insertions(+), 604 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 8d904e4..52d9f2c 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+  *
+  * This software is available to you under a choice of one of two
+@@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+       /* Set up the XDR head */
+       rqstp->rq_arg.head[0].iov_base = page_address(page);
+-      rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
++      rqstp->rq_arg.head[0].iov_len =
++              min_t(size_t, byte_count, ctxt->sge[0].length);
+       rqstp->rq_arg.len = byte_count;
+       rqstp->rq_arg.buflen = byte_count;
+@@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+               page = ctxt->pages[sge_no];
+               put_page(rqstp->rq_pages[sge_no]);
+               rqstp->rq_pages[sge_no] = page;
+-              bc -= min(bc, ctxt->sge[sge_no].length);
++              bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+               rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+               sge_no++;
+       }
+@@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+       rqstp->rq_arg.tail[0].iov_len = 0;
+ }
+-/* Encode a read-chunk-list as an array of IB SGE
+- *
+- * Assumptions:
+- * - chunk[0]->position points to pages[0] at an offset of 0
+- * - pages[] is not physically or virtually contiguous and consists of
+- *   PAGE_SIZE elements.
+- *
+- * Output:
+- * - sge array pointing into pages[] array.
+- * - chunk_sge array specifying sge index and count for each
+- *   chunk in the read list
+- *
+- */
+-static int map_read_chunks(struct svcxprt_rdma *xprt,
+-                         struct svc_rqst *rqstp,
+-                         struct svc_rdma_op_ctxt *head,
+-                         struct rpcrdma_msg *rmsgp,
+-                         struct svc_rdma_req_map *rpl_map,
+-                         struct svc_rdma_req_map *chl_map,
+-                         int ch_count,
+-                         int byte_count)
++static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+ {
+-      int sge_no;
+-      int sge_bytes;
+-      int page_off;
+-      int page_no;
+-      int ch_bytes;
+-      int ch_no;
+-      struct rpcrdma_read_chunk *ch;
++      if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
++           RDMA_TRANSPORT_IWARP)
++              return 1;
++      else
++              return min_t(int, sge_count, xprt->sc_max_sge);
++}
+-      sge_no = 0;
+-      page_no = 0;
+-      page_off = 0;
+-      ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+-      ch_no = 0;
+-      ch_bytes = ntohl(ch->rc_target.rs_length);
+-      head->arg.head[0] = rqstp->rq_arg.head[0];
+-      head->arg.tail[0] = rqstp->rq_arg.tail[0];
+-      head->arg.pages = &head->pages[head->count];
+-      head->hdr_count = head->count; /* save count of hdr pages */
+-      head->arg.page_base = 0;
+-      head->arg.page_len = ch_bytes;
+-      head->arg.len = rqstp->rq_arg.len + ch_bytes;
+-      head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
+-      head->count++;
+-      chl_map->ch[0].start = 0;
+-      while (byte_count) {
+-              rpl_map->sge[sge_no].iov_base =
+-                      page_address(rqstp->rq_arg.pages[page_no]) + page_off;
+-              sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
+-              rpl_map->sge[sge_no].iov_len = sge_bytes;
+-              /*
+-               * Don't bump head->count here because the same page
+-               * may be used by multiple SGE.
+-               */
+-              head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+-              rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
++typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
++                            struct svc_rqst *rqstp,
++                            struct svc_rdma_op_ctxt *head,
++                            int *page_no,
++                            u32 *page_offset,
++                            u32 rs_handle,
++                            u32 rs_length,
++                            u64 rs_offset,
++                            int last);
++
++/* Issue an RDMA_READ using the local lkey to map the data sink */
++static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
++                             struct svc_rqst *rqstp,
++                             struct svc_rdma_op_ctxt *head,
++                             int *page_no,
++                             u32 *page_offset,
++                             u32 rs_handle,
++                             u32 rs_length,
++                             u64 rs_offset,
++                             int last)
++{
++      struct ib_send_wr read_wr;
++      int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
++      struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
++      int ret, read, pno;
++      u32 pg_off = *page_offset;
++      u32 pg_no = *page_no;
++
++      ctxt->direction = DMA_FROM_DEVICE;
++      ctxt->read_hdr = head;
++      pages_needed =
++              min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
++      read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
++
++      for (pno = 0; pno < pages_needed; pno++) {
++              int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
++
++              head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
++              head->arg.page_len += len;
++              head->arg.len += len;
++              if (!pg_off)
++                      head->count++;
++              rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+               rqstp->rq_next_page = rqstp->rq_respages + 1;
++              ctxt->sge[pno].addr =
++                      ib_dma_map_page(xprt->sc_cm_id->device,
++                                      head->arg.pages[pg_no], pg_off,
++                                      PAGE_SIZE - pg_off,
++                                      DMA_FROM_DEVICE);
++              ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
++                                         ctxt->sge[pno].addr);
++              if (ret)
++                      goto err;
++              atomic_inc(&xprt->sc_dma_used);
+-              byte_count -= sge_bytes;
+-              ch_bytes -= sge_bytes;
+-              sge_no++;
+-              /*
+-               * If all bytes for this chunk have been mapped to an
+-               * SGE, move to the next SGE
+-               */
+-              if (ch_bytes == 0) {
+-                      chl_map->ch[ch_no].count =
+-                              sge_no - chl_map->ch[ch_no].start;
+-                      ch_no++;
+-                      ch++;
+-                      chl_map->ch[ch_no].start = sge_no;
+-                      ch_bytes = ntohl(ch->rc_target.rs_length);
+-                      /* If bytes remaining account for next chunk */
+-                      if (byte_count) {
+-                              head->arg.page_len += ch_bytes;
+-                              head->arg.len += ch_bytes;
+-                              head->arg.buflen += ch_bytes;
+-                      }
++              /* The lkey here is either a local dma lkey or a dma_mr lkey */
++              ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
++              ctxt->sge[pno].length = len;
++              ctxt->count++;
++
++              /* adjust offset and wrap to next page if needed */
++              pg_off += len;
++              if (pg_off == PAGE_SIZE) {
++                      pg_off = 0;
++                      pg_no++;
+               }
+-              /*
+-               * If this SGE consumed all of the page, move to the
+-               * next page
+-               */
+-              if ((sge_bytes + page_off) == PAGE_SIZE) {
+-                      page_no++;
+-                      page_off = 0;
+-                      /*
+-                       * If there are still bytes left to map, bump
+-                       * the page count
+-                       */
+-                      if (byte_count)
+-                              head->count++;
+-              } else
+-                      page_off += sge_bytes;
++              rs_length -= len;
+       }
+-      BUG_ON(byte_count != 0);
+-      return sge_no;
++
++      if (last && rs_length == 0)
++              set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++      else
++              clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++
++      memset(&read_wr, 0, sizeof(read_wr));
++      read_wr.wr_id = (unsigned long)ctxt;
++      read_wr.opcode = IB_WR_RDMA_READ;
++      ctxt->wr_op = read_wr.opcode;
++      read_wr.send_flags = IB_SEND_SIGNALED;
++      read_wr.wr.rdma.rkey = rs_handle;
++      read_wr.wr.rdma.remote_addr = rs_offset;
++      read_wr.sg_list = ctxt->sge;
++      read_wr.num_sge = pages_needed;
++
++      ret = svc_rdma_send(xprt, &read_wr);
++      if (ret) {
++              pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
++              set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++              goto err;
++      }
++
++      /* return current location in page array */
++      *page_no = pg_no;
++      *page_offset = pg_off;
++      ret = read;
++      atomic_inc(&rdma_stat_read);
++      return ret;
++ err:
++      svc_rdma_unmap_dma(ctxt);
++      svc_rdma_put_context(ctxt, 0);
++      return ret;
+ }
+-/* Map a read-chunk-list to an XDR and fast register the page-list.
+- *
+- * Assumptions:
+- * - chunk[0] position points to pages[0] at an offset of 0
+- * - pages[]  will be made physically contiguous by creating a one-off memory
+- *            region using the fastreg verb.
+- * - byte_count is # of bytes in read-chunk-list
+- * - ch_count is # of chunks in read-chunk-list
+- *
+- * Output:
+- * - sge array pointing into pages[] array.
+- * - chunk_sge array specifying sge index and count for each
+- *   chunk in the read list
+- */
+-static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
++/* Issue an RDMA_READ using an FRMR to map the data sink */
++static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+                               struct svc_rqst *rqstp,
+                               struct svc_rdma_op_ctxt *head,
+-                              struct rpcrdma_msg *rmsgp,
+-                              struct svc_rdma_req_map *rpl_map,
+-                              struct svc_rdma_req_map *chl_map,
+-                              int ch_count,
+-                              int byte_count)
++                              int *page_no,
++                              u32 *page_offset,
++                              u32 rs_handle,
++                              u32 rs_length,
++                              u64 rs_offset,
++                              int last)
+ {
+-      int page_no;
+-      int ch_no;
+-      u32 offset;
+-      struct rpcrdma_read_chunk *ch;
+-      struct svc_rdma_fastreg_mr *frmr;
+-      int ret = 0;
++      struct ib_send_wr read_wr;
++      struct ib_send_wr inv_wr;
++      struct ib_send_wr fastreg_wr;
++      u8 key;
++      int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
++      struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
++      struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
++      int ret, read, pno;
++      u32 pg_off = *page_offset;
++      u32 pg_no = *page_no;
+-      frmr = svc_rdma_get_frmr(xprt);
+       if (IS_ERR(frmr))
+               return -ENOMEM;
+-      head->frmr = frmr;
+-      head->arg.head[0] = rqstp->rq_arg.head[0];
+-      head->arg.tail[0] = rqstp->rq_arg.tail[0];
+-      head->arg.pages = &head->pages[head->count];
+-      head->hdr_count = head->count; /* save count of hdr pages */
+-      head->arg.page_base = 0;
+-      head->arg.page_len = byte_count;
+-      head->arg.len = rqstp->rq_arg.len + byte_count;
+-      head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
++      ctxt->direction = DMA_FROM_DEVICE;
++      ctxt->frmr = frmr;
++      pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
++      read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+-      /* Fast register the page list */
+-      frmr->kva = page_address(rqstp->rq_arg.pages[0]);
++      frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
+       frmr->direction = DMA_FROM_DEVICE;
+       frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+-      frmr->map_len = byte_count;
+-      frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+-      for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+-              frmr->page_list->page_list[page_no] =
++      frmr->map_len = pages_needed << PAGE_SHIFT;
++      frmr->page_list_len = pages_needed;
++
++      for (pno = 0; pno < pages_needed; pno++) {
++              int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
++
++              head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
++              head->arg.page_len += len;
++              head->arg.len += len;
++              if (!pg_off)
++                      head->count++;
++              rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++              rqstp->rq_next_page = rqstp->rq_respages + 1;
++              frmr->page_list->page_list[pno] =
+                       ib_dma_map_page(xprt->sc_cm_id->device,
+-                                      rqstp->rq_arg.pages[page_no], 0,
++                                      head->arg.pages[pg_no], 0,
+                                       PAGE_SIZE, DMA_FROM_DEVICE);
+-              if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                                       frmr->page_list->page_list[page_no]))
+-                      goto fatal_err;
++              ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
++                                         frmr->page_list->page_list[pno]);
++              if (ret)
++                      goto err;
+               atomic_inc(&xprt->sc_dma_used);
+-              head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+-      }
+-      head->count += page_no;
+-
+-      /* rq_respages points one past arg pages */
+-      rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+-      rqstp->rq_next_page = rqstp->rq_respages + 1;
+-      /* Create the reply and chunk maps */
+-      offset = 0;
+-      ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+-      for (ch_no = 0; ch_no < ch_count; ch_no++) {
+-              int len = ntohl(ch->rc_target.rs_length);
+-              rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+-              rpl_map->sge[ch_no].iov_len = len;
+-              chl_map->ch[ch_no].count = 1;
+-              chl_map->ch[ch_no].start = ch_no;
+-              offset += len;
+-              ch++;
++              /* adjust offset and wrap to next page if needed */
++              pg_off += len;
++              if (pg_off == PAGE_SIZE) {
++                      pg_off = 0;
++                      pg_no++;
++              }
++              rs_length -= len;
+       }
+-      ret = svc_rdma_fastreg(xprt, frmr);
+-      if (ret)
+-              goto fatal_err;
+-
+-      return ch_no;
+-
+- fatal_err:
+-      printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+-      svc_rdma_put_frmr(xprt, frmr);
+-      return -EIO;
+-}
+-
+-static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+-                           struct svc_rdma_op_ctxt *ctxt,
+-                           struct svc_rdma_fastreg_mr *frmr,
+-                           struct kvec *vec,
+-                           u64 *sgl_offset,
+-                           int count)
+-{
+-      int i;
+-      unsigned long off;
++      if (last && rs_length == 0)
++              set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++      else
++              clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+-      ctxt->count = count;
+-      ctxt->direction = DMA_FROM_DEVICE;
+-      for (i = 0; i < count; i++) {
+-              ctxt->sge[i].length = 0; /* in case map fails */
+-              if (!frmr) {
+-                      BUG_ON(!virt_to_page(vec[i].iov_base));
+-                      off = (unsigned long)vec[i].iov_base & ~PAGE_MASK;
+-                      ctxt->sge[i].addr =
+-                              ib_dma_map_page(xprt->sc_cm_id->device,
+-                                              virt_to_page(vec[i].iov_base),
+-                                              off,
+-                                              vec[i].iov_len,
+-                                              DMA_FROM_DEVICE);
+-                      if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                                               ctxt->sge[i].addr))
+-                              return -EINVAL;
+-                      ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+-                      atomic_inc(&xprt->sc_dma_used);
+-              } else {
+-                      ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+-                      ctxt->sge[i].lkey = frmr->mr->lkey;
+-              }
+-              ctxt->sge[i].length = vec[i].iov_len;
+-              *sgl_offset = *sgl_offset + vec[i].iov_len;
++      /* Bump the key */
++      key = (u8)(frmr->mr->lkey & 0x000000FF);
++      ib_update_fast_reg_key(frmr->mr, ++key);
++
++      ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
++      ctxt->sge[0].lkey = frmr->mr->lkey;
++      ctxt->sge[0].length = read;
++      ctxt->count = 1;
++      ctxt->read_hdr = head;
++
++      /* Prepare FASTREG WR */
++      memset(&fastreg_wr, 0, sizeof(fastreg_wr));
++      fastreg_wr.opcode = IB_WR_FAST_REG_MR;
++      fastreg_wr.send_flags = IB_SEND_SIGNALED;
++      fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
++      fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
++      fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
++      fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
++      fastreg_wr.wr.fast_reg.length = frmr->map_len;
++      fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
++      fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
++      fastreg_wr.next = &read_wr;
++
++      /* Prepare RDMA_READ */
++      memset(&read_wr, 0, sizeof(read_wr));
++      read_wr.send_flags = IB_SEND_SIGNALED;
++      read_wr.wr.rdma.rkey = rs_handle;
++      read_wr.wr.rdma.remote_addr = rs_offset;
++      read_wr.sg_list = ctxt->sge;
++      read_wr.num_sge = 1;
++      if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
++              read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
++              read_wr.wr_id = (unsigned long)ctxt;
++              read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
++      } else {
++              read_wr.opcode = IB_WR_RDMA_READ;
++              read_wr.next = &inv_wr;
++              /* Prepare invalidate */
++              memset(&inv_wr, 0, sizeof(inv_wr));
++              inv_wr.wr_id = (unsigned long)ctxt;
++              inv_wr.opcode = IB_WR_LOCAL_INV;
++              inv_wr.send_flags = IB_SEND_SIGNALED;
++              inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
++      }
++      ctxt->wr_op = read_wr.opcode;
++
++      /* Post the chain */
++      ret = svc_rdma_send(xprt, &fastreg_wr);
++      if (ret) {
++              pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
++              set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++              goto err;
+       }
+-      return 0;
+-}
+-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+-{
+-      if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
+-           RDMA_TRANSPORT_IWARP) &&
+-          sge_count > 1)
+-              return 1;
+-      else
+-              return min_t(int, sge_count, xprt->sc_max_sge);
++      /* return current location in page array */
++      *page_no = pg_no;
++      *page_offset = pg_off;
++      ret = read;
++      atomic_inc(&rdma_stat_read);
++      return ret;
++ err:
++      svc_rdma_unmap_dma(ctxt);
++      svc_rdma_put_context(ctxt, 0);
++      svc_rdma_put_frmr(xprt, frmr);
++      return ret;
+ }
+-/*
+- * Use RDMA_READ to read data from the advertised client buffer into the
+- * XDR stream starting at rq_arg.head[0].iov_base.
+- * Each chunk in the array
+- * contains the following fields:
+- * discrim      - '1', This isn't used for data placement
+- * position     - The xdr stream offset (the same for every chunk)
+- * handle       - RMR for client memory region
+- * length       - data transfer length
+- * offset       - 64 bit tagged offset in remote memory region
+- *
+- * On our side, we need to read into a pagelist. The first page immediately
+- * follows the RPC header.
+- *
+- * This function returns:
+- * 0 - No error and no read-list found.
+- *
+- * 1 - Successful read-list processing. The data is not yet in
+- * the pagelist and therefore the RPC request must be deferred. The
+- * I/O completion will enqueue the transport again and
+- * svc_rdma_recvfrom will complete the request.
+- *
+- * <0 - Error processing/posting read-list.
+- *
+- * NOTE: The ctxt must not be touched after the last WR has been posted
+- * because the I/O completion processing may occur on another
+- * processor and free / modify the context. Ne touche pas!
+- */
+-static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+-                       struct rpcrdma_msg *rmsgp,
+-                       struct svc_rqst *rqstp,
+-                       struct svc_rdma_op_ctxt *hdr_ctxt)
++static int rdma_read_chunks(struct svcxprt_rdma *xprt,
++                          struct rpcrdma_msg *rmsgp,
++                          struct svc_rqst *rqstp,
++                          struct svc_rdma_op_ctxt *head)
+ {
+-      struct ib_send_wr read_wr;
+-      struct ib_send_wr inv_wr;
+-      int err = 0;
+-      int ch_no;
+-      int ch_count;
+-      int byte_count;
+-      int sge_count;
+-      u64 sgl_offset;
++      int page_no, ch_count, ret;
+       struct rpcrdma_read_chunk *ch;
+-      struct svc_rdma_op_ctxt *ctxt = NULL;
+-      struct svc_rdma_req_map *rpl_map;
+-      struct svc_rdma_req_map *chl_map;
++      u32 page_offset, byte_count;
++      u64 rs_offset;
++      rdma_reader_fn reader;
+       /* If no read list is present, return 0 */
+       ch = svc_rdma_get_read_chunk(rmsgp);
+@@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+       if (ch_count > RPCSVC_MAXPAGES)
+               return -EINVAL;
+-      /* Allocate temporary reply and chunk maps */
+-      rpl_map = svc_rdma_get_req_map();
+-      chl_map = svc_rdma_get_req_map();
++      /* The request is completed when the RDMA_READs complete. The
++       * head context keeps all the pages that comprise the
++       * request.
++       */
++      head->arg.head[0] = rqstp->rq_arg.head[0];
++      head->arg.tail[0] = rqstp->rq_arg.tail[0];
++      head->arg.pages = &head->pages[head->count];
++      head->hdr_count = head->count;
++      head->arg.page_base = 0;
++      head->arg.page_len = 0;
++      head->arg.len = rqstp->rq_arg.len;
++      head->arg.buflen = rqstp->rq_arg.buflen;
+-      if (!xprt->sc_frmr_pg_list_len)
+-              sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+-                                          rpl_map, chl_map, ch_count,
+-                                          byte_count);
++      /* Use FRMR if supported */
++      if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
++              reader = rdma_read_chunk_frmr;
+       else
+-              sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+-                                               rpl_map, chl_map, ch_count,
+-                                               byte_count);
+-      if (sge_count < 0) {
+-              err = -EIO;
+-              goto out;
+-      }
+-
+-      sgl_offset = 0;
+-      ch_no = 0;
++              reader = rdma_read_chunk_lcl;
++      page_no = 0; page_offset = 0;
+       for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+-           ch->rc_discrim != 0; ch++, ch_no++) {
+-              u64 rs_offset;
+-next_sge:
+-              ctxt = svc_rdma_get_context(xprt);
+-              ctxt->direction = DMA_FROM_DEVICE;
+-              ctxt->frmr = hdr_ctxt->frmr;
+-              ctxt->read_hdr = NULL;
+-              clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+-              clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
++           ch->rc_discrim != 0; ch++) {
+-              /* Prepare READ WR */
+-              memset(&read_wr, 0, sizeof read_wr);
+-              read_wr.wr_id = (unsigned long)ctxt;
+-              read_wr.opcode = IB_WR_RDMA_READ;
+-              ctxt->wr_op = read_wr.opcode;
+-              read_wr.send_flags = IB_SEND_SIGNALED;
+-              read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle);
+               xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
+                                &rs_offset);
+-              read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset;
+-              read_wr.sg_list = ctxt->sge;
+-              read_wr.num_sge =
+-                      rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+-              err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+-                                      &rpl_map->sge[chl_map->ch[ch_no].start],
+-                                      &sgl_offset,
+-                                      read_wr.num_sge);
+-              if (err) {
+-                      svc_rdma_unmap_dma(ctxt);
+-                      svc_rdma_put_context(ctxt, 0);
+-                      goto out;
+-              }
+-              if (((ch+1)->rc_discrim == 0) &&
+-                  (read_wr.num_sge == chl_map->ch[ch_no].count)) {
+-                      /*
+-                       * Mark the last RDMA_READ with a bit to
+-                       * indicate all RPC data has been fetched from
+-                       * the client and the RPC needs to be enqueued.
+-                       */
+-                      set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+-                      if (hdr_ctxt->frmr) {
+-                              set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+-                              /*
+-                               * Invalidate the local MR used to map the data
+-                               * sink.
+-                               */
+-                              if (xprt->sc_dev_caps &
+-                                  SVCRDMA_DEVCAP_READ_W_INV) {
+-                                      read_wr.opcode =
+-                                              IB_WR_RDMA_READ_WITH_INV;
+-                                      ctxt->wr_op = read_wr.opcode;
+-                                      read_wr.ex.invalidate_rkey =
+-                                              ctxt->frmr->mr->lkey;
+-                              } else {
+-                                      /* Prepare INVALIDATE WR */
+-                                      memset(&inv_wr, 0, sizeof inv_wr);
+-                                      inv_wr.opcode = IB_WR_LOCAL_INV;
+-                                      inv_wr.send_flags = IB_SEND_SIGNALED;
+-                                      inv_wr.ex.invalidate_rkey =
+-                                              hdr_ctxt->frmr->mr->lkey;
+-                                      read_wr.next = &inv_wr;
+-                              }
+-                      }
+-                      ctxt->read_hdr = hdr_ctxt;
+-              }
+-              /* Post the read */
+-              err = svc_rdma_send(xprt, &read_wr);
+-              if (err) {
+-                      printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
+-                             err);
+-                      set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+-                      svc_rdma_unmap_dma(ctxt);
+-                      svc_rdma_put_context(ctxt, 0);
+-                      goto out;
++              byte_count = ntohl(ch->rc_target.rs_length);
++
++              while (byte_count > 0) {
++                      ret = reader(xprt, rqstp, head,
++                                   &page_no, &page_offset,
++                                   ntohl(ch->rc_target.rs_handle),
++                                   byte_count, rs_offset,
++                                   ((ch+1)->rc_discrim == 0) /* last */
++                                   );
++                      if (ret < 0)
++                              goto err;
++                      byte_count -= ret;
++                      rs_offset += ret;
++                      head->arg.buflen += ret;
+               }
+-              atomic_inc(&rdma_stat_read);
+-
+-              if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+-                      chl_map->ch[ch_no].count -= read_wr.num_sge;
+-                      chl_map->ch[ch_no].start += read_wr.num_sge;
+-                      goto next_sge;
+-              }
+-              sgl_offset = 0;
+-              err = 1;
+       }
+-
+- out:
+-      svc_rdma_put_req_map(rpl_map);
+-      svc_rdma_put_req_map(chl_map);
+-
++      ret = 1;
++ err:
+       /* Detach arg pages. svc_recv will replenish them */
+-      for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+-              rqstp->rq_pages[ch_no] = NULL;
++      for (page_no = 0;
++           &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
++              rqstp->rq_pages[page_no] = NULL;
+-      return err;
++      return ret;
+ }
+ static int rdma_read_complete(struct svc_rqst *rqstp,
+@@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+                                 struct svc_rdma_op_ctxt,
+                                 dto_q);
+               list_del_init(&ctxt->dto_q);
+-      }
+-      if (ctxt) {
+               spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+               return rdma_read_complete(rqstp, ctxt);
+-      }
+-
+-      if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
++      } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+               ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+                                 struct svc_rdma_op_ctxt,
+                                 dto_q);
+@@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+               if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+                       goto close_out;
+-              BUG_ON(ret);
+               goto out;
+       }
+       dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+@@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+       }
+       /* Read read-list data. */
+-      ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
++      ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
+       if (ret > 0) {
+               /* read-list posted, defer until data received from client. */
+               goto defer;
+-      }
+-      if (ret < 0) {
++      } else if (ret < 0) {
+               /* Post of read-list failed, free context. */
+               svc_rdma_put_context(ctxt, 1);
+               return 0;
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 7e024a5..49fd21a 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+  * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+  *
+  * This software is available to you under a choice of one of two
+@@ -49,152 +50,6 @@
+ #define RPCDBG_FACILITY       RPCDBG_SVCXPRT
+-/* Encode an XDR as an array of IB SGE
+- *
+- * Assumptions:
+- * - head[0] is physically contiguous.
+- * - tail[0] is physically contiguous.
+- * - pages[] is not physically or virtually contiguous and consists of
+- *   PAGE_SIZE elements.
+- *
+- * Output:
+- * SGE[0]              reserved for RCPRDMA header
+- * SGE[1]              data from xdr->head[]
+- * SGE[2..sge_count-2] data from xdr->pages[]
+- * SGE[sge_count-1]    data from xdr->tail.
+- *
+- * The max SGE we need is the length of the XDR / pagesize + one for
+- * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+- * reserves a page for both the request and the reply header, and this
+- * array is only concerned with the reply we are assured that we have
+- * on extra page for the RPCRMDA header.
+- */
+-static int fast_reg_xdr(struct svcxprt_rdma *xprt,
+-                      struct xdr_buf *xdr,
+-                      struct svc_rdma_req_map *vec)
+-{
+-      int sge_no;
+-      u32 sge_bytes;
+-      u32 page_bytes;
+-      u32 page_off;
+-      int page_no = 0;
+-      u8 *frva;
+-      struct svc_rdma_fastreg_mr *frmr;
+-
+-      frmr = svc_rdma_get_frmr(xprt);
+-      if (IS_ERR(frmr))
+-              return -ENOMEM;
+-      vec->frmr = frmr;
+-
+-      /* Skip the RPCRDMA header */
+-      sge_no = 1;
+-
+-      /* Map the head. */
+-      frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+-      vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+-      vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+-      vec->count = 2;
+-      sge_no++;
+-
+-      /* Map the XDR head */
+-      frmr->kva = frva;
+-      frmr->direction = DMA_TO_DEVICE;
+-      frmr->access_flags = 0;
+-      frmr->map_len = PAGE_SIZE;
+-      frmr->page_list_len = 1;
+-      page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+-      frmr->page_list->page_list[page_no] =
+-              ib_dma_map_page(xprt->sc_cm_id->device,
+-                              virt_to_page(xdr->head[0].iov_base),
+-                              page_off,
+-                              PAGE_SIZE - page_off,
+-                              DMA_TO_DEVICE);
+-      if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                               frmr->page_list->page_list[page_no]))
+-              goto fatal_err;
+-      atomic_inc(&xprt->sc_dma_used);
+-
+-      /* Map the XDR page list */
+-      page_off = xdr->page_base;
+-      page_bytes = xdr->page_len + page_off;
+-      if (!page_bytes)
+-              goto encode_tail;
+-
+-      /* Map the pages */
+-      vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+-      vec->sge[sge_no].iov_len = page_bytes;
+-      sge_no++;
+-      while (page_bytes) {
+-              struct page *page;
+-
+-              page = xdr->pages[page_no++];
+-              sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+-              page_bytes -= sge_bytes;
+-
+-              frmr->page_list->page_list[page_no] =
+-                      ib_dma_map_page(xprt->sc_cm_id->device,
+-                                      page, page_off,
+-                                      sge_bytes, DMA_TO_DEVICE);
+-              if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                                       frmr->page_list->page_list[page_no]))
+-                      goto fatal_err;
+-
+-              atomic_inc(&xprt->sc_dma_used);
+-              page_off = 0; /* reset for next time through loop */
+-              frmr->map_len += PAGE_SIZE;
+-              frmr->page_list_len++;
+-      }
+-      vec->count++;
+-
+- encode_tail:
+-      /* Map tail */
+-      if (0 == xdr->tail[0].iov_len)
+-              goto done;
+-
+-      vec->count++;
+-      vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+-
+-      if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+-          ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+-              /*
+-               * If head and tail use the same page, we don't need
+-               * to map it again.
+-               */
+-              vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+-      } else {
+-              void *va;
+-
+-              /* Map another page for the tail */
+-              page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+-              va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+-              vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+-
+-              frmr->page_list->page_list[page_no] =
+-                  ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
+-                                  page_off,
+-                                  PAGE_SIZE,
+-                                  DMA_TO_DEVICE);
+-              if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                                       frmr->page_list->page_list[page_no]))
+-                      goto fatal_err;
+-              atomic_inc(&xprt->sc_dma_used);
+-              frmr->map_len += PAGE_SIZE;
+-              frmr->page_list_len++;
+-      }
+-
+- done:
+-      if (svc_rdma_fastreg(xprt, frmr))
+-              goto fatal_err;
+-
+-      return 0;
+-
+- fatal_err:
+-      printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+-      vec->frmr = NULL;
+-      svc_rdma_put_frmr(xprt, frmr);
+-      return -EIO;
+-}
+-
+ static int map_xdr(struct svcxprt_rdma *xprt,
+                  struct xdr_buf *xdr,
+                  struct svc_rdma_req_map *vec)
+@@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
+       BUG_ON(xdr->len !=
+              (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+-      if (xprt->sc_frmr_pg_list_len)
+-              return fast_reg_xdr(xprt, xdr, vec);
+-
+       /* Skip the first sge, this is for the RPCRDMA header */
+       sge_no = 1;
+@@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+ }
+ /* Assumptions:
+- * - We are using FRMR
+- *     - or -
+  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
+  */
+ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+@@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+               sge_bytes = min_t(size_t,
+                         bc, vec->sge[xdr_sge_no].iov_len-sge_off);
+               sge[sge_no].length = sge_bytes;
+-              if (!vec->frmr) {
+-                      sge[sge_no].addr =
+-                              dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+-                                          sge_bytes, DMA_TO_DEVICE);
+-                      xdr_off += sge_bytes;
+-                      if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+-                                               sge[sge_no].addr))
+-                              goto err;
+-                      atomic_inc(&xprt->sc_dma_used);
+-                      sge[sge_no].lkey = xprt->sc_dma_lkey;
+-              } else {
+-                      sge[sge_no].addr = (unsigned long)
+-                              vec->sge[xdr_sge_no].iov_base + sge_off;
+-                      sge[sge_no].lkey = vec->frmr->mr->lkey;
+-              }
++              sge[sge_no].addr =
++                      dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
++                                  sge_bytes, DMA_TO_DEVICE);
++              xdr_off += sge_bytes;
++              if (ib_dma_mapping_error(xprt->sc_cm_id->device,
++                                       sge[sge_no].addr))
++                      goto err;
++              atomic_inc(&xprt->sc_dma_used);
++              sge[sge_no].lkey = xprt->sc_dma_lkey;
+               ctxt->count++;
+-              ctxt->frmr = vec->frmr;
+               sge_off = 0;
+               sge_no++;
+               xdr_sge_no++;
+@@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+       return 0;
+  err:
+       svc_rdma_unmap_dma(ctxt);
+-      svc_rdma_put_frmr(xprt, vec->frmr);
+       svc_rdma_put_context(ctxt, 0);
+       /* Fatal error, close transport */
+       return -EIO;
+@@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+       res_ary = (struct rpcrdma_write_array *)
+               &rdma_resp->rm_body.rm_chunks[1];
+-      if (vec->frmr)
+-              max_write = vec->frmr->map_len;
+-      else
+-              max_write = xprt->sc_max_sge * PAGE_SIZE;
++      max_write = xprt->sc_max_sge * PAGE_SIZE;
+       /* Write chunks start at the pagelist */
+       for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+@@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+       res_ary = (struct rpcrdma_write_array *)
+               &rdma_resp->rm_body.rm_chunks[2];
+-      if (vec->frmr)
+-              max_write = vec->frmr->map_len;
+-      else
+-              max_write = xprt->sc_max_sge * PAGE_SIZE;
++      max_write = xprt->sc_max_sge * PAGE_SIZE;
+       /* xdr offset starts at RPC message */
+       nchunks = ntohl(arg_ary->wc_nchunks);
+@@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+                     int byte_count)
+ {
+       struct ib_send_wr send_wr;
+-      struct ib_send_wr inv_wr;
+       int sge_no;
+       int sge_bytes;
+       int page_no;
+@@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+                      "svcrdma: could not post a receive buffer, err=%d."
+                      "Closing transport %p.\n", ret, rdma);
+               set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+-              svc_rdma_put_frmr(rdma, vec->frmr);
+               svc_rdma_put_context(ctxt, 0);
+               return -ENOTCONN;
+       }
+@@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+       /* Prepare the context */
+       ctxt->pages[0] = page;
+       ctxt->count = 1;
+-      ctxt->frmr = vec->frmr;
+-      if (vec->frmr)
+-              set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+-      else
+-              clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+       /* Prepare the SGE for the RPCRDMA Header */
+       ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+@@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma,
+               int xdr_off = 0;
+               sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
+               byte_count -= sge_bytes;
+-              if (!vec->frmr) {
+-                      ctxt->sge[sge_no].addr =
+-                              dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+-                                          sge_bytes, DMA_TO_DEVICE);
+-                      xdr_off += sge_bytes;
+-                      if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+-                                               ctxt->sge[sge_no].addr))
+-                              goto err;
+-                      atomic_inc(&rdma->sc_dma_used);
+-                      ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+-              } else {
+-                      ctxt->sge[sge_no].addr = (unsigned long)
+-                              vec->sge[sge_no].iov_base;
+-                      ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+-              }
++              ctxt->sge[sge_no].addr =
++                      dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
++                                  sge_bytes, DMA_TO_DEVICE);
++              xdr_off += sge_bytes;
++              if (ib_dma_mapping_error(rdma->sc_cm_id->device,
++                                       ctxt->sge[sge_no].addr))
++                      goto err;
++              atomic_inc(&rdma->sc_dma_used);
++              ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+               ctxt->sge[sge_no].length = sge_bytes;
+       }
+       BUG_ON(byte_count != 0);
+@@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
+                       ctxt->sge[page_no+1].length = 0;
+       }
+       rqstp->rq_next_page = rqstp->rq_respages + 1;
++
+       BUG_ON(sge_no > rdma->sc_max_sge);
+       memset(&send_wr, 0, sizeof send_wr);
+       ctxt->wr_op = IB_WR_SEND;
+@@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+       send_wr.num_sge = sge_no;
+       send_wr.opcode = IB_WR_SEND;
+       send_wr.send_flags =  IB_SEND_SIGNALED;
+-      if (vec->frmr) {
+-              /* Prepare INVALIDATE WR */
+-              memset(&inv_wr, 0, sizeof inv_wr);
+-              inv_wr.opcode = IB_WR_LOCAL_INV;
+-              inv_wr.send_flags = IB_SEND_SIGNALED;
+-              inv_wr.ex.invalidate_rkey =
+-                      vec->frmr->mr->lkey;
+-              send_wr.next = &inv_wr;
+-      }
+       ret = svc_rdma_send(rdma, &send_wr);
+       if (ret)
+@@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+  err:
+       svc_rdma_unmap_dma(ctxt);
+-      svc_rdma_put_frmr(rdma, vec->frmr);
+       svc_rdma_put_context(ctxt, 1);
+       return -EIO;
+ }
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+index 02db8d9..e7323fb 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+  * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+  *
+  * This software is available to you under a choice of one of two
+@@ -162,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+               schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+       }
+       map->count = 0;
+-      map->frmr = NULL;
+       return map;
+ }
+@@ -338,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt,
+       switch (ctxt->wr_op) {
+       case IB_WR_SEND:
+-              if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+-                      svc_rdma_put_frmr(xprt, ctxt->frmr);
++              BUG_ON(ctxt->frmr);
+               svc_rdma_put_context(ctxt, 1);
+               break;
+       case IB_WR_RDMA_WRITE:
++              BUG_ON(ctxt->frmr);
+               svc_rdma_put_context(ctxt, 0);
+               break;
+       case IB_WR_RDMA_READ:
+       case IB_WR_RDMA_READ_WITH_INV:
++              svc_rdma_put_frmr(xprt, ctxt->frmr);
+               if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+                       struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+                       BUG_ON(!read_hdr);
+-                      if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+-                              svc_rdma_put_frmr(xprt, ctxt->frmr);
+                       spin_lock_bh(&xprt->sc_rq_dto_lock);
+                       set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+                       list_add_tail(&read_hdr->dto_q,
+@@ -365,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt,
+               break;
+       default:
++              BUG_ON(1);
+               printk(KERN_ERR "svcrdma: unexpected completion type, "
+                      "opcode=%d\n",
+                      ctxt->wr_op);
+@@ -380,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt,
+ static void sq_cq_reap(struct svcxprt_rdma *xprt)
+ {
+       struct svc_rdma_op_ctxt *ctxt = NULL;
+-      struct ib_wc wc;
++      struct ib_wc wc_a[6];
++      struct ib_wc *wc;
+       struct ib_cq *cq = xprt->sc_sq_cq;
+       int ret;
++      memset(wc_a, 0, sizeof(wc_a));
++
+       if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+               return;
+       ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+       atomic_inc(&rdma_stat_sq_poll);
+-      while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
+-              if (wc.status != IB_WC_SUCCESS)
+-                      /* Close the transport */
+-                      set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++      while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
++              int i;
+-              /* Decrement used SQ WR count */
+-              atomic_dec(&xprt->sc_sq_count);
+-              wake_up(&xprt->sc_send_wait);
++              for (i = 0; i < ret; i++) {
++                      wc = &wc_a[i];
++                      if (wc->status != IB_WC_SUCCESS) {
++                              dprintk("svcrdma: sq wc err status %d\n",
++                                      wc->status);
+-              ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+-              if (ctxt)
+-                      process_context(xprt, ctxt);
++                              /* Close the transport */
++                              set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++                      }
+-              svc_xprt_put(&xprt->sc_xprt);
++                      /* Decrement used SQ WR count */
++                      atomic_dec(&xprt->sc_sq_count);
++                      wake_up(&xprt->sc_send_wait);
++
++                      ctxt = (struct svc_rdma_op_ctxt *)
++                              (unsigned long)wc->wr_id;
++                      if (ctxt)
++                              process_context(xprt, ctxt);
++
++                      svc_xprt_put(&xprt->sc_xprt);
++              }
+       }
+       if (ctxt)
+@@ -995,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+                       need_dma_mr = 0;
+               break;
+       case RDMA_TRANSPORT_IB:
+-              if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
++              if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
++                      need_dma_mr = 1;
++                      dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
++              } else if (!(devattr.device_cap_flags &
++                           IB_DEVICE_LOCAL_DMA_LKEY)) {
+                       need_dma_mr = 1;
+                       dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+               } else
+@@ -1192,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+               container_of(xprt, struct svcxprt_rdma, sc_xprt);
+       /*
+-       * If there are fewer SQ WR available than required to send a
+-       * simple response, return false.
+-       */
+-      if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
+-              return 0;
+-
+-      /*
+-       * ...or there are already waiters on the SQ,
++       * If there are already waiters on the SQ,
+        * return false.
+        */
+       if (waitqueue_active(&rdma->sc_send_wait))
+-- 
+1.7.1
+
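[Note on the sq_cq_reap() hunk above: it switches from polling one work completion per ib_poll_cq() call to harvesting them in batches of six. A minimal sketch of that pattern, assuming <rdma/ib_verbs.h> and a caller-supplied completion handler; the helper name and callback are illustrative, not taken from the patch:

        static void drain_cq_batched(struct ib_cq *cq,
                                     void (*handle)(struct ib_wc *wc))
        {
                struct ib_wc wc_a[6];
                int ret, i;

                /* One verbs call now returns up to ARRAY_SIZE(wc_a) completions,
                 * cutting the per-completion polling overhead. */
                while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0)
                        for (i = 0; i < ret; i++)
                                handle(&wc_a[i]);
        }

In the real code the handler is process_context(), and any non-IB_WC_SUCCESS status marks the transport for close.]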
diff --git a/linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch b/linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch
new file mode 100644 (file)
index 0000000..2ecff31
--- /dev/null
@@ -0,0 +1,31 @@
+From 83710fc753d2ae158aa3cb7a7966d9c1bd05b792 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Thu, 5 Jun 2014 09:54:31 -0500
+Subject: [PATCH 132/132] svcrdma: Fence LOCAL_INV work requests
+
+Fencing forces the invalidate to only happen after all prior send
+work requests have been completed.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Reported-by: Devesh Sharma <Devesh.Sharma@Emulex.Com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |    2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 52d9f2c..8f92a61 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -338,7 +338,7 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+               memset(&inv_wr, 0, sizeof(inv_wr));
+               inv_wr.wr_id = (unsigned long)ctxt;
+               inv_wr.opcode = IB_WR_LOCAL_INV;
+-              inv_wr.send_flags = IB_SEND_SIGNALED;
++              inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
+               inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
+       }
+       ctxt->wr_op = read_wr.opcode;
+-- 
+1.7.1
+
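[Note on the one-line fix above: adding IB_SEND_FENCE keeps the LOCAL_INV from executing before the preceding RDMA_READ on the same queue pair has completed. A hedged sketch of the chaining it protects, with illustrative names and field values (assumes <rdma/ib_verbs.h>):

        static void chain_fenced_local_inv(struct ib_send_wr *read_wr,
                                           struct ib_send_wr *inv_wr, u32 rkey)
        {
                memset(inv_wr, 0, sizeof(*inv_wr));
                inv_wr->opcode = IB_WR_LOCAL_INV;
                /* The fence holds the invalidate until prior reads finish. */
                inv_wr->send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
                inv_wr->ex.invalidate_rkey = rkey;

                read_wr->next = inv_wr; /* both posted with one ib_post_send() */
        }
]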
diff --git a/linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch b/linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch
new file mode 100644 (file)
index 0000000..387f601
--- /dev/null
@@ -0,0 +1,129 @@
+commit 255942907e7ff498ab1545b5edce5690833ff640
+Author: Steve Wise <swise@opengridcomputing.com>
+Date:   Wed Jul 9 13:49:15 2014 -0500
+
+    svcrdma: send_write() must not overflow the device's max sge
+    
+    Function send_write() must stop creating sges when it reaches the device
+    max and return the amount sent in the RDMA Write to the caller.
+    
+    Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+    Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 49fd21a..9f1b506 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+               xdr_sge_no++;
+               BUG_ON(xdr_sge_no > vec->count);
+               bc -= sge_bytes;
++              if (sge_no == xprt->sc_max_sge)
++                      break;
+       }
+       /* Prepare WRITE WR */
+@@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+       atomic_inc(&rdma_stat_write);
+       if (svc_rdma_send(xprt, &write_wr))
+               goto err;
+-      return 0;
++      return write_len - bc;
+  err:
+       svc_rdma_unmap_dma(ctxt);
+       svc_rdma_put_context(ctxt, 0);
+@@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+ {
+       u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+       int write_len;
+-      int max_write;
+       u32 xdr_off;
+       int chunk_off;
+       int chunk_no;
+@@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+       res_ary = (struct rpcrdma_write_array *)
+               &rdma_resp->rm_body.rm_chunks[1];
+-      max_write = xprt->sc_max_sge * PAGE_SIZE;
+-
+       /* Write chunks start at the pagelist */
+       for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+            xfer_len && chunk_no < arg_ary->wc_nchunks;
+@@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+                                               write_len);
+               chunk_off = 0;
+               while (write_len) {
+-                      int this_write;
+-                      this_write = min(write_len, max_write);
+                       ret = send_write(xprt, rqstp,
+                                        ntohl(arg_ch->rs_handle),
+                                        rs_offset + chunk_off,
+                                        xdr_off,
+-                                       this_write,
++                                       write_len,
+                                        vec);
+-                      if (ret) {
++                      if (ret <= 0) {
+                               dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+                                       ret);
+                               return -EIO;
+                       }
+-                      chunk_off += this_write;
+-                      xdr_off += this_write;
+-                      xfer_len -= this_write;
+-                      write_len -= this_write;
++                      chunk_off += ret;
++                      xdr_off += ret;
++                      xfer_len -= ret;
++                      write_len -= ret;
+               }
+       }
+       /* Update the req with the number of chunks actually used */
+@@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ {
+       u32 xfer_len = rqstp->rq_res.len;
+       int write_len;
+-      int max_write;
+       u32 xdr_off;
+       int chunk_no;
+       int chunk_off;
+@@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+       res_ary = (struct rpcrdma_write_array *)
+               &rdma_resp->rm_body.rm_chunks[2];
+-      max_write = xprt->sc_max_sge * PAGE_SIZE;
+-
+       /* xdr offset starts at RPC message */
+       nchunks = ntohl(arg_ary->wc_nchunks);
+       for (xdr_off = 0, chunk_no = 0;
+@@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+                                               write_len);
+               chunk_off = 0;
+               while (write_len) {
+-                      int this_write;
+-
+-                      this_write = min(write_len, max_write);
+                       ret = send_write(xprt, rqstp,
+                                        ntohl(ch->rs_handle),
+                                        rs_offset + chunk_off,
+                                        xdr_off,
+-                                       this_write,
++                                       write_len,
+                                        vec);
+-                      if (ret) {
++                      if (ret <= 0) {
+                               dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+                                       ret);
+                               return -EIO;
+                       }
+-                      chunk_off += this_write;
+-                      xdr_off += this_write;
+-                      xfer_len -= this_write;
+-                      write_len -= this_write;
++                      chunk_off += ret;
++                      xdr_off += ret;
++                      xfer_len -= ret;
++                      write_len -= ret;
+               }
+       }
+       /* Update the req with the number of chunks actually used */
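[Note on the patch above: send_write() now stops adding SGEs once it reaches the device limit and returns the byte count it actually posted, so send_write_chunks() and send_reply_chunks() simply loop on the remainder. A rough sketch of that contract, with all names illustrative and DMA mapping omitted (kernel-internal types from <rdma/ib_verbs.h> and <linux/uio.h>):

        static u32 fill_sges_capped(struct ib_sge *sge, int max_sge,
                                    const struct kvec *vec, int vec_count,
                                    u32 want)
        {
                u32 done = 0;
                int sge_no = 0, i;

                for (i = 0; i < vec_count && done < want; i++) {
                        u32 len = min_t(u32, want - done, vec[i].iov_len);

                        sge[sge_no].addr   = (u64)(unsigned long)vec[i].iov_base;
                        sge[sge_no].length = len;
                        done += len;
                        if (++sge_no == max_sge)
                                break;          /* device limit reached */
                }
                return done;    /* caller retries with the remaining bytes */
        }
]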
diff --git a/linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch b/linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch
new file mode 100644 (file)
index 0000000..d492471
--- /dev/null
@@ -0,0 +1,48 @@
+Index: compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+===================================================================
+--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -92,7 +92,9 @@ static void rdma_build_arg_xdr(struct sv
+               sge_no++;
+       }
+       rqstp->rq_respages = &rqstp->rq_pages[sge_no];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+       rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+       /* We should never run out of SGE because the limit is defined to
+        * support the max allowed RPC data length
+@@ -167,7 +169,9 @@ static int rdma_read_chunk_lcl(struct sv
+               if (!pg_off)
+                       head->count++;
+               rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+               rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+               ctxt->sge[pno].addr =
+                       ib_dma_map_page(xprt->sc_cm_id->device,
+                                       head->arg.pages[pg_no], pg_off,
+@@ -272,7 +276,9 @@ static int rdma_read_chunk_frmr(struct s
+               if (!pg_off)
+                       head->count++;
+               rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+               rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+               frmr->page_list->page_list[pno] =
+                       ib_dma_map_page(xprt->sc_cm_id->device,
+                                       head->arg.pages[pg_no], 0,
+Index: compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+===================================================================
+--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -446,7 +446,9 @@ static int send_reply(struct svcxprt_rdm
+               if (page_no+1 >= sge_no)
+                       ctxt->sge[page_no+1].length = 0;
+       }
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+       rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+       BUG_ON(sge_no > rdma->sc_max_sge);
+       memset(&send_wr, 0, sizeof send_wr);
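[Note on this backport patch: it wraps every use of rq_next_page in a kernel-version guard, since older kernels track reply pages with rq_resused instead. The compat idiom, shown out of context (struct svc_rqst comes from <linux/sunrpc/svc.h>; the helper name is made up for illustration):

        #include <linux/version.h>
        #include <linux/sunrpc/svc.h>

        static inline void compat_set_next_page(struct svc_rqst *rqstp)
        {
        #if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
                /* rq_next_page exists on these kernels; keep it one past rq_respages */
                rqstp->rq_next_page = rqstp->rq_respages + 1;
        #endif
        }
]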
diff --git a/linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch b/linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch
deleted file mode 100644 (file)
index d3ad959..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-commit 2b7bbc963da8d076f263574af4138b5df2e1581f
-Author: Chuck Lever <chuck.lever@oracle.com>
-Date:   Wed Mar 12 12:51:30 2014 -0400
-
-    SUNRPC: Fix large reads on NFS/RDMA
-    
-    After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves
-    in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a
-    few hundred bytes via NFS/RDMA no longer work.  This commit exposed
-    a long-standing bug in rpcrdma_inline_fixup().
-    
-    I reproduce this with an rsize=4096 mount using the cthon04 basic
-    tests.  Test 5 fails with an EIO error.
-    
-    For my reproducer, kernel log shows:
-    
-      NFS: server cheating in read reply: count 4096 > recvd 0
-    
-    rpcrdma_inline_fixup() is zeroing the xdr_stream::page_len field,
-    and xdr_align_pages() is now returning that value to the READ XDR
-    decoder function.
-    
-    That field is set up by xdr_inline_pages() by the READ XDR encoder
-    function.  As far as I can tell, it is supposed to be left alone
-    after that, as it describes the dimensions of the reply xdr_stream,
-    not the contents of that stream.
-    
-    Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391
-    Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
-    Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
-
-diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
-index e03725b..96ead52 100644
---- a/net/sunrpc/xprtrdma/rpc_rdma.c
-+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
-@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
-                               break;
-                       page_base = 0;
-               }
--              rqst->rq_rcv_buf.page_len = olen - copy_len;
--      } else
--              rqst->rq_rcv_buf.page_len = 0;
-+      }
-       if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
-               curlen = copy_len;
diff --git a/linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch b/linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch
deleted file mode 100644 (file)
index abd5a0a..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-Fix regression in NFSRDMA server
-
-From: Tom Tucker <tom@ogc.us>
-
-The server regression was caused by the addition of rq_next_page
-(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). There were a few places that
-were missed with the update of the rq_respages array.
-
-NOTE: Patch modified to apply against OFED.
-
-Signed-off-by: Tom Tucker <tom@ogc.us>
-Tested-by: Steve Wise <swise@ogc.us>
-
----
-
---- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  2014-03-31 15:31:05.214903226 -0500
-+++ a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  2014-03-31 15:34:40.042047141 -0500
-@@ -90,6 +90,9 @@ static void rdma_build_arg_xdr(struct sv
-               sge_no++;
-       }
-       rqstp->rq_respages = &rqstp->rq_pages[sge_no];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+      rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-       /* We should never run out of SGE because the limit is defined to
-        * support the max allowed RPC data length
-@@ -169,6 +172,9 @@ static int map_read_chunks(struct svcxpr
-                */
-               head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
-               rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+              rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-               byte_count -= sge_bytes;
-               ch_bytes -= sge_bytes;
-@@ -276,6 +282,9 @@ static int fast_reg_read_chunks(struct s
-       /* rq_respages points one past arg pages */
-       rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+      rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-       /* Create the reply and chunk maps */
-       offset = 0;
-@@ -527,9 +536,6 @@ next_sge:
- #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
-         while (rqstp->rq_resused)
-                 rqstp->rq_respages[--rqstp->rq_resused] = NULL;
--#else
--      while (rqstp->rq_next_page != rqstp->rq_respages)
--              *(--rqstp->rq_next_page) = NULL;
- #endif
-       return err;
-@@ -558,7 +564,7 @@ static int rdma_read_complete(struct svc
- #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
-         rqstp->rq_resused = 0;
- #else
--      rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
-+      rqstp->rq_next_page = rqstp->rq_respages + 1;
- #endif
-       /* Rebuild rq_arg head and tail. */
index 6d184fb70ebf35f1a8477f9248345c2e721ea116..c9a0bd664012ed699079f9dc3fb1164821d3bf2a 100644 (file)
@@ -55,28 +55,15 @@ diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_r
 index xxxxxxx..xxxxxxx xxxxxx
 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
 +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
-@@ -524,8 +524,13 @@ next_sge:
-        * Detach res pages. If svc_release sees any it will attempt to
-        * put them.
-        */
-+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
-+        while (rqstp->rq_resused)
-+                rqstp->rq_respages[--rqstp->rq_resused] = NULL;
-+#else
-       while (rqstp->rq_next_page != rqstp->rq_respages)
-               *(--rqstp->rq_next_page) = NULL;
-+#endif
-       return err;
- }
-@@ -550,7 +555,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
+@@ -550,7 +556,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
  
        /* rq_respages starts after the last arg page */
        rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+-      rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
 +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
 +        rqstp->rq_resused = 0;
 +#else
-       rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
++       rqstp->rq_next_page = rqstp->rq_respages + 1;
 +#endif
  
        /* Rebuild rq_arg head and tail. */
index 84bb97ef8fb6c69b5e3e844f01ccfdff40d22c5b..ec2fc3458560497c3da1bfec4e7ba8232d5b4a00 100644 (file)
@@ -20,31 +20,3 @@ index xxxxxxx..xxxxxxx xxxxxx
                        xprt_rdma_slot_table_entries);
        if (xprt == NULL) {
                dprintk("RPC:       %s: couldn't allocate rpcrdma_xprt\n",
-@@ -450,8 +452,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
- }
- static int
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
- xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-+#else
-+xprt_rdma_reserve_xprt(struct rpc_task *task)
-+#endif
- {
-+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) && !defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
-+      struct rpc_xprt *xprt = task->tk_xprt;
-+#endif
-       struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
-       int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
-@@ -463,7 +472,11 @@ xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-               BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
-       }
-       xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
-       return xprt_reserve_xprt_cong(xprt, task);
-+#else
-+      return xprt_reserve_xprt_cong(task);
-+#endif
- }
- /*