--- /dev/null
+From 2b7bbc963da8d076f263574af4138b5df2e1581f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 12 Mar 2014 12:51:30 -0400
+Subject: [PATCH 101/132] SUNRPC: Fix large reads on NFS/RDMA
+
+After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves
+in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a
+few hundred bytes via NFS/RDMA no longer work. That commit exposed
+a long-standing bug in rpcrdma_inline_fixup().
+
+I reproduce this with an rsize=4096 mount using the cthon04 basic
+tests. Test 5 fails with an EIO error.
+
+For my reproducer, kernel log shows:
+
+ NFS: server cheating in read reply: count 4096 > recvd 0
+
+rpcrdma_inline_fixup() is zeroing the rq_rcv_buf.page_len field,
+and xdr_align_pages() is now returning that value to the READ XDR
+decoder function.
+
+That field is set up by xdr_inline_pages(), called from the READ XDR
+encoder function. As far as I can tell, it is supposed to be left alone
+after that, as it describes the dimensions of the reply buffer, not the
+contents of that buffer.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 4 +---
+ 1 files changed, 1 insertions(+), 3 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index e03725b..96ead52 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+ break;
+ page_base = 0;
+ }
+- rqst->rq_rcv_buf.page_len = olen - copy_len;
+- } else
+- rqst->rq_rcv_buf.page_len = 0;
++ }
+
+ if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
+ curlen = copy_len;
+--
+1.7.1
+
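+Note: rq_rcv_buf is a struct xdr_buf. Its page_len is set once, when the
+READ encoder reserves reply page space, and after this patch the RDMA
+receive path no longer touches it. A rough sketch of that setup (the
+surrounding variable names are illustrative, not taken from this patch):
+
+	/* READ reply encoder: describe where page data will land.
+	 * From here on, xdr_buf::page_len is part of the buffer's
+	 * geometry and the transport must not modify it.
+	 */
+	xdr_inline_pages(&req->rq_rcv_buf, hdrlen,
+			 args->pages, args->pgbase, args->count);
+
+With page_len left intact, xdr_align_pages() reports the real amount of
+page space available instead of zero, and the "server cheating in read
+reply" check no longer fires.
+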
--- /dev/null
+From 3a0799a94c0384a3b275a73267aaa10517b1bf7d Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 12 Mar 2014 12:51:39 -0400
+Subject: [PATCH 102/132] SUNRPC: remove KERN_INFO from dprintk() call sites
+
+The use of KERN_INFO causes garbage characters to appear when
+debugging is enabled.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+---
+ net/sunrpc/xprtrdma/transport.c | 10 +++++-----
+ 1 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 285dc08..1eb9c46 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void)
+ {
+ int rc;
+
+- dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n");
++ dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
+ #ifdef RPC_DEBUG
+ if (sunrpc_table_header) {
+ unregister_sysctl_table(sunrpc_table_header);
+@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void)
+ if (rc)
+ return rc;
+
+- dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n");
++ dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
+
+- dprintk(KERN_INFO "Defaults:\n");
+- dprintk(KERN_INFO "\tSlots %d\n"
++ dprintk("Defaults:\n");
++ dprintk("\tSlots %d\n"
+ "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
+ xprt_rdma_slot_table_entries,
+ xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
+- dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n",
++ dprintk("\tPadding %d\n\tMemreg %d\n",
+ xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
+
+ #ifdef RPC_DEBUG
+--
+1.7.1
+
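+Note: when RPC_DEBUG is enabled, dprintk() already expands to a printk()
+that supplies its own log level, roughly like this (simplified from
+include/linux/sunrpc/debug.h; the exact definition varies by kernel
+version):
+
+	#define dprintk(args...)	dfprintk(FACILITY, ## args)
+	#define dfprintk(fac, args...)					\
+	do {								\
+		if (unlikely(rpc_debug & RPCDBG_##fac))			\
+			printk(KERN_DEFAULT args);			\
+	} while (0)
+
+A KERN_INFO string passed in the format is then just part of the message
+text, so its log-level escape sequence shows up as garbage characters in
+the output.
+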
--- /dev/null
+From c42a01eee74dfd9ba8f8abb7cb81dd9a8839dc7b Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@redhat.com>
+Date: Mon, 10 Mar 2014 11:33:48 -0400
+Subject: [PATCH 103/132] svcrdma: fix printk when memory allocation fails
+
+It retries in 1s, not 1000 jiffies.
+
+Signed-off-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +--
+ 1 files changed, 1 insertions(+), 2 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+index 62e4f9b..25688fa 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void)
+
+ while ((page = alloc_page(GFP_KERNEL)) == NULL) {
+ /* If we can't get memory, wait a bit and try again */
+- printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 "
+- "jiffies.\n");
++ printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
+ schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+ }
+ return page;
+--
+1.7.1
+
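+Note: 1000 jiffies is one second only when CONFIG_HZ=1000; with HZ=250
+it would be four seconds. The sleep itself was already correct because
+it converts explicitly:
+
+	/* roughly one second of sleep, regardless of CONFIG_HZ */
+	schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
+
+so only the printk text needed adjusting.
+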
--- /dev/null
+From 7e4359e2611f95a97037e2b6905eab52f28afbeb Mon Sep 17 00:00:00 2001
+From: Tom Tucker <tom@ogc.us>
+Date: Tue, 25 Mar 2014 15:14:57 -0500
+Subject: [PATCH 104/132] Fix regression in NFSRDMA server
+
+The server regression was caused by the addition of rq_next_page
+(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). A few places that update
+the rq_respages array were missed and must also keep rq_next_page in
+step with it.
+
+Signed-off-by: Tom Tucker <tom@ogc.us>
+Tested-by: Steve Wise <swise@ogc.us>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 12 ++++--------
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1 +
+ 2 files changed, 5 insertions(+), 8 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 0ce7552..8d904e4 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ sge_no++;
+ }
+ rqstp->rq_respages = &rqstp->rq_pages[sge_no];
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* We should never run out of SGE because the limit is defined to
+ * support the max allowed RPC data length
+@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt,
+ */
+ head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ byte_count -= sge_bytes;
+ ch_bytes -= sge_bytes;
+@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
+
+ /* rq_respages points one past arg pages */
+ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+ /* Create the reply and chunk maps */
+ offset = 0;
+@@ -520,13 +523,6 @@ next_sge:
+ for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+ rqstp->rq_pages[ch_no] = NULL;
+
+- /*
+- * Detach res pages. If svc_release sees any it will attempt to
+- * put them.
+- */
+- while (rqstp->rq_next_page != rqstp->rq_respages)
+- *(--rqstp->rq_next_page) = NULL;
+-
+ return err;
+ }
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index c1d124d..11e90f8 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -625,6 +625,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ if (page_no+1 >= sge_no)
+ ctxt->sge[page_no+1].length = 0;
+ }
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ BUG_ON(sge_no > rdma->sc_max_sge);
+ memset(&send_wr, 0, sizeof send_wr);
+ ctxt->wr_op = IB_WR_SEND;
+--
+1.7.1
+
--- /dev/null
+From 3cbe01a94c7b369f943f8a9d40394198d757cdd4 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@redhat.com>
+Date: Mon, 17 Mar 2014 13:10:05 -0400
+Subject: [PATCH 106/132] svcrdma: fix offset calculation for non-page aligned sge entries
+
+The xdr_off value in dma_map_xdr gets passed to ib_dma_map_page as the
+offset into the page to be mapped. This calculation does not correctly
+take into account the case where the data starts at some offset into
+the page. Increment the xdr_off by the page_base to ensure that it is
+respected.
+
+Cc: Tom Tucker <tom@opengridcomputing.com>
+Signed-off-by: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1 +
+ 1 files changed, 1 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 11e90f8..7e024a5 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+ xdr_off -= xdr->head[0].iov_len;
+ if (xdr_off < xdr->page_len) {
+ /* This offset is in the page list */
++ xdr_off += xdr->page_base;
+ page = xdr->pages[xdr_off >> PAGE_SHIFT];
+ xdr_off &= ~PAGE_MASK;
+ } else {
+--
+1.7.1
+
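+Note: a worked example with hypothetical numbers. Suppose the reply's
+xdr_buf has page_base = 512, the caller asks for xdr_off = 4000 bytes
+into the page list, and PAGE_SIZE is 4096:
+
+	xdr_off += xdr->page_base;		  /* 4000 + 512 = 4512   */
+	page = xdr->pages[xdr_off >> PAGE_SHIFT]; /* pages[1]            */
+	xdr_off &= ~PAGE_MASK;			  /* 416 within pages[1] */
+
+Without the page_base adjustment, the code would have mapped pages[0]
+at offset 4000 and handed the wrong bytes to ib_dma_map_page().
+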
--- /dev/null
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cc1445d..f1cd3d3 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -53,6 +53,7 @@
+
+ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
+ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
++#define RPC_CWNDSHIFT (8U) /* backported from linux/sunrpc/xprt.h */
+
+ /*
+ * Interface Adapter -- one per transport instance
--- /dev/null
+From 0fc6c4e7bb287148eb5e949efd89327929d4841d Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Wed, 28 May 2014 10:32:00 -0400
+Subject: [PATCH 108/132] xprtrdma: mind the device's max fast register page list depth
+
+Some RDMA devices don't support a fast register page list depth of
+at least RPCRDMA_MAX_DATA_SEGS, so xprtrdma needs to chunk its fast
+register regions according to the minimum of the device's maximum
+supported depth and RPCRDMA_MAX_DATA_SEGS.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---
+ net/sunrpc/xprtrdma/verbs.c | 47 +++++++++++++++++++++++++++++----------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
+ 3 files changed, 36 insertions(+), 16 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 96ead52..400aa1b 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ /* success. all failures return above */
+ req->rl_nchunks = nchunks;
+
+- BUG_ON(nchunks == 0);
+- BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
+- && (nchunks > 3));
+-
+ /*
+ * finish off header. If write, marshal discrim and nchunks.
+ */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 9372656..55fb09a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ __func__);
+ memreg = RPCRDMA_REGISTER;
+ #endif
++ } else {
++ /* Mind the ia limit on FRMR page list depth */
++ ia->ri_max_frmr_depth = min_t(unsigned int,
++ RPCRDMA_MAX_DATA_SEGS,
++ devattr.max_fast_reg_page_list_len);
+ }
+ break;
+ }
+@@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ ep->rep_attr.srq = NULL;
+ ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+ switch (ia->ri_memreg_strategy) {
+- case RPCRDMA_FRMR:
++ case RPCRDMA_FRMR: {
++ int depth = 7;
++
+ /* Add room for frmr register and invalidate WRs.
+ * 1. FRMR reg WR for head
+ * 2. FRMR invalidate WR for head
+- * 3. FRMR reg WR for pagelist
+- * 4. FRMR invalidate WR for pagelist
++ * 3. N FRMR reg WRs for pagelist
++ * 4. N FRMR invalidate WRs for pagelist
+ * 5. FRMR reg WR for tail
+ * 6. FRMR invalidate WR for tail
+ * 7. The RDMA_SEND WR
+ */
+- ep->rep_attr.cap.max_send_wr *= 7;
++
++ /* Calculate N if the device max FRMR depth is smaller than
++ * RPCRDMA_MAX_DATA_SEGS.
++ */
++ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) {
++ int delta = RPCRDMA_MAX_DATA_SEGS -
++ ia->ri_max_frmr_depth;
++
++ do {
++ depth += 2; /* FRMR reg + invalidate */
++ delta -= ia->ri_max_frmr_depth;
++ } while (delta > 0);
++
++ }
++ ep->rep_attr.cap.max_send_wr *= depth;
+ if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
+- cdata->max_requests = devattr.max_qp_wr / 7;
++ cdata->max_requests = devattr.max_qp_wr / depth;
+ if (!cdata->max_requests)
+ return -EINVAL;
+- ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
++ ep->rep_attr.cap.max_send_wr = cdata->max_requests *
++ depth;
+ }
+ break;
++ }
+ case RPCRDMA_MEMWINDOWS_ASYNC:
+ case RPCRDMA_MEMWINDOWS:
+ /* Add room for mw_binds+unbinds - overkill! */
+@@ -1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ case RPCRDMA_FRMR:
+ for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
+ r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
+- RPCRDMA_MAX_SEGS);
++ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_mr)) {
+ rc = PTR_ERR(r->r.frmr.fr_mr);
+ dprintk("RPC: %s: ib_alloc_fast_reg_mr"
+ " failed %i\n", __func__, rc);
+ goto out;
+ }
+- r->r.frmr.fr_pgl =
+- ib_alloc_fast_reg_page_list(ia->ri_id->device,
+- RPCRDMA_MAX_SEGS);
++ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(
++ ia->ri_id->device,
++ ia->ri_max_frmr_depth);
+ if (IS_ERR(r->r.frmr.fr_pgl)) {
+ rc = PTR_ERR(r->r.frmr.fr_pgl);
+ dprintk("RPC: %s: "
+@@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+ seg1->mr_offset -= pageoff; /* start of page */
+ seg1->mr_len += pageoff;
+ len = -pageoff;
+- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+- *nsegs = RPCRDMA_MAX_DATA_SEGS;
++ if (*nsegs > ia->ri_max_frmr_depth)
++ *nsegs = ia->ri_max_frmr_depth;
+ for (page_no = i = 0; i < *nsegs;) {
+ rpcrdma_map_one(ia, seg, writing);
+ pa = seg->mr_dma;
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cc1445d..98340a3 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -66,6 +66,7 @@ struct rpcrdma_ia {
+ struct completion ri_done;
+ int ri_async_rc;
+ enum rpcrdma_memreg ri_memreg_strategy;
++ unsigned int ri_max_frmr_depth;
+ };
+
+ /*
+--
+1.7.1
+
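+Note: a worked example of the new send WR accounting, with hypothetical
+numbers (assume RPCRDMA_MAX_DATA_SEGS is 64 and the device reports
+max_fast_reg_page_list_len = 20, so ri_max_frmr_depth = 20):
+
+	int depth = 7;		/* head, one pagelist, tail, RDMA_SEND  */
+	int delta = 64 - 20;	/* 44 segments not yet covered          */
+
+	do {
+		depth += 2;	/* one more FRMR reg + invalidate pair  */
+		delta -= 20;
+	} while (delta > 0);	/* 44 -> 24 -> 4 -> -16: three passes   */
+
+	/* depth == 13, so max_send_wr is scaled by 13 instead of 7 */
+
+Each extra pass accounts for another registration/invalidate pair needed
+to cover the page list in ri_max_frmr_depth-sized pieces.
+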
--- /dev/null
+From 4034ba04231f554abb97ad8900a4c1af03f8e21d Mon Sep 17 00:00:00 2001
+From: Allen Andrews <allen.andrews@emulex.com>
+Date: Wed, 28 May 2014 10:32:09 -0400
+Subject: [PATCH 109/132] nfs-rdma: Fix for FMR leaks
+
+Two memory region leaks were found during testing:
+
+1. rpcrdma_buffer_create: While allocating RPCRDMA_FRMR resources,
+ib_alloc_fast_reg_mr is called and then ib_alloc_fast_reg_page_list is
+called. If ib_alloc_fast_reg_page_list returns an error, the routine
+bails out and drops the FRMR region just created by
+ib_alloc_fast_reg_mr, leaking it. Add code to deregister that last
+FRMR if ib_alloc_fast_reg_page_list fails.
+
+2. rpcrdma_buffer_destroy: While cleaning up, the routine frees the MRs
+on the rb_mws list only if rb_send_bufs are present. However, if one of
+the MR allocation requests in rpcrdma_buffer_create fails after some MRs
+have already been added to the rb_mws list, the routine never creates
+any rb_send_bufs and instead jumps to rpcrdma_buffer_destroy, which
+then never frees the MRs on the rb_mws list because no rb_send_bufs
+exist. This leaks every MR placed on the rb_mws list before the failing
+allocation.
+
+Issue (2) was seen during testing. Our adapter had a finite number of
+MRs available, and we created enough connections that our Nth NFS
+connection request hit an MR allocation failure. After the kernel
+cleaned up the resources it had allocated for the Nth connection, we
+noticed that FMRs had been leaked due to the coding error described
+above.
+
+Issue (1) was found during a code review while debugging issue (2).
+
+Signed-off-by: Allen Andrews <allen.andrews@emulex.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 73 ++++++++++++++++++++++--------------------
+ 1 files changed, 38 insertions(+), 35 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 55fb09a..8f9704e 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1081,6 +1081,8 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ dprintk("RPC: %s: "
+ "ib_alloc_fast_reg_page_list "
+ "failed %i\n", __func__, rc);
++
++ ib_dereg_mr(r->r.frmr.fr_mr);
+ goto out;
+ }
+ list_add(&r->mw_list, &buf->rb_mws);
+@@ -1217,41 +1219,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+ kfree(buf->rb_recv_bufs[i]);
+ }
+ if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
+- while (!list_empty(&buf->rb_mws)) {
+- r = list_entry(buf->rb_mws.next,
+- struct rpcrdma_mw, mw_list);
+- list_del(&r->mw_list);
+- switch (ia->ri_memreg_strategy) {
+- case RPCRDMA_FRMR:
+- rc = ib_dereg_mr(r->r.frmr.fr_mr);
+- if (rc)
+- dprintk("RPC: %s:"
+- " ib_dereg_mr"
+- " failed %i\n",
+- __func__, rc);
+- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+- break;
+- case RPCRDMA_MTHCAFMR:
+- rc = ib_dealloc_fmr(r->r.fmr);
+- if (rc)
+- dprintk("RPC: %s:"
+- " ib_dealloc_fmr"
+- " failed %i\n",
+- __func__, rc);
+- break;
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- rc = ib_dealloc_mw(r->r.mw);
+- if (rc)
+- dprintk("RPC: %s:"
+- " ib_dealloc_mw"
+- " failed %i\n",
+- __func__, rc);
+- break;
+- default:
+- break;
+- }
+- }
+ rpcrdma_deregister_internal(ia,
+ buf->rb_send_bufs[i]->rl_handle,
+ &buf->rb_send_bufs[i]->rl_iov);
+@@ -1259,6 +1226,42 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+ }
+ }
+
++ while (!list_empty(&buf->rb_mws)) {
++ r = list_entry(buf->rb_mws.next,
++ struct rpcrdma_mw, mw_list);
++ list_del(&r->mw_list);
++ switch (ia->ri_memreg_strategy) {
++ case RPCRDMA_FRMR:
++ rc = ib_dereg_mr(r->r.frmr.fr_mr);
++ if (rc)
++ dprintk("RPC: %s:"
++ " ib_dereg_mr"
++ " failed %i\n",
++ __func__, rc);
++ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
++ break;
++ case RPCRDMA_MTHCAFMR:
++ rc = ib_dealloc_fmr(r->r.fmr);
++ if (rc)
++ dprintk("RPC: %s:"
++ " ib_dealloc_fmr"
++ " failed %i\n",
++ __func__, rc);
++ break;
++ case RPCRDMA_MEMWINDOWS_ASYNC:
++ case RPCRDMA_MEMWINDOWS:
++ rc = ib_dealloc_mw(r->r.mw);
++ if (rc)
++ dprintk("RPC: %s:"
++ " ib_dealloc_mw"
++ " failed %i\n",
++ __func__, rc);
++ break;
++ default:
++ break;
++ }
++ }
++
+ kfree(buf->rb_pool);
+ }
+
+--
+1.7.1
+
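+Note: the fix for issue (1) is the usual "unwind the previous step
+before bailing out" pattern; a simplified sketch of the hunk above:
+
+	r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, depth);
+	if (IS_ERR(r->r.frmr.fr_mr))
+		goto out;			/* nothing to undo yet    */
+
+	r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
+	if (IS_ERR(r->r.frmr.fr_pgl)) {
+		ib_dereg_mr(r->r.frmr.fr_mr);	/* undo the MR just made  */
+		goto out;
+	}
+
+For issue (2), moving the rb_mws teardown out of the rb_send_bufs branch
+means MRs already on the list are released even when buffer creation
+failed before any send buffers existed.
+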
--- /dev/null
+From 254f91e2fa1f4cc18fd2eb9d5481888ffe126d5b Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:17 -0400
+Subject: [PATCH 110/132] xprtrdma: RPC/RDMA must invoke xprt_wake_pending_tasks() in process context
+
+An IB provider can invoke rpcrdma_conn_func() in an IRQ context,
+thus rpcrdma_conn_func() cannot be allowed to directly invoke
+generic RPC functions like xprt_wake_pending_tasks().
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 22 +++++++++++++++-------
+ net/sunrpc/xprtrdma/verbs.c | 3 +++
+ net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++
+ 3 files changed, 21 insertions(+), 7 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 400aa1b..c296468 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -676,15 +676,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
+ rqst->rq_private_buf = rqst->rq_rcv_buf;
+ }
+
+-/*
+- * This function is called when an async event is posted to
+- * the connection which changes the connection state. All it
+- * does at this point is mark the connection up/down, the rpc
+- * timers do the rest.
+- */
+ void
+-rpcrdma_conn_func(struct rpcrdma_ep *ep)
++rpcrdma_connect_worker(struct work_struct *work)
+ {
++ struct rpcrdma_ep *ep =
++ container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
+ struct rpc_xprt *xprt = ep->rep_xprt;
+
+ spin_lock_bh(&xprt->transport_lock);
+@@ -701,6 +697,18 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
+ }
+
+ /*
++ * This function is called when an async event is posted to
++ * the connection which changes the connection state. All it
++ * does at this point is mark the connection up/down, the rpc
++ * timers do the rest.
++ */
++void
++rpcrdma_conn_func(struct rpcrdma_ep *ep)
++{
++ schedule_delayed_work(&ep->rep_connect_worker, 0);
++}
++
++/*
+ * This function is called when memory window unbind which we are waiting
+ * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+ */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 8f9704e..9cb88f3 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -742,6 +742,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ INIT_CQCOUNT(ep);
+ ep->rep_ia = ia;
+ init_waitqueue_head(&ep->rep_connect_wait);
++ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+ /*
+ * Create a single cq for receive dto and mw_bind (only ever
+@@ -817,6 +818,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ dprintk("RPC: %s: entering, connected is %d\n",
+ __func__, ep->rep_connected);
+
++ cancel_delayed_work_sync(&ep->rep_connect_worker);
++
+ if (ia->ri_id->qp) {
+ rc = rpcrdma_ep_disconnect(ep, ia);
+ if (rc)
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 98340a3..c620d13 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -43,6 +43,7 @@
+ #include <linux/wait.h> /* wait_queue_head_t, etc */
+ #include <linux/spinlock.h> /* spinlock_t, etc */
+ #include <linux/atomic.h> /* atomic_t, etc */
++#include <linux/workqueue.h> /* struct work_struct */
+
+ #include <rdma/rdma_cm.h> /* RDMA connection api */
+ #include <rdma/ib_verbs.h> /* RDMA verbs api */
+@@ -87,6 +88,7 @@ struct rpcrdma_ep {
+ struct rpc_xprt *rep_xprt; /* for rep_func */
+ struct rdma_conn_param rep_remote_cma;
+ struct sockaddr_storage rep_remote_addr;
++ struct delayed_work rep_connect_worker;
+ };
+
+ #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+@@ -336,6 +338,7 @@ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+ /*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+ */
++void rpcrdma_connect_worker(struct work_struct *);
+ void rpcrdma_conn_func(struct rpcrdma_ep *);
+ void rpcrdma_reply_handler(struct rpcrdma_rep *);
+
+--
+1.7.1
+
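+Note: this is the standard "defer from interrupt to process context via
+a workqueue" idiom. Pulling the pieces of the hunks above together:
+
+	/* set up once, at endpoint creation time */
+	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+	/* may be called from a provider's IRQ context */
+	void rpcrdma_conn_func(struct rpcrdma_ep *ep)
+	{
+		schedule_delayed_work(&ep->rep_connect_worker, 0);
+	}
+
+	/* runs later in process context, where taking transport_lock
+	 * and calling xprt_wake_pending_tasks() is safe */
+	void rpcrdma_connect_worker(struct work_struct *work) { ... }
+
+The cancel_delayed_work_sync() added to rpcrdma_ep_destroy() guarantees
+the worker is not still running when the endpoint goes away.
+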
--- /dev/null
+From 03ff8821eb5ed168792667cfc3ddff903e97af99 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:26 -0400
+Subject: [PATCH 111/132] xprtrdma: Remove BOUNCEBUFFERS memory registration mode
+
+Clean up: This memory registration mode is slow and was never
+meant for use in production environments. Remove it to reduce
+implementation complexity.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 11 -----------
+ net/sunrpc/xprtrdma/transport.c | 13 -------------
+ net/sunrpc/xprtrdma/verbs.c | 5 +----
+ 3 files changed, 1 insertions(+), 28 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index c296468..02b2941 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -77,9 +77,6 @@ static const char transfertypes[][12] = {
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
+- *
+- * Note, this routine is never called if the connection's memory
+- * registration strategy is 0 (bounce buffers).
+ */
+
+ static int
+@@ -439,14 +436,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ wtype = rpcrdma_noch;
+ BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
+
+- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS &&
+- (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) {
+- /* forced to "pure inline"? */
+- dprintk("RPC: %s: too much data (%d/%d) for inline\n",
+- __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len);
+- return -1;
+- }
+-
+ hdrlen = 28; /*sizeof *headerp;*/
+ padlen = 0;
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 1eb9c46..8c5035a 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -503,18 +503,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+ * If the allocation or registration fails, the RPC framework
+ * will (doggedly) retry.
+ */
+- if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy ==
+- RPCRDMA_BOUNCEBUFFERS) {
+- /* forced to "pure inline" */
+- dprintk("RPC: %s: too much data (%zd) for inline "
+- "(r/w max %d/%d)\n", __func__, size,
+- rpcx_to_rdmad(xprt).inline_rsize,
+- rpcx_to_rdmad(xprt).inline_wsize);
+- size = req->rl_size;
+- rpc_exit(task, -EIO); /* fail the operation */
+- rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
+- goto out;
+- }
+ if (task->tk_flags & RPC_TASK_SWAPPER)
+ nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
+ else
+@@ -543,7 +531,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+ req = nreq;
+ }
+ dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
+-out:
+ req->rl_connect_cookie = 0; /* our reserved value */
+ return req->rl_xdr_buf;
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 9cb88f3..4a4e4ea 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -557,7 +557,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ * adapter.
+ */
+ switch (memreg) {
+- case RPCRDMA_BOUNCEBUFFERS:
+ case RPCRDMA_REGISTER:
+ case RPCRDMA_FRMR:
+ break;
+@@ -778,9 +777,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+
+ /* Client offers RDMA Read but does not initiate */
+ ep->rep_remote_cma.initiator_depth = 0;
+- if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
+- ep->rep_remote_cma.responder_resources = 0;
+- else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
++ if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ ep->rep_remote_cma.responder_resources = 32;
+ else
+ ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
+--
+1.7.1
+
--- /dev/null
+From b45ccfd25d506e83d9ecf93d0ac7edf031d35d2f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:34 -0400
+Subject: [PATCH 112/132] xprtrdma: Remove MEMWINDOWS registration modes
+
+The MEMWINDOWS and MEMWINDOWS_ASYNC memory registration modes were
+intended as stop-gap modes before the introduction of FRMR. They
+are now considered obsolete.
+
+MEMWINDOWS_ASYNC is also considered unsafe because it can leave
+client memory registered and exposed for an indeterminate time after
+each I/O.
+
+At this point, the MEMWINDOWS modes add needless complexity, so
+remove them.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 34 +--------
+ net/sunrpc/xprtrdma/transport.c | 9 +--
+ net/sunrpc/xprtrdma/verbs.c | 165 +-------------------------------------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 2 -
+ 4 files changed, 7 insertions(+), 203 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 02b2941..46b5172 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -199,7 +199,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ return 0;
+
+ do {
+- /* bind/register the memory, then build chunk from result. */
+ int n = rpcrdma_register_external(seg, nsegs,
+ cur_wchunk != NULL, r_xprt);
+ if (n <= 0)
+@@ -698,16 +697,6 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
+ }
+
+ /*
+- * This function is called when memory window unbind which we are waiting
+- * for completes. Just use rr_func (zeroed by upcall) to signal completion.
+- */
+-static void
+-rpcrdma_unbind_func(struct rpcrdma_rep *rep)
+-{
+- wake_up(&rep->rr_unbind);
+-}
+-
+-/*
+ * Called as a tasklet to do req/reply match and complete a request
+ * Errors must result in the RPC task either being awakened, or
+ * allowed to timeout, to discover the errors at that time.
+@@ -721,7 +710,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+ struct rpc_xprt *xprt = rep->rr_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ __be32 *iptr;
+- int i, rdmalen, status;
++ int rdmalen, status;
+
+ /* Check status. If bad, signal disconnect and return rep to pool */
+ if (rep->rr_len == ~0U) {
+@@ -850,27 +839,6 @@ badheader:
+ break;
+ }
+
+- /* If using mw bind, start the deregister process now. */
+- /* (Note: if mr_free(), cannot perform it here, in tasklet context) */
+- if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) {
+- case RPCRDMA_MEMWINDOWS:
+- for (i = 0; req->rl_nchunks-- > 1;)
+- i += rpcrdma_deregister_external(
+- &req->rl_segments[i], r_xprt, NULL);
+- /* Optionally wait (not here) for unbinds to complete */
+- rep->rr_func = rpcrdma_unbind_func;
+- (void) rpcrdma_deregister_external(&req->rl_segments[i],
+- r_xprt, rep);
+- break;
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- for (i = 0; req->rl_nchunks--;)
+- i += rpcrdma_deregister_external(&req->rl_segments[i],
+- r_xprt, NULL);
+- break;
+- default:
+- break;
+- }
+-
+ dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+ __func__, xprt, rqst, status);
+ xprt_complete_rqst(rqst->rq_task, status);
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 8c5035a..c23b0c1 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -566,9 +566,7 @@ xprt_rdma_free(void *buffer)
+ __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");
+
+ /*
+- * Finish the deregistration. When using mw bind, this was
+- * begun in rpcrdma_reply_handler(). In all other modes, we
+- * do it here, in thread context. The process is considered
++ * Finish the deregistration. The process is considered
+ * complete when the rr_func vector becomes NULL - this
+ * was put in place during rpcrdma_reply_handler() - the wait
+ * call below will not block if the dereg is "done". If
+@@ -580,11 +578,6 @@ xprt_rdma_free(void *buffer)
+ &req->rl_segments[i], r_xprt, NULL);
+ }
+
+- if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) {
+- rep->rr_func = NULL; /* abandon the callback */
+- req->rl_reply = NULL;
+- }
+-
+ if (req->rl_iov.length == 0) { /* see allocate above */
+ struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
+ oreq->rl_reply = req->rl_reply;
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 4a4e4ea..304c7ad 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -152,7 +152,7 @@ void rpcrdma_event_process(struct ib_wc *wc)
+ dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
+ __func__, rep, wc->status, wc->opcode, wc->byte_len);
+
+- if (!rep) /* send or bind completion that we don't care about */
++ if (!rep) /* send completion that we don't care about */
+ return;
+
+ if (IB_WC_SUCCESS != wc->status) {
+@@ -197,8 +197,6 @@ void rpcrdma_event_process(struct ib_wc *wc)
+ }
+ atomic_set(&rep->rr_buffer->rb_credits, credits);
+ }
+- /* fall through */
+- case IB_WC_BIND_MW:
+ rpcrdma_schedule_tasklet(rep);
+ break;
+ default:
+@@ -233,7 +231,7 @@ rpcrdma_cq_poll(struct ib_cq *cq)
+ /*
+ * rpcrdma_cq_event_upcall
+ *
+- * This upcall handles recv, send, bind and unbind events.
++ * This upcall handles recv and send events.
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+@@ -494,16 +492,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ }
+
+ switch (memreg) {
+- case RPCRDMA_MEMWINDOWS:
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
+- dprintk("RPC: %s: MEMWINDOWS registration "
+- "specified but not supported by adapter, "
+- "using slower RPCRDMA_REGISTER\n",
+- __func__);
+- memreg = RPCRDMA_REGISTER;
+- }
+- break;
+ case RPCRDMA_MTHCAFMR:
+ if (!ia->ri_id->device->alloc_fmr) {
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+@@ -567,16 +555,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ IB_ACCESS_REMOTE_READ;
+ goto register_setup;
+ #endif
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- mem_priv = IB_ACCESS_LOCAL_WRITE |
+- IB_ACCESS_MW_BIND;
+- goto register_setup;
+ case RPCRDMA_MTHCAFMR:
+ if (ia->ri_have_dma_lkey)
+ break;
+ mem_priv = IB_ACCESS_LOCAL_WRITE;
++#if RPCRDMA_PERSISTENT_REGISTRATION
+ register_setup:
++#endif
+ ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+ if (IS_ERR(ia->ri_bind_mem)) {
+ printk(KERN_ALERT "%s: ib_get_dma_mr for "
+@@ -699,14 +684,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ }
+ break;
+ }
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- /* Add room for mw_binds+unbinds - overkill! */
+- ep->rep_attr.cap.max_send_wr++;
+- ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
+- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
+- return -EINVAL;
+- break;
+ default:
+ break;
+ }
+@@ -728,14 +705,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+
+ /* set trigger for requesting send completion */
+ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
+- switch (ia->ri_memreg_strategy) {
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
+- break;
+- default:
+- break;
+- }
+ if (ep->rep_cqinit <= 2)
+ ep->rep_cqinit = 0;
+ INIT_CQCOUNT(ep);
+@@ -743,11 +712,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+- /*
+- * Create a single cq for receive dto and mw_bind (only ever
+- * care about unbind, really). Send completions are suppressed.
+- * Use single threaded tasklet upcalls to maintain ordering.
+- */
+ ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
+ rpcrdma_cq_async_error_upcall, NULL,
+ ep->rep_attr.cap.max_recv_wr +
+@@ -1020,11 +984,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+ sizeof(struct rpcrdma_mw);
+ break;
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
+- sizeof(struct rpcrdma_mw);
+- break;
+ default:
+ break;
+ }
+@@ -1055,11 +1014,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ }
+ p += cdata->padding;
+
+- /*
+- * Allocate the fmr's, or mw's for mw_bind chunk registration.
+- * We "cycle" the mw's in order to minimize rkey reuse,
+- * and also reduce unbind-to-bind collision.
+- */
+ INIT_LIST_HEAD(&buf->rb_mws);
+ r = (struct rpcrdma_mw *)p;
+ switch (ia->ri_memreg_strategy) {
+@@ -1107,21 +1061,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ ++r;
+ }
+ break;
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- /* Allocate one extra request's worth, for full cycling */
+- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
+- r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1);
+- if (IS_ERR(r->r.mw)) {
+- rc = PTR_ERR(r->r.mw);
+- dprintk("RPC: %s: ib_alloc_mw"
+- " failed %i\n", __func__, rc);
+- goto out;
+- }
+- list_add(&r->mw_list, &buf->rb_mws);
+- ++r;
+- }
+- break;
+ default:
+ break;
+ }
+@@ -1170,7 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ memset(rep, 0, sizeof(struct rpcrdma_rep));
+ buf->rb_recv_bufs[i] = rep;
+ buf->rb_recv_bufs[i]->rr_buffer = buf;
+- init_waitqueue_head(&rep->rr_unbind);
+
+ rc = rpcrdma_register_internal(ia, rep->rr_base,
+ len - offsetof(struct rpcrdma_rep, rr_base),
+@@ -1204,7 +1142,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+
+ /* clean up in reverse order from create
+ * 1. recv mr memory (mr free, then kfree)
+- * 1a. bind mw memory
+ * 2. send mr memory (mr free, then kfree)
+ * 3. padding (if any) [moved to rpcrdma_ep_destroy]
+ * 4. arrays
+@@ -1248,15 +1185,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
+ " failed %i\n",
+ __func__, rc);
+ break;
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- rc = ib_dealloc_mw(r->r.mw);
+- if (rc)
+- dprintk("RPC: %s:"
+- " ib_dealloc_mw"
+- " failed %i\n",
+- __func__, rc);
+- break;
+ default:
+ break;
+ }
+@@ -1331,15 +1259,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
+ req->rl_niovs = 0;
+ if (req->rl_reply) {
+ buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
+- init_waitqueue_head(&req->rl_reply->rr_unbind);
+ req->rl_reply->rr_func = NULL;
+ req->rl_reply = NULL;
+ }
+ switch (ia->ri_memreg_strategy) {
+ case RPCRDMA_FRMR:
+ case RPCRDMA_MTHCAFMR:
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+ /*
+ * Cycle mw's back in reverse order, and "spin" them.
+ * This delays and scrambles reuse as much as possible.
+@@ -1384,8 +1309,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
+
+ /*
+ * Put reply buffers back into pool when not attached to
+- * request. This happens in error conditions, and when
+- * aborting unbinds. Pre-decrement counter/array index.
++ * request. This happens in error conditions.
+ */
+ void
+ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
+@@ -1688,74 +1612,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+ }
+
+ static int
+-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
+- int *nsegs, int writing, struct rpcrdma_ia *ia,
+- struct rpcrdma_xprt *r_xprt)
+-{
+- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+- IB_ACCESS_REMOTE_READ);
+- struct ib_mw_bind param;
+- int rc;
+-
+- *nsegs = 1;
+- rpcrdma_map_one(ia, seg, writing);
+- param.bind_info.mr = ia->ri_bind_mem;
+- param.wr_id = 0ULL; /* no send cookie */
+- param.bind_info.addr = seg->mr_dma;
+- param.bind_info.length = seg->mr_len;
+- param.send_flags = 0;
+- param.bind_info.mw_access_flags = mem_priv;
+-
+- DECR_CQCOUNT(&r_xprt->rx_ep);
+- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+- if (rc) {
+- dprintk("RPC: %s: failed ib_bind_mw "
+- "%u@0x%llx status %i\n",
+- __func__, seg->mr_len,
+- (unsigned long long)seg->mr_dma, rc);
+- rpcrdma_unmap_one(ia, seg);
+- } else {
+- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
+- seg->mr_base = param.bind_info.addr;
+- seg->mr_nsegs = 1;
+- }
+- return rc;
+-}
+-
+-static int
+-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
+- struct rpcrdma_ia *ia,
+- struct rpcrdma_xprt *r_xprt, void **r)
+-{
+- struct ib_mw_bind param;
+- LIST_HEAD(l);
+- int rc;
+-
+- BUG_ON(seg->mr_nsegs != 1);
+- param.bind_info.mr = ia->ri_bind_mem;
+- param.bind_info.addr = 0ULL; /* unbind */
+- param.bind_info.length = 0;
+- param.bind_info.mw_access_flags = 0;
+- if (*r) {
+- param.wr_id = (u64) (unsigned long) *r;
+- param.send_flags = IB_SEND_SIGNALED;
+- INIT_CQCOUNT(&r_xprt->rx_ep);
+- } else {
+- param.wr_id = 0ULL;
+- param.send_flags = 0;
+- DECR_CQCOUNT(&r_xprt->rx_ep);
+- }
+- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
+- rpcrdma_unmap_one(ia, seg);
+- if (rc)
+- dprintk("RPC: %s: failed ib_(un)bind_mw,"
+- " status %i\n", __func__, rc);
+- else
+- *r = NULL; /* will upcall on completion */
+- return rc;
+-}
+-
+-static int
+ rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+ int *nsegs, int writing, struct rpcrdma_ia *ia)
+ {
+@@ -1845,12 +1701,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+ rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+ break;
+
+- /* Registration using memory windows */
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
+- break;
+-
+ /* Default registration each time */
+ default:
+ rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
+@@ -1887,11 +1737,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+ rc = rpcrdma_deregister_fmr_external(seg, ia);
+ break;
+
+- case RPCRDMA_MEMWINDOWS_ASYNC:
+- case RPCRDMA_MEMWINDOWS:
+- rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
+- break;
+-
+ default:
+ rc = rpcrdma_deregister_default_external(seg, ia);
+ break;
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index c620d13..bf08ee0 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -127,7 +127,6 @@ struct rpcrdma_rep {
+ struct rpc_xprt *rr_xprt; /* needed for request/reply matching */
+ void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
+ struct list_head rr_list; /* tasklet list */
+- wait_queue_head_t rr_unbind; /* optional unbind wait */
+ struct ib_sge rr_iov; /* for posting */
+ struct ib_mr *rr_handle; /* handle for mem in rr_iov */
+ char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
+@@ -162,7 +161,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
+ struct ib_mr *rl_mr; /* if registered directly */
+ struct rpcrdma_mw { /* if registered from region */
+ union {
+- struct ib_mw *mw;
+ struct ib_fmr *fmr;
+ struct {
+ struct ib_fast_reg_page_list *fr_pgl;
+--
+1.7.1
+
--- /dev/null
+From 0ac531c1832318efa3dc3d723e356a7e09330e80 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:43 -0400
+Subject: [PATCH 113/132] xprtrdma: Remove REGISTER memory registration mode
+
+All kernel RDMA providers except amso1100 support either MTHCAFMR
+or FRMR, both of which are faster than REGISTER. amso1100 can
+continue to use ALLPHYSICAL.
+
+The only other ULP consumer in the kernel that uses the reg_phys_mr
+verb is Lustre.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 3 +-
+ net/sunrpc/xprtrdma/verbs.c | 90 ++--------------------------------------
+ 2 files changed, 5 insertions(+), 88 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 46b5172..aae1726 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -476,8 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ * on receive. Therefore, we request a reply chunk
+ * for non-writes wherever feasible and efficient.
+ */
+- if (wtype == rpcrdma_noch &&
+- r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER)
++ if (wtype == rpcrdma_noch)
+ wtype = rpcrdma_replych;
+ }
+ }
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 304c7ad..6bb9a07 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -494,19 +494,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ switch (memreg) {
+ case RPCRDMA_MTHCAFMR:
+ if (!ia->ri_id->device->alloc_fmr) {
+-#if RPCRDMA_PERSISTENT_REGISTRATION
+ dprintk("RPC: %s: MTHCAFMR registration "
+ "specified but not supported by adapter, "
+ "using riskier RPCRDMA_ALLPHYSICAL\n",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+-#else
+- dprintk("RPC: %s: MTHCAFMR registration "
+- "specified but not supported by adapter, "
+- "using slower RPCRDMA_REGISTER\n",
+- __func__);
+- memreg = RPCRDMA_REGISTER;
+-#endif
+ }
+ break;
+ case RPCRDMA_FRMR:
+@@ -514,19 +506,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ if ((devattr.device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+-#if RPCRDMA_PERSISTENT_REGISTRATION
+ dprintk("RPC: %s: FRMR registration "
+ "specified but not supported by adapter, "
+ "using riskier RPCRDMA_ALLPHYSICAL\n",
+ __func__);
+ memreg = RPCRDMA_ALLPHYSICAL;
+-#else
+- dprintk("RPC: %s: FRMR registration "
+- "specified but not supported by adapter, "
+- "using slower RPCRDMA_REGISTER\n",
+- __func__);
+- memreg = RPCRDMA_REGISTER;
+-#endif
+ } else {
+ /* Mind the ia limit on FRMR page list depth */
+ ia->ri_max_frmr_depth = min_t(unsigned int,
+@@ -545,7 +529,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ * adapter.
+ */
+ switch (memreg) {
+- case RPCRDMA_REGISTER:
+ case RPCRDMA_FRMR:
+ break;
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+@@ -565,11 +548,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
+ if (IS_ERR(ia->ri_bind_mem)) {
+ printk(KERN_ALERT "%s: ib_get_dma_mr for "
+- "phys register failed with %lX\n\t"
+- "Will continue with degraded performance\n",
++ "phys register failed with %lX\n",
+ __func__, PTR_ERR(ia->ri_bind_mem));
+- memreg = RPCRDMA_REGISTER;
+- ia->ri_bind_mem = NULL;
++ rc = -ENOMEM;
++ goto out2;
+ }
+ break;
+ default:
+@@ -1611,67 +1593,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
+ return rc;
+ }
+
+-static int
+-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
+- int *nsegs, int writing, struct rpcrdma_ia *ia)
+-{
+- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
+- IB_ACCESS_REMOTE_READ);
+- struct rpcrdma_mr_seg *seg1 = seg;
+- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
+- int len, i, rc = 0;
+-
+- if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
+- *nsegs = RPCRDMA_MAX_DATA_SEGS;
+- for (len = 0, i = 0; i < *nsegs;) {
+- rpcrdma_map_one(ia, seg, writing);
+- ipb[i].addr = seg->mr_dma;
+- ipb[i].size = seg->mr_len;
+- len += seg->mr_len;
+- ++seg;
+- ++i;
+- /* Check for holes */
+- if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
+- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
+- break;
+- }
+- seg1->mr_base = seg1->mr_dma;
+- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
+- ipb, i, mem_priv, &seg1->mr_base);
+- if (IS_ERR(seg1->mr_chunk.rl_mr)) {
+- rc = PTR_ERR(seg1->mr_chunk.rl_mr);
+- dprintk("RPC: %s: failed ib_reg_phys_mr "
+- "%u@0x%llx (%d)... status %i\n",
+- __func__, len,
+- (unsigned long long)seg1->mr_dma, i, rc);
+- while (i--)
+- rpcrdma_unmap_one(ia, --seg);
+- } else {
+- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
+- seg1->mr_nsegs = i;
+- seg1->mr_len = len;
+- }
+- *nsegs = i;
+- return rc;
+-}
+-
+-static int
+-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
+- struct rpcrdma_ia *ia)
+-{
+- struct rpcrdma_mr_seg *seg1 = seg;
+- int rc;
+-
+- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
+- seg1->mr_chunk.rl_mr = NULL;
+- while (seg1->mr_nsegs--)
+- rpcrdma_unmap_one(ia, seg++);
+- if (rc)
+- dprintk("RPC: %s: failed ib_dereg_mr,"
+- " status %i\n", __func__, rc);
+- return rc;
+-}
+-
+ int
+ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+ int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
+@@ -1701,10 +1622,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+ rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
+ break;
+
+- /* Default registration each time */
+ default:
+- rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
+- break;
++ return -1;
+ }
+ if (rc)
+ return -1;
+@@ -1738,7 +1657,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+ break;
+
+ default:
+- rc = rpcrdma_deregister_default_external(seg, ia);
+ break;
+ }
+ if (r) {
+--
+1.7.1
+
--- /dev/null
+From f10eafd3a6ce9da7e96999c124b643ea6c4921f3 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:32:51 -0400
+Subject: [PATCH 114/132] xprtrdma: Fall back to MTHCAFMR when FRMR is not supported
+
+An audit of in-kernel RDMA providers that do not support the FRMR
+memory registration shows that several of them support MTHCAFMR.
+Prefer MTHCAFMR when FRMR is not supported.
+
+If MTHCAFMR is not supported, only then choose ALLPHYSICAL.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 31 +++++++++++++++----------------
+ 1 files changed, 15 insertions(+), 16 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 6bb9a07..a352798 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -491,33 +491,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
+ }
+
+- switch (memreg) {
+- case RPCRDMA_MTHCAFMR:
+- if (!ia->ri_id->device->alloc_fmr) {
+- dprintk("RPC: %s: MTHCAFMR registration "
+- "specified but not supported by adapter, "
+- "using riskier RPCRDMA_ALLPHYSICAL\n",
+- __func__);
+- memreg = RPCRDMA_ALLPHYSICAL;
+- }
+- break;
+- case RPCRDMA_FRMR:
++ if (memreg == RPCRDMA_FRMR) {
+ /* Requires both frmr reg and local dma lkey */
+ if ((devattr.device_cap_flags &
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
+ (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
+ dprintk("RPC: %s: FRMR registration "
+- "specified but not supported by adapter, "
+- "using riskier RPCRDMA_ALLPHYSICAL\n",
+- __func__);
+- memreg = RPCRDMA_ALLPHYSICAL;
++ "not supported by HCA\n", __func__);
++ memreg = RPCRDMA_MTHCAFMR;
+ } else {
+ /* Mind the ia limit on FRMR page list depth */
+ ia->ri_max_frmr_depth = min_t(unsigned int,
+ RPCRDMA_MAX_DATA_SEGS,
+ devattr.max_fast_reg_page_list_len);
+ }
+- break;
++ }
++ if (memreg == RPCRDMA_MTHCAFMR) {
++ if (!ia->ri_id->device->alloc_fmr) {
++ dprintk("RPC: %s: MTHCAFMR registration "
++ "not supported by HCA\n", __func__);
++#if RPCRDMA_PERSISTENT_REGISTRATION
++ memreg = RPCRDMA_ALLPHYSICAL;
++#else
++ rc = -EINVAL;
++ goto out2;
++#endif
++ }
+ }
+
+ /*
+--
+1.7.1
+
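+Note: after this patch the registration mode selection reads as a
+fallback chain rather than a switch; a sketch of the resulting flow,
+where hca_supports_frmr() is a made-up stand-in for the
+device_cap_flags test in the hunk above:
+
+	if (memreg == RPCRDMA_FRMR && !hca_supports_frmr(&devattr))
+		memreg = RPCRDMA_MTHCAFMR;	/* try FMR next       */
+
+	if (memreg == RPCRDMA_MTHCAFMR && !ia->ri_id->device->alloc_fmr) {
+#if RPCRDMA_PERSISTENT_REGISTRATION
+		memreg = RPCRDMA_ALLPHYSICAL;	/* last resort        */
+#else
+		rc = -EINVAL;			/* nothing usable     */
+		goto out2;
+#endif
+	}
+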
--- /dev/null
+From cdd9ade711599e7672a635add0406080856f8b92 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:00 -0400
+Subject: [PATCH 115/132] xprtrdma: mount reports "Invalid mount option" if memreg mode not supported
+
+If the selected memory registration mode is not supported by the
+underlying provider/HCA, the NFS mount command reports that there was
+an invalid mount option, and fails. This is misleading.
+
+Reporting a problem allocating memory is a lot closer to the truth.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 8 ++++----
+ 1 files changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index a352798..7c7e9b4 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -513,7 +513,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+ memreg = RPCRDMA_ALLPHYSICAL;
+ #else
+- rc = -EINVAL;
++ rc = -ENOMEM;
+ goto out2;
+ #endif
+ }
+@@ -554,9 +554,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
+ }
+ break;
+ default:
+- printk(KERN_ERR "%s: invalid memory registration mode %d\n",
+- __func__, memreg);
+- rc = -EINVAL;
++ printk(KERN_ERR "RPC: Unsupported memory "
++ "registration mode: %d\n", memreg);
++ rc = -ENOMEM;
+ goto out2;
+ }
+ dprintk("RPC: %s: memory registration strategy is %d\n",
+--
+1.7.1
+
--- /dev/null
+From 13c9ff8f673862b69e795ea99a237b461c557eb3 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:08 -0400
+Subject: [PATCH 116/132] xprtrdma: Simplify rpcrdma_deregister_external() synopsis
+
+Clean up: All remaining callers of rpcrdma_deregister_external()
+pass NULL as the last argument, so remove that argument.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 2 +-
+ net/sunrpc/xprtrdma/transport.c | 2 +-
+ net/sunrpc/xprtrdma/verbs.c | 8 +-------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
+ 4 files changed, 4 insertions(+), 10 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index aae1726..436d229 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -270,7 +270,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ out:
+ for (pos = 0; nchunks--;)
+ pos += rpcrdma_deregister_external(
+- &req->rl_segments[pos], r_xprt, NULL);
++ &req->rl_segments[pos], r_xprt);
+ return 0;
+ }
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index c23b0c1..430cabb 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -575,7 +575,7 @@ xprt_rdma_free(void *buffer)
+ for (i = 0; req->rl_nchunks;) {
+ --req->rl_nchunks;
+ i += rpcrdma_deregister_external(
+- &req->rl_segments[i], r_xprt, NULL);
++ &req->rl_segments[i], r_xprt);
+ }
+
+ if (req->rl_iov.length == 0) { /* see allocate above */
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 7c7e9b4..0cbc83c 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1632,7 +1632,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
+
+ int
+ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+- struct rpcrdma_xprt *r_xprt, void *r)
++ struct rpcrdma_xprt *r_xprt)
+ {
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ int nsegs = seg->mr_nsegs, rc;
+@@ -1658,12 +1658,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+ default:
+ break;
+ }
+- if (r) {
+- struct rpcrdma_rep *rep = r;
+- void (*func)(struct rpcrdma_rep *) = rep->rr_func;
+- rep->rr_func = NULL;
+- func(rep); /* dereg done, callback now */
+- }
+ return nsegs;
+ }
+
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index bf08ee0..3f44d6a 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -331,7 +331,7 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *,
+ int rpcrdma_register_external(struct rpcrdma_mr_seg *,
+ int, int, struct rpcrdma_xprt *);
+ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
+- struct rpcrdma_xprt *, void *);
++ struct rpcrdma_xprt *);
+
+ /*
+ * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
+--
+1.7.1
+
--- /dev/null
+From 7f1d54191ed6fa0f79f584fe3ebf6519738e817f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:16 -0400
+Subject: [PATCH 117/132] xprtrdma: Make rpcrdma_ep_destroy() return void
+
+Clean up: rpcrdma_ep_destroy() returns a value that is used
+only to print a debugging message. rpcrdma_ep_destroy() already
+prints debugging messages in all error cases.
+
+Make rpcrdma_ep_destroy() return void instead.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c | 8 ++------
+ net/sunrpc/xprtrdma/verbs.c | 7 +------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 2 +-
+ 3 files changed, 4 insertions(+), 13 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 430cabb..d18b2a3 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -229,7 +229,6 @@ static void
+ xprt_rdma_destroy(struct rpc_xprt *xprt)
+ {
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+- int rc;
+
+ dprintk("RPC: %s: called\n", __func__);
+
+@@ -238,10 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
+ xprt_clear_connected(xprt);
+
+ rpcrdma_buffer_destroy(&r_xprt->rx_buf);
+- rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+- if (rc)
+- dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n",
+- __func__, rc);
++ rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rpcrdma_ia_close(&r_xprt->rx_ia);
+
+ xprt_rdma_free_addresses(xprt);
+@@ -391,7 +387,7 @@ out4:
+ xprt_rdma_free_addresses(xprt);
+ rc = -EINVAL;
+ out3:
+- (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
++ rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
+ out2:
+ rpcrdma_ia_close(&new_xprt->rx_ia);
+ out1:
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 0cbc83c..edc951e 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -748,11 +748,8 @@ out1:
+ * Disconnect and destroy endpoint. After this, the only
+ * valid operations on the ep are to free it (if dynamically
+ * allocated) or re-create it.
+- *
+- * The caller's error handling must be sure to not leak the endpoint
+- * if this function fails.
+ */
+-int
++void
+ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ {
+ int rc;
+@@ -782,8 +779,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
+-
+- return rc;
+ }
+
+ /*
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 3f44d6a..362a19d 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -301,7 +301,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *);
+ */
+ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
+ struct rpcrdma_create_data_internal *);
+-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
++void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
+ int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+ int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+
+--
+1.7.1
+
--- /dev/null
+From fc66448549bbb77f2f1a38b270ab2d6b6a22da33 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:25 -0400
+Subject: [PATCH 118/132] xprtrdma: Split the completion queue
+
+The current CQ handler uses the ib_wc.opcode field to distinguish
+between event types. However, the contents of that field are not
+reliable if the completion status is not IB_WC_SUCCESS.
+
+When an error completion occurs on a send event, the CQ handler
+schedules a tasklet with something that is not a struct rpcrdma_rep.
+This is never correct behavior, and sometimes it results in a panic.
+
+To resolve this issue, split the completion queue into a send CQ and
+a receive CQ. The send CQ handler now handles only struct rpcrdma_mw
+wr_id's, and the receive CQ handler now handles only struct
+rpcrdma_rep wr_id's.
+
+Fix suggested by Shirley Ma <shirley.ma@oracle.com>
+
+Reported-by: Rafael Reiter <rafael.reiter@ims.co.at>
+Fixes: 5c635e09cec0feeeb310968e51dad01040244851
+BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Klemens Senn <klemens.senn@ims.co.at>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 228 +++++++++++++++++++++++----------------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
+ 2 files changed, 137 insertions(+), 92 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index edc951e..af2d097 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+ }
+ }
+
+-static inline
+-void rpcrdma_event_process(struct ib_wc *wc)
++static void
++rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+ {
+- struct rpcrdma_mw *frmr;
+- struct rpcrdma_rep *rep =
+- (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
++ struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+
+- dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
+- __func__, rep, wc->status, wc->opcode, wc->byte_len);
++ dprintk("RPC: %s: frmr %p status %X opcode %d\n",
++ __func__, frmr, wc->status, wc->opcode);
+
+- if (!rep) /* send completion that we don't care about */
++ if (wc->wr_id == 0ULL)
+ return;
+-
+- if (IB_WC_SUCCESS != wc->status) {
+- dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
+- __func__, wc->opcode, wc->status);
+- rep->rr_len = ~0U;
+- if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
+- rpcrdma_schedule_tasklet(rep);
++ if (wc->status != IB_WC_SUCCESS)
+ return;
+- }
+
+- switch (wc->opcode) {
+- case IB_WC_FAST_REG_MR:
+- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
++ if (wc->opcode == IB_WC_FAST_REG_MR)
+ frmr->r.frmr.state = FRMR_IS_VALID;
+- break;
+- case IB_WC_LOCAL_INV:
+- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
++ else if (wc->opcode == IB_WC_LOCAL_INV)
+ frmr->r.frmr.state = FRMR_IS_INVALID;
+- break;
+- case IB_WC_RECV:
+- rep->rr_len = wc->byte_len;
+- ib_dma_sync_single_for_cpu(
+- rdmab_to_ia(rep->rr_buffer)->ri_id->device,
+- rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
+- /* Keep (only) the most recent credits, after check validity */
+- if (rep->rr_len >= 16) {
+- struct rpcrdma_msg *p =
+- (struct rpcrdma_msg *) rep->rr_base;
+- unsigned int credits = ntohl(p->rm_credit);
+- if (credits == 0) {
+- dprintk("RPC: %s: server"
+- " dropped credits to 0!\n", __func__);
+- /* don't deadlock */
+- credits = 1;
+- } else if (credits > rep->rr_buffer->rb_max_requests) {
+- dprintk("RPC: %s: server"
+- " over-crediting: %d (%d)\n",
+- __func__, credits,
+- rep->rr_buffer->rb_max_requests);
+- credits = rep->rr_buffer->rb_max_requests;
+- }
+- atomic_set(&rep->rr_buffer->rb_credits, credits);
+- }
+- rpcrdma_schedule_tasklet(rep);
+- break;
+- default:
+- dprintk("RPC: %s: unexpected WC event %X\n",
+- __func__, wc->opcode);
+- break;
+- }
+ }
+
+-static inline int
+-rpcrdma_cq_poll(struct ib_cq *cq)
++static int
++rpcrdma_sendcq_poll(struct ib_cq *cq)
+ {
+ struct ib_wc wc;
+ int rc;
+
+- for (;;) {
+- rc = ib_poll_cq(cq, 1, &wc);
+- if (rc < 0) {
+- dprintk("RPC: %s: ib_poll_cq failed %i\n",
+- __func__, rc);
+- return rc;
+- }
+- if (rc == 0)
+- break;
++ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
++ rpcrdma_sendcq_process_wc(&wc);
++ return rc;
++}
+
+- rpcrdma_event_process(&wc);
++/*
++ * Handle send, fast_reg_mr, and local_inv completions.
++ *
++ * Send events are typically suppressed and thus do not result
++ * in an upcall. Occasionally one is signaled, however. This
++ * prevents the provider's completion queue from wrapping and
++ * losing a completion.
++ */
++static void
++rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
++{
++ int rc;
++
++ rc = rpcrdma_sendcq_poll(cq);
++ if (rc) {
++ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
++ __func__, rc);
++ return;
+ }
+
+- return 0;
++ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
++ if (rc) {
++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
++ __func__, rc);
++ return;
++ }
++
++ rpcrdma_sendcq_poll(cq);
++}
++
++static void
++rpcrdma_recvcq_process_wc(struct ib_wc *wc)
++{
++ struct rpcrdma_rep *rep =
++ (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
++
++ dprintk("RPC: %s: rep %p status %X opcode %X length %u\n",
++ __func__, rep, wc->status, wc->opcode, wc->byte_len);
++
++ if (wc->status != IB_WC_SUCCESS) {
++ rep->rr_len = ~0U;
++ goto out_schedule;
++ }
++ if (wc->opcode != IB_WC_RECV)
++ return;
++
++ rep->rr_len = wc->byte_len;
++ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
++ rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
++
++ if (rep->rr_len >= 16) {
++ struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base;
++ unsigned int credits = ntohl(p->rm_credit);
++
++ if (credits == 0)
++ credits = 1; /* don't deadlock */
++ else if (credits > rep->rr_buffer->rb_max_requests)
++ credits = rep->rr_buffer->rb_max_requests;
++ atomic_set(&rep->rr_buffer->rb_credits, credits);
++ }
++
++out_schedule:
++ rpcrdma_schedule_tasklet(rep);
++}
++
++static int
++rpcrdma_recvcq_poll(struct ib_cq *cq)
++{
++ struct ib_wc wc;
++ int rc;
++
++ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
++ rpcrdma_recvcq_process_wc(&wc);
++ return rc;
+ }
+
+ /*
+- * rpcrdma_cq_event_upcall
++ * Handle receive completions.
+ *
+- * This upcall handles recv and send events.
+ * It is reentrant but processes single events in order to maintain
+ * ordering of receives to keep server credits.
+ *
+@@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq)
+ * connection shutdown. That is, the structures required for
+ * the completion of the reply handler must remain intact until
+ * all memory has been reclaimed.
+- *
+- * Note that send events are suppressed and do not result in an upcall.
+ */
+ static void
+-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
++rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
+ int rc;
+
+- rc = rpcrdma_cq_poll(cq);
+- if (rc)
++ rc = rpcrdma_recvcq_poll(cq);
++ if (rc) {
++ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
++ __func__, rc);
+ return;
++ }
+
+ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ if (rc) {
+- dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+ }
+
+- rpcrdma_cq_poll(cq);
++ rpcrdma_recvcq_poll(cq);
+ }
+
+ #ifdef RPC_DEBUG
+@@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ struct rpcrdma_create_data_internal *cdata)
+ {
+ struct ib_device_attr devattr;
++ struct ib_cq *sendcq, *recvcq;
+ int rc, err;
+
+ rc = ib_query_device(ia->ri_id->device, &devattr);
+@@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ ep->rep_attr.cap.max_recv_sge);
+
+ /* set trigger for requesting send completion */
+- ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
++ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
+ if (ep->rep_cqinit <= 2)
+ ep->rep_cqinit = 0;
+ INIT_CQCOUNT(ep);
+@@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ init_waitqueue_head(&ep->rep_connect_wait);
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+- ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
++ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+ rpcrdma_cq_async_error_upcall, NULL,
+- ep->rep_attr.cap.max_recv_wr +
+ ep->rep_attr.cap.max_send_wr + 1, 0);
+- if (IS_ERR(ep->rep_cq)) {
+- rc = PTR_ERR(ep->rep_cq);
+- dprintk("RPC: %s: ib_create_cq failed: %i\n",
++ if (IS_ERR(sendcq)) {
++ rc = PTR_ERR(sendcq);
++ dprintk("RPC: %s: failed to create send CQ: %i\n",
+ __func__, rc);
+ goto out1;
+ }
+
+- rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
++ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
+ if (rc) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ goto out2;
+ }
+
+- ep->rep_attr.send_cq = ep->rep_cq;
+- ep->rep_attr.recv_cq = ep->rep_cq;
++ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
++ rpcrdma_cq_async_error_upcall, NULL,
++ ep->rep_attr.cap.max_recv_wr + 1, 0);
++ if (IS_ERR(recvcq)) {
++ rc = PTR_ERR(recvcq);
++ dprintk("RPC: %s: failed to create recv CQ: %i\n",
++ __func__, rc);
++ goto out2;
++ }
++
++ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
++ if (rc) {
++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
++ __func__, rc);
++ ib_destroy_cq(recvcq);
++ goto out2;
++ }
++
++ ep->rep_attr.send_cq = sendcq;
++ ep->rep_attr.recv_cq = recvcq;
+
+ /* Initialize cma parameters */
+
+@@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ return 0;
+
+ out2:
+- err = ib_destroy_cq(ep->rep_cq);
++ err = ib_destroy_cq(sendcq);
+ if (err)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, err);
+@@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ ep->rep_pad_mr = NULL;
+ }
+
+- rpcrdma_clean_cq(ep->rep_cq);
+- rc = ib_destroy_cq(ep->rep_cq);
++ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++ rc = ib_destroy_cq(ep->rep_attr.recv_cq);
++ if (rc)
++ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
++ __func__, rc);
++
++ rpcrdma_clean_cq(ep->rep_attr.send_cq);
++ rc = ib_destroy_cq(ep->rep_attr.send_cq);
+ if (rc)
+ dprintk("RPC: %s: ib_destroy_cq returned %i\n",
+ __func__, rc);
+@@ -798,7 +842,9 @@ retry:
+ if (rc && rc != -ENOTCONN)
+ dprintk("RPC: %s: rpcrdma_ep_disconnect"
+ " status %i\n", __func__, rc);
+- rpcrdma_clean_cq(ep->rep_cq);
++
++ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+
+ xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ id = rpcrdma_create_id(xprt, ia,
+@@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ {
+ int rc;
+
+- rpcrdma_clean_cq(ep->rep_cq);
++ rpcrdma_clean_cq(ep->rep_attr.recv_cq);
++ rpcrdma_clean_cq(ep->rep_attr.send_cq);
+ rc = rdma_disconnect(ia->ri_id);
+ if (!rc) {
+ /* returns without wait if not connected */
+@@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
+ ib_dma_sync_single_for_cpu(ia->ri_id->device,
+ rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
+
+- DECR_CQCOUNT(ep);
+ rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+
+ if (rc)
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 362a19d..334ab6e 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -79,7 +79,6 @@ struct rpcrdma_ep {
+ int rep_cqinit;
+ int rep_connected;
+ struct rpcrdma_ia *rep_ia;
+- struct ib_cq *rep_cq;
+ struct ib_qp_init_attr rep_attr;
+ wait_queue_head_t rep_connect_wait;
+ struct ib_sge rep_pad; /* holds zeroed pad */
+--
+1.7.1
+
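
The crash described above follows from one fact: when wc->status is not IB_WC_SUCCESS,
wc->opcode tells you nothing, so a shared handler cannot decide what kind of object
wc->wr_id points to. Giving each work-request type its own CQ removes the need to decide.
The stand-alone C model below illustrates the idea only; send_ctx, recv_ctx and the two
handlers are invented for this sketch and are not the kernel structures.

	#include <stdio.h>

	/* Two unrelated completion contexts, standing in for rpcrdma_mw and rpcrdma_rep */
	struct send_ctx { const char *tag; };
	struct recv_ctx { const char *tag; };

	struct completion {
		unsigned long wr_id;	/* opaque cookie posted with the work request */
		int status;		/* 0 == success */
		int opcode;		/* undefined when status != 0 */
	};

	/* Send-CQ handler: every wr_id seen here is known to be a send_ctx,
	 * so the (possibly garbage) opcode never has to be consulted. */
	static void sendcq_handler(const struct completion *wc)
	{
		struct send_ctx *ctx = (struct send_ctx *)wc->wr_id;

		printf("send completion for %s, status %d\n", ctx->tag, wc->status);
	}

	/* Receive-CQ handler: wr_id is always a recv_ctx. */
	static void recvcq_handler(const struct completion *wc)
	{
		struct recv_ctx *ctx = (struct recv_ctx *)wc->wr_id;

		printf("recv completion for %s, status %d\n", ctx->tag, wc->status);
	}

	int main(void)
	{
		struct send_ctx mw = { "frmr" };
		struct recv_ctx rep = { "reply" };
		struct completion flushed = { (unsigned long)&mw, -1, 0x7f /* junk */ };
		struct completion ok = { (unsigned long)&rep, 0, 1 };

		sendcq_handler(&flushed);	/* safe even though opcode is junk */
		recvcq_handler(&ok);
		return 0;
	}

With a single shared CQ, the only way to make the same decision is the opcode field,
which is exactly what the old rpcrdma_event_process() trusted.
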
--- /dev/null
+From 7f23f6f6e388d2003c4ecf5d558f3c2191e12530 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:34 -0400
+Subject: [PATCH 119/132] xprtrdma: Reduce lock contention in completion handlers
+
+Skip the ib_poll_cq() after re-arming, if the provider knows there
+are no additional items waiting. (Have a look at commit ed23a727 for
+more details).
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 14 ++++++++++----
+ 1 files changed, 10 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index af2d097..c7d5281 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -192,8 +192,11 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+ return;
+ }
+
+- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+- if (rc) {
++ rc = ib_req_notify_cq(cq,
++ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
++ if (rc == 0)
++ return;
++ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+@@ -272,8 +275,11 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ return;
+ }
+
+- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+- if (rc) {
++ rc = ib_req_notify_cq(cq,
++ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
++ if (rc == 0)
++ return;
++ if (rc < 0) {
+ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
+ __func__, rc);
+ return;
+--
+1.7.1
+
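
What makes this safe is the return convention of ib_req_notify_cq() when
IB_CQ_REPORT_MISSED_EVENTS is set: a return of 0 means no completion arrived between the
final poll and the re-arm, a positive return means another poll pass is needed, and a
negative return is an error. Below is a small stand-alone model of that three-way
decision; drain_cq() and rearm_cq() are stubs invented for the sketch, not verbs.

	#include <stdio.h>

	static int queued = 3;		/* completions already on the CQ */
	static int in_flight = 1;	/* one more completes while we drain */

	static void drain_cq(void)
	{
		printf("drained %d completion(s)\n", queued);
		queued = 0;
		if (in_flight) {	/* simulate a completion racing with the drain */
			in_flight = 0;
			queued = 1;
		}
	}

	/* stub for ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) */
	static int rearm_cq(void)
	{
		return queued ? 1 : 0;	/* >0: missed events; 0: armed, nothing missed */
	}

	static void cq_upcall(void)
	{
		int rc;

		drain_cq();
		rc = rearm_cq();
		if (rc == 0)
			return;		/* common case: the extra poll is skipped */
		if (rc < 0)
			return;		/* would be logged with dprintk() in the driver */
		drain_cq();		/* something slipped in; pick it up now */
	}

	int main(void)
	{
		cq_upcall();
		printf("left on queue: %d\n", queued);
		return 0;
	}
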
--- /dev/null
+From 1c00dd0776543608e13c74a527660cb8cd28a74f Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:42 -0400
+Subject: [PATCH 120/132] xprtrdma: Reduce calls to ib_poll_cq() in completion handlers
+
+Change the completion handlers to grab up to 16 items per
+ib_poll_cq() call. No extra ib_poll_cq() is needed if fewer than 16
+items are returned.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 56 ++++++++++++++++++++++++++------------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++
+ 2 files changed, 42 insertions(+), 18 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index c7d5281..b8caee9 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -162,14 +162,23 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+ }
+
+ static int
+-rpcrdma_sendcq_poll(struct ib_cq *cq)
++rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+- struct ib_wc wc;
+- int rc;
++ struct ib_wc *wcs;
++ int count, rc;
+
+- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+- rpcrdma_sendcq_process_wc(&wc);
+- return rc;
++ do {
++ wcs = ep->rep_send_wcs;
++
++ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
++ if (rc <= 0)
++ return rc;
++
++ count = rc;
++ while (count-- > 0)
++ rpcrdma_sendcq_process_wc(wcs++);
++ } while (rc == RPCRDMA_POLLSIZE);
++ return 0;
+ }
+
+ /*
+@@ -183,9 +192,10 @@ rpcrdma_sendcq_poll(struct ib_cq *cq)
+ static void
+ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
++ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+- rc = rpcrdma_sendcq_poll(cq);
++ rc = rpcrdma_sendcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+@@ -202,7 +212,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+ return;
+ }
+
+- rpcrdma_sendcq_poll(cq);
++ rpcrdma_sendcq_poll(cq, ep);
+ }
+
+ static void
+@@ -241,14 +251,23 @@ out_schedule:
+ }
+
+ static int
+-rpcrdma_recvcq_poll(struct ib_cq *cq)
++rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+- struct ib_wc wc;
+- int rc;
++ struct ib_wc *wcs;
++ int count, rc;
+
+- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1)
+- rpcrdma_recvcq_process_wc(&wc);
+- return rc;
++ do {
++ wcs = ep->rep_recv_wcs;
++
++ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
++ if (rc <= 0)
++ return rc;
++
++ count = rc;
++ while (count-- > 0)
++ rpcrdma_recvcq_process_wc(wcs++);
++ } while (rc == RPCRDMA_POLLSIZE);
++ return 0;
+ }
+
+ /*
+@@ -266,9 +285,10 @@ rpcrdma_recvcq_poll(struct ib_cq *cq)
+ static void
+ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ {
++ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
+ int rc;
+
+- rc = rpcrdma_recvcq_poll(cq);
++ rc = rpcrdma_recvcq_poll(cq, ep);
+ if (rc) {
+ dprintk("RPC: %s: ib_poll_cq failed: %i\n",
+ __func__, rc);
+@@ -285,7 +305,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
+ return;
+ }
+
+- rpcrdma_recvcq_poll(cq);
++ rpcrdma_recvcq_poll(cq, ep);
+ }
+
+ #ifdef RPC_DEBUG
+@@ -721,7 +741,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
+
+ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
+- rpcrdma_cq_async_error_upcall, NULL,
++ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_send_wr + 1, 0);
+ if (IS_ERR(sendcq)) {
+ rc = PTR_ERR(sendcq);
+@@ -738,7 +758,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
+ }
+
+ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
+- rpcrdma_cq_async_error_upcall, NULL,
++ rpcrdma_cq_async_error_upcall, ep,
+ ep->rep_attr.cap.max_recv_wr + 1, 0);
+ if (IS_ERR(recvcq)) {
+ rc = PTR_ERR(recvcq);
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 334ab6e..cb4c882 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -74,6 +74,8 @@ struct rpcrdma_ia {
+ * RDMA Endpoint -- one per transport instance
+ */
+
++#define RPCRDMA_POLLSIZE (16)
++
+ struct rpcrdma_ep {
+ atomic_t rep_cqcount;
+ int rep_cqinit;
+@@ -88,6 +90,8 @@ struct rpcrdma_ep {
+ struct rdma_conn_param rep_remote_cma;
+ struct sockaddr_storage rep_remote_addr;
+ struct delayed_work rep_connect_worker;
++ struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
++ struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
+ };
+
+ #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
+--
+1.7.1
+
--- /dev/null
+From 8301a2c047cc25dabd645e5590c1db0ead4c5af4 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:51 -0400
+Subject: [PATCH 121/132] xprtrdma: Limit work done by completion handler
+
+Sagi Grimberg <sagig@dev.mellanox.co.il> points out that a steady
+stream of CQ events could starve other work because of the boundless
+polling loop in rpcrdma_{send,recv}_poll().
+
+Instead of a (potentially infinite) while loop, return after
+collecting a budgeted number of completions.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Acked-by: Sagi Grimberg <sagig@dev.mellanox.co.il>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 10 ++++++----
+ net/sunrpc/xprtrdma/xprt_rdma.h | 1 +
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index b8caee9..1d08366 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -165,8 +165,9 @@ static int
+ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+ struct ib_wc *wcs;
+- int count, rc;
++ int budget, count, rc;
+
++ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_send_wcs;
+
+@@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_sendcq_process_wc(wcs++);
+- } while (rc == RPCRDMA_POLLSIZE);
++ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ return 0;
+ }
+
+@@ -254,8 +255,9 @@ static int
+ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ {
+ struct ib_wc *wcs;
+- int count, rc;
++ int budget, count, rc;
+
++ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
+ do {
+ wcs = ep->rep_recv_wcs;
+
+@@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
+ count = rc;
+ while (count-- > 0)
+ rpcrdma_recvcq_process_wc(wcs++);
+- } while (rc == RPCRDMA_POLLSIZE);
++ } while (rc == RPCRDMA_POLLSIZE && --budget);
+ return 0;
+ }
+
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index cb4c882..0c3b88e 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -74,6 +74,7 @@ struct rpcrdma_ia {
+ * RDMA Endpoint -- one per transport instance
+ */
+
++#define RPCRDMA_WC_BUDGET (128)
+ #define RPCRDMA_POLLSIZE (16)
+
+ struct rpcrdma_ep {
+--
+1.7.1
+
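
Patches 120 and 121 together turn each completion handler into a bounded, batched drain:
up to RPCRDMA_POLLSIZE completions per ib_poll_cq() call, and at most RPCRDMA_WC_BUDGET
completions per upcall. The following stand-alone C model shows that control flow;
poll_cq(), cq_upcall() and the counters are stand-ins for the sketch, not kernel
interfaces.

	#include <stdio.h>

	#define POLLSIZE 16			/* models RPCRDMA_POLLSIZE */
	#define BUDGET   128			/* models RPCRDMA_WC_BUDGET */

	static int pending = 300;		/* pretend 300 completions are queued */

	/* stand-in for ib_poll_cq(): returns up to 'max' completions */
	static int poll_cq(int max)
	{
		int n = pending < max ? pending : max;

		pending -= n;
		return n;
	}

	/* models rpcrdma_{send,recv}cq_poll(): drain in batches, bounded by a budget */
	static void cq_upcall(void)
	{
		int budget = BUDGET / POLLSIZE;
		int handled = 0, rc;

		do {
			rc = poll_cq(POLLSIZE);
			handled += rc;		/* each completion would be processed here */
		} while (rc == POLLSIZE && --budget);

		printf("handled %d this upcall, %d still pending\n", handled, pending);
	}

	int main(void)
	{
		while (pending)
			cq_upcall();	/* a later upcall (or re-arm) picks up the rest */
		return 0;
	}

A full CQ no longer pins one CPU in the handler: after 128 completions the upcall returns,
and whatever remains is collected on the next notification.
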
--- /dev/null
+From 65866f8259851cea5e356d2fd46fc37a4e26330e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:33:59 -0400
+Subject: [PATCH 122/132] xprtrdma: Reduce the number of hardway buffer allocations
+
+While marshaling an RPC/RDMA request, the inline_{rsize,wsize}
+settings determine whether an inline request is used, or whether
+read or write chunk lists are built. The current default value of
+these settings is 1024. Any RPC request smaller than 1024 bytes is
+sent to the NFS server completely inline.
+
+rpcrdma_buffer_create() allocates and pre-registers a set of RPC
+buffers for each transport instance, also based on the inline rsize
+and wsize settings.
+
+RPC/RDMA requests and replies are built in these buffers. However,
+if an RPC/RDMA request is expected to be larger than 1024, a buffer
+has to be allocated and registered for that RPC, and deregistered
+and released when the RPC is complete. This is known as a
+"hardway allocation."
+
+Since the introduction of NFSv4, the size of RPC requests has become
+larger, and hardway allocations are thus more frequent. Hardway
+allocations are significant overhead, and they waste the existing
+RPC buffers pre-allocated by rpcrdma_buffer_create().
+
+We'd like fewer hardway allocations.
+
+Increasing the size of the pre-registered buffers is the most direct
+way to do this. However, a blanket increase of the inline thresholds
+has interoperability consequences.
+
+On my 64-bit system, rpcrdma_buffer_create() requests roughly 7000
+bytes for each RPC request buffer, using kmalloc(). Due to internal
+fragmentation, this wastes nearly 1200 bytes because kmalloc()
+already returns an 8192-byte piece of memory for a 7000-byte
+allocation request, though the extra space remains unused.
+
+So let's round up the size of the pre-allocated buffers, and make
+use of the unused space in the kmalloc'd memory.
+
+This change reduces the amount of hardway allocated memory for an
+NFSv4 general connectathon run from 1322092 to 9472 bytes (99%).
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 25 +++++++++++++------------
+ 1 files changed, 13 insertions(+), 12 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 1d08366..c80995a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -50,6 +50,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/pci.h> /* for Tavor hack below */
+ #include <linux/slab.h>
++#include <asm/bitops.h>
+
+ #include "xprt_rdma.h"
+
+@@ -1005,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
+ {
+ char *p;
+- size_t len;
++ size_t len, rlen, wlen;
+ int i, rc;
+ struct rpcrdma_mw *r;
+
+@@ -1120,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ * Allocate/init the request/reply buffers. Doing this
+ * using kmalloc for now -- one for each buf.
+ */
++ wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req));
++ rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep));
++ dprintk("RPC: %s: wlen = %zu, rlen = %zu\n",
++ __func__, wlen, rlen);
++
+ for (i = 0; i < buf->rb_max_requests; i++) {
+ struct rpcrdma_req *req;
+ struct rpcrdma_rep *rep;
+
+- len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
+- /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
+- /* Typical ~2400b, so rounding up saves work later */
+- if (len < 4096)
+- len = 4096;
+- req = kmalloc(len, GFP_KERNEL);
++ req = kmalloc(wlen, GFP_KERNEL);
+ if (req == NULL) {
+ dprintk("RPC: %s: request buffer %d alloc"
+ " failed\n", __func__, i);
+@@ -1141,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ buf->rb_send_bufs[i]->rl_buffer = buf;
+
+ rc = rpcrdma_register_internal(ia, req->rl_base,
+- len - offsetof(struct rpcrdma_req, rl_base),
++ wlen - offsetof(struct rpcrdma_req, rl_base),
+ &buf->rb_send_bufs[i]->rl_handle,
+ &buf->rb_send_bufs[i]->rl_iov);
+ if (rc)
+ goto out;
+
+- buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
++ buf->rb_send_bufs[i]->rl_size = wlen -
++ sizeof(struct rpcrdma_req);
+
+- len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
+- rep = kmalloc(len, GFP_KERNEL);
++ rep = kmalloc(rlen, GFP_KERNEL);
+ if (rep == NULL) {
+ dprintk("RPC: %s: reply buffer %d alloc failed\n",
+ __func__, i);
+@@ -1162,7 +1163,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
+ buf->rb_recv_bufs[i]->rr_buffer = buf;
+
+ rc = rpcrdma_register_internal(ia, rep->rr_base,
+- len - offsetof(struct rpcrdma_rep, rr_base),
++ rlen - offsetof(struct rpcrdma_rep, rr_base),
+ &buf->rb_recv_bufs[i]->rr_handle,
+ &buf->rb_recv_bufs[i]->rr_iov);
+ if (rc)
+--
+1.7.1
+
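
The effect of the 1 << fls() rounding above is easy to check with a small user-space
program. The 7000-byte figure comes from the commit message (sizeof(struct rpcrdma_req)
on any given build will differ), and this fls() is a local stand-in for the kernel helper.

	#include <stdio.h>

	/* stand-in for the kernel's fls(): 1-based index of the highest set bit */
	static int fls(unsigned int x)
	{
		int r = 0;

		while (x) {
			r++;
			x >>= 1;
		}
		return r;
	}

	int main(void)
	{
		unsigned int req = 7000;	/* ~ inline_wsize + sizeof(struct rpcrdma_req) */
		unsigned int wlen = 1u << fls(req);

		/* 7000 rounds up to 8192, the same slab object that kmalloc(7000)
		 * returns anyway, so the trailing ~1200 bytes become usable. */
		printf("request %u -> buffer %u\n", req, wlen);
		return 0;
	}
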
--- /dev/null
+From ec62f40d3505a643497d105c297093bb90afd44e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:07 -0400
+Subject: [PATCH 123/132] xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
+
+Devesh Sharma <Devesh.Sharma@Emulex.Com> reports that after a
+disconnect, his HCA is failing to create a fresh QP, leaving
+ia->ri_id->qp set to NULL. But xprtrdma still allows RPCs to
+wake up and post LOCAL_INV as they exit, causing an oops.
+
+rpcrdma_ep_connect() is allowing the wake-up by leaking the QP
+creation error code (-EPERM in this case) to the RPC client's
+generic layer. xprt_connect_status() does not recognize -EPERM, so
+it kills pending RPC tasks immediately rather than retrying the
+connect.
+
+Re-arrange the QP creation logic so that when it fails on reconnect,
+it leaves ->qp with the old QP rather than NULL. If pending RPC
+tasks wake and exit, LOCAL_INV work requests will flush rather than
+oops.
+
+On initial connect, leaving ->qp == NULL is OK, since there are no
+pending RPCs that might use ->qp. But be sure not to try to destroy
+a NULL QP when rpcrdma_ep_connect() is retried.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 29 ++++++++++++++++++++---------
+ 1 files changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index c80995a..54edf2a 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -867,6 +867,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+ if (ep->rep_connected != 0) {
+ struct rpcrdma_xprt *xprt;
+ retry:
++ dprintk("RPC: %s: reconnecting...\n", __func__);
+ rc = rpcrdma_ep_disconnect(ep, ia);
+ if (rc && rc != -ENOTCONN)
+ dprintk("RPC: %s: rpcrdma_ep_disconnect"
+@@ -879,7 +880,7 @@ retry:
+ id = rpcrdma_create_id(xprt, ia,
+ (struct sockaddr *)&xprt->rx_data.addr);
+ if (IS_ERR(id)) {
+- rc = PTR_ERR(id);
++ rc = -EHOSTUNREACH;
+ goto out;
+ }
+ /* TEMP TEMP TEMP - fail if new device:
+@@ -893,20 +894,30 @@ retry:
+ printk("RPC: %s: can't reconnect on "
+ "different device!\n", __func__);
+ rdma_destroy_id(id);
+- rc = -ENETDOWN;
++ rc = -ENETUNREACH;
+ goto out;
+ }
+ /* END TEMP */
++ rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
++ if (rc) {
++ dprintk("RPC: %s: rdma_create_qp failed %i\n",
++ __func__, rc);
++ rdma_destroy_id(id);
++ rc = -ENETUNREACH;
++ goto out;
++ }
+ rdma_destroy_qp(ia->ri_id);
+ rdma_destroy_id(ia->ri_id);
+ ia->ri_id = id;
+- }
+-
+- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
+- if (rc) {
+- dprintk("RPC: %s: rdma_create_qp failed %i\n",
+- __func__, rc);
+- goto out;
++ } else {
++ dprintk("RPC: %s: connecting...\n", __func__);
++ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
++ if (rc) {
++ dprintk("RPC: %s: rdma_create_qp failed %i\n",
++ __func__, rc);
++ /* do not update ep->rep_connected */
++ return -ENETUNREACH;
++ }
+ }
+
+ /* XXX Tavor device performs badly with 2K MTU! */
+--
+1.7.1
+
--- /dev/null
+From 5bc4bc729275a0bfc2bfd04466e8ab7c85af2f6e Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:16 -0400
+Subject: [PATCH 124/132] xprtrdma: Remove Tavor MTU setting
+
+Clean up. Remove HCA-specific clutter in xprtrdma, which is
+supposed to be device-independent.
+
+Hal Rosenstock <hal@dev.mellanox.co.il> observes:
+> Note that there is OpenSM option (enable_quirks) to return 1K MTU
+> in SA PathRecord responses for Tavor so that can be used for this.
+> The default setting for enable_quirks is FALSE so that would need
+> changing.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/verbs.c | 14 --------------
+ 1 files changed, 0 insertions(+), 14 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 54edf2a..515dfc1 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -48,7 +48,6 @@
+ */
+
+ #include <linux/interrupt.h>
+-#include <linux/pci.h> /* for Tavor hack below */
+ #include <linux/slab.h>
+ #include <asm/bitops.h>
+
+@@ -920,19 +919,6 @@ retry:
+ }
+ }
+
+-/* XXX Tavor device performs badly with 2K MTU! */
+-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
+- struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
+- if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
+- (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
+- pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
+- struct ib_qp_attr attr = {
+- .path_mtu = IB_MTU_1024
+- };
+- rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
+- }
+-}
+-
+ ep->rep_connected = 0;
+
+ rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+--
+1.7.1
+
--- /dev/null
+From 196c69989d84ab902bbe545f7bd8ce78ee74dac4 Mon Sep 17 00:00:00 2001
+From: Shirley Ma <shirley.ma@oracle.com>
+Date: Wed, 28 May 2014 10:34:24 -0400
+Subject: [PATCH 125/132] xprtrdma: Allocate missing pagelist
+
+GETACL relies on the transport layer to allocate memory for the reply
+buffer. However, xprtrdma assumes that the reply buffer (pagelist) has
+been pre-allocated by the upper layer. This problem was reported by the
+IOL OFA lab test on PPC.
+
+Signed-off-by: Shirley Ma <shirley.ma@oracle.com>
+Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
+Tested-by: Edward Mossman <emossman@iol.unh.edu>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++
+ 1 files changed, 6 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 436d229..dc4a826 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -99,6 +99,12 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+ page_base = xdrbuf->page_base & ~PAGE_MASK;
+ p = 0;
+ while (len && n < nsegs) {
++ if (!ppages[p]) {
++ /* alloc the pagelist for receiving buffer */
++ ppages[p] = alloc_page(GFP_ATOMIC);
++ if (!ppages[p])
++ return 0;
++ }
+ seg[n].mr_page = ppages[p];
+ seg[n].mr_offset = (void *)(unsigned long) page_base;
+ seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+--
+1.7.1
+
--- /dev/null
+From bfaee096deaa680195df5491eb650f81051c145d Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:32 -0400
+Subject: [PATCH 126/132] xprtrdma: Use macros for reconnection timeout constants
+
+Clean up: Ensure the same max and min constant values are used
+everywhere when setting reconnect timeouts.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c | 19 ++++++++++++-------
+ 1 files changed, 12 insertions(+), 7 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index d18b2a3..6b84d7d 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = {
+
+ #endif
+
++#define RPCRDMA_BIND_TO (60U * HZ)
++#define RPCRDMA_INIT_REEST_TO (5U * HZ)
++#define RPCRDMA_MAX_REEST_TO (30U * HZ)
++#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
++
+ static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
+
+ static void
+@@ -285,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args)
+
+ /* 60 second timeout, no retries */
+ xprt->timeout = &xprt_rdma_default_timeout;
+- xprt->bind_timeout = (60U * HZ);
+- xprt->reestablish_timeout = (5U * HZ);
+- xprt->idle_timeout = (5U * 60 * HZ);
++ xprt->bind_timeout = RPCRDMA_BIND_TO;
++ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
++ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+ xprt->resvport = 0; /* privileged port not needed */
+ xprt->tsh_size = 0; /* RPC-RDMA handles framing */
+@@ -432,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+ schedule_delayed_work(&r_xprt->rdma_connect,
+ xprt->reestablish_timeout);
+ xprt->reestablish_timeout <<= 1;
+- if (xprt->reestablish_timeout > (30 * HZ))
+- xprt->reestablish_timeout = (30 * HZ);
+- else if (xprt->reestablish_timeout < (5 * HZ))
+- xprt->reestablish_timeout = (5 * HZ);
++ if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
++ xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
++ else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
++ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ } else {
+ schedule_delayed_work(&r_xprt->rdma_connect, 0);
+ if (!RPC_IS_ASYNC(task))
+--
+1.7.1
+
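
The macros above feed a simple doubling backoff in xprt_rdma_connect(). A compact
user-space model of that arithmetic follows; HZ is taken as 1000 purely so the printout
reads in milliseconds.

	#include <stdio.h>

	#define HZ			1000UL		/* demo value only */
	#define RPCRDMA_INIT_REEST_TO	(5UL * HZ)
	#define RPCRDMA_MAX_REEST_TO	(30UL * HZ)

	int main(void)
	{
		unsigned long to = RPCRDMA_INIT_REEST_TO;
		int i;

		for (i = 0; i < 5; i++) {
			printf("reconnect attempt %d: wait %lu ms\n", i + 1, to);
			to <<= 1;				/* double after every attempt */
			if (to > RPCRDMA_MAX_REEST_TO)
				to = RPCRDMA_MAX_REEST_TO;	/* clamp at 30 seconds */
			else if (to < RPCRDMA_INIT_REEST_TO)
				to = RPCRDMA_INIT_REEST_TO;
		}
		return 0;
	}

The wait therefore climbs 5, 10, 20, 30 seconds and then stays at 30 until patch 127
resets it after a successful reply.
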
--- /dev/null
+From 18906972aa1103c07869c9b43860a52e0e27e8e5 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:41 -0400
+Subject: [PATCH 127/132] xprtrdma: Reset connection timeout after successful reconnect
+
+If the new connection is able to make forward progress, reset the
+re-establish timeout. Otherwise it keeps growing even if disconnect
+events are rare.
+
+The same behavior as TCP is adopted: reconnect immediately if the
+transport instance has been able to make some forward progress.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 1 +
+ 1 files changed, 1 insertions(+), 0 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index dc4a826..ac65b0c 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -770,6 +770,7 @@ repost:
+
+ /* from here on, the reply is no longer an orphan */
+ req->rl_reply = rep;
++ xprt->reestablish_timeout = 0;
+
+ /* check for expected message types */
+ /* The order of some of these tests is important. */
+--
+1.7.1
+
--- /dev/null
+From e7ce710a8802351bd4118c5d6136c1d850f67cf9 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:34:57 -0400
+Subject: [PATCH 128/132] xprtrdma: Avoid deadlock when credit window is reset
+
+Update the cwnd while processing the server's reply. Otherwise the
+next task on the xprt_sending queue is still subject to the old
+credit window. Currently, no task is awoken if the old congestion
+window is still exceeded, even if the new window is larger, and a
+deadlock results.
+
+This is an issue during a transport reconnect. Servers don't
+normally shrink the credit window, but the client does reset it to
+1 when reconnecting so the server can safely grow it again.
+
+As a minor optimization, remove the hack of grabbing the initial
+cwnd size (which happens to be RPC_CWNDSCALE) and using that value
+as the congestion scaling factor. The scaling value is invariant,
+and we are better off without the multiplication operation.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++
+ net/sunrpc/xprtrdma/transport.c | 19 +------------------
+ net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
+ 3 files changed, 7 insertions(+), 19 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index ac65b0c..77b84cf 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -716,6 +716,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ __be32 *iptr;
+ int rdmalen, status;
++ unsigned long cwnd;
+
+ /* Check status. If bad, signal disconnect and return rep to pool */
+ if (rep->rr_len == ~0U) {
+@@ -845,6 +846,11 @@ badheader:
+ break;
+ }
+
++ cwnd = xprt->cwnd;
++ xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
++ if (xprt->cwnd > cwnd)
++ xprt_release_rqst_cong(rqst->rq_task);
++
+ dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
+ __func__, xprt, rqst, status);
+ xprt_complete_rqst(rqst->rq_task, status);
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 6b84d7d..187894b 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -448,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
+ }
+ }
+
+-static int
+-xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
+-{
+- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+- int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
+-
+- /* == RPC_CWNDSCALE @ init, but *after* setup */
+- if (r_xprt->rx_buf.rb_cwndscale == 0UL) {
+- r_xprt->rx_buf.rb_cwndscale = xprt->cwnd;
+- dprintk("RPC: %s: cwndscale %lu\n", __func__,
+- r_xprt->rx_buf.rb_cwndscale);
+- BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
+- }
+- xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
+- return xprt_reserve_xprt_cong(xprt, task);
+-}
+-
+ /*
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
+@@ -686,7 +669,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+ */
+
+ static struct rpc_xprt_ops xprt_rdma_procs = {
+- .reserve_xprt = xprt_rdma_reserve_xprt,
++ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
+ .alloc_slot = xprt_alloc_slot,
+ .release_request = xprt_release_rqst_cong, /* ditto */
+diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
+index 0c3b88e..89e7cd4 100644
+--- a/net/sunrpc/xprtrdma/xprt_rdma.h
++++ b/net/sunrpc/xprtrdma/xprt_rdma.h
+@@ -212,7 +212,6 @@ struct rpcrdma_req {
+ struct rpcrdma_buffer {
+ spinlock_t rb_lock; /* protects indexes */
+ atomic_t rb_credits; /* most recent server credits */
+- unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
+ int rb_max_requests;/* client max requests */
+ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
+ int rb_send_index;
+--
+1.7.1
+
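
The cwnd handling that this patch moves into rpcrdma_reply_handler() is a shift and a
compare: the server's credit grant becomes the congestion window directly, and waiters are
released only when the window actually grew. A stand-alone model is below; RPC_CWNDSHIFT
is assumed here to be 8 (its value in the sunrpc headers), update_cwnd() is a stand-in,
and the printf stands in for xprt_release_rqst_cong().

	#include <stdio.h>

	#define RPC_CWNDSHIFT 8UL	/* assumed; see include/linux/sunrpc/xprt.h */

	static unsigned long cwnd = 1UL << RPC_CWNDSHIFT;	/* 1 credit after reconnect */

	/* models the tail of rpcrdma_reply_handler() */
	static void update_cwnd(unsigned long server_credits)
	{
		unsigned long old = cwnd;

		cwnd = server_credits << RPC_CWNDSHIFT;
		if (cwnd > old)
			printf("window grew (%lu -> %lu): wake the next sender\n", old, cwnd);
		else
			printf("window unchanged or smaller (%lu -> %lu)\n", old, cwnd);
	}

	int main(void)
	{
		update_cwnd(32);	/* server re-grants credits after reconnect */
		update_cwnd(32);	/* steady state: no extra wake-ups */
		return 0;
	}

Because the update happens while the reply is processed, the task at the head of the
xprt_sending queue sees the enlarged window immediately instead of deadlocking behind the
reconnect-time value of 1.
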
--- /dev/null
+From c977dea22708688eae31774f70126c97aa4dfe83 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:35:06 -0400
+Subject: [PATCH 129/132] xprtrdma: Remove BUG_ON() call sites
+
+If an error occurs in the marshaling logic, fail the RPC request
+being processed, but leave the client running.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/transport.c | 3 ++-
+ net/sunrpc/xprtrdma/verbs.c | 18 ++++++++++--------
+ 2 files changed, 12 insertions(+), 9 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 187894b..93fe775 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -463,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
+ struct rpcrdma_req *req, *nreq;
+
+ req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
+- BUG_ON(NULL == req);
++ if (req == NULL)
++ return NULL;
+
+ if (size > req->rl_size) {
+ dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
+diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
+index 515dfc1..13dbd1c 100644
+--- a/net/sunrpc/xprtrdma/verbs.c
++++ b/net/sunrpc/xprtrdma/verbs.c
+@@ -1302,7 +1302,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
+ int i;
+ unsigned long flags;
+
+- BUG_ON(req->rl_nchunks != 0);
+ spin_lock_irqsave(&buffers->rb_lock, flags);
+ buffers->rb_send_bufs[--buffers->rb_send_index] = req;
+ req->rl_niovs = 0;
+@@ -1535,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+ } else
+ post_wr = &frmr_wr;
+
+- /* Bump the key */
+- key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
+- ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
+-
+ /* Prepare FRMR WR */
+ memset(&frmr_wr, 0, sizeof frmr_wr);
+ frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
+@@ -1549,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
+ frmr_wr.wr.fast_reg.page_list_len = page_no;
+ frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT;
+- BUG_ON(frmr_wr.wr.fast_reg.length < len);
++ if (frmr_wr.wr.fast_reg.length < len) {
++ while (seg1->mr_nsegs--)
++ rpcrdma_unmap_one(ia, seg++);
++ return -EIO;
++ }
++
++ /* Bump the key */
++ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
++ ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
++
+ frmr_wr.wr.fast_reg.access_flags = (writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ);
+@@ -1709,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
+
+ #if RPCRDMA_PERSISTENT_REGISTRATION
+ case RPCRDMA_ALLPHYSICAL:
+- BUG_ON(nsegs != 1);
+ rpcrdma_unmap_one(ia, seg);
+- rc = 0;
+ break;
+ #endif
+
+--
+1.7.1
+
--- /dev/null
+From c93c62231cf55df4a26bd08937efeea97e6fc5e8 Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Wed, 28 May 2014 10:35:14 -0400
+Subject: [PATCH 130/132] xprtrdma: Disconnect on registration failure
+
+If rpcrdma_register_external() fails during request marshaling, the
+current RPC request is killed. Instead, this RPC should be retried
+after reconnecting the transport instance.
+
+The most likely reason for registration failure with FRMR is a
+failed post_send, which would be due to a remote transport
+disconnect or memory exhaustion. These issues can be recovered
+by a retry.
+
+Problems encountered in the marshaling logic itself will not be
+corrected by trying again, so these should still kill a request.
+
+Now that we've added a clean exit for marshaling errors, take the
+opportunity to defang some BUG_ON's.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+---
+ net/sunrpc/xprtrdma/rpc_rdma.c | 48 +++++++++++++++++++++++++-------------
+ net/sunrpc/xprtrdma/transport.c | 17 +++++++++-----
+ 2 files changed, 42 insertions(+), 23 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
+index 77b84cf..693966d 100644
+--- a/net/sunrpc/xprtrdma/rpc_rdma.c
++++ b/net/sunrpc/xprtrdma/rpc_rdma.c
+@@ -77,6 +77,8 @@ static const char transfertypes[][12] = {
+ * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
+ * elements. Segments are then coalesced when registered, if possible
+ * within the selected memreg mode.
++ *
++ * Returns positive number of segments converted, or a negative errno.
+ */
+
+ static int
+@@ -103,12 +105,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+ /* alloc the pagelist for receiving buffer */
+ ppages[p] = alloc_page(GFP_ATOMIC);
+ if (!ppages[p])
+- return 0;
++ return -ENOMEM;
+ }
+ seg[n].mr_page = ppages[p];
+ seg[n].mr_offset = (void *)(unsigned long) page_base;
+ seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+- BUG_ON(seg[n].mr_len > PAGE_SIZE);
++ if (seg[n].mr_len > PAGE_SIZE)
++ return -EIO;
+ len -= seg[n].mr_len;
+ ++n;
+ ++p;
+@@ -117,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+
+ /* Message overflows the seg array */
+ if (len && n == nsegs)
+- return 0;
++ return -EIO;
+
+ if (xdrbuf->tail[0].iov_len) {
+ /* the rpcrdma protocol allows us to omit any trailing
+@@ -126,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+ return n;
+ if (n == nsegs)
+ /* Tail remains, but we're out of segments */
+- return 0;
++ return -EIO;
+ seg[n].mr_page = NULL;
+ seg[n].mr_offset = xdrbuf->tail[0].iov_base;
+ seg[n].mr_len = xdrbuf->tail[0].iov_len;
+@@ -167,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
+ * Reply chunk (a counted array):
+ * N elements:
+ * 1 - N - HLOO - HLOO - ... - HLOO
++ *
++ * Returns positive RPC/RDMA header size, or negative errno.
+ */
+
+-static unsigned int
++static ssize_t
+ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
+ {
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+- int nsegs, nchunks = 0;
++ int n, nsegs, nchunks = 0;
+ unsigned int pos;
+ struct rpcrdma_mr_seg *seg = req->rl_segments;
+ struct rpcrdma_read_chunk *cur_rchunk = NULL;
+@@ -201,11 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
+ pos = target->head[0].iov_len;
+
+ nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
+- if (nsegs == 0)
+- return 0;
++ if (nsegs < 0)
++ return nsegs;
+
+ do {
+- int n = rpcrdma_register_external(seg, nsegs,
++ n = rpcrdma_register_external(seg, nsegs,
+ cur_wchunk != NULL, r_xprt);
+ if (n <= 0)
+ goto out;
+@@ -277,7 +282,7 @@ out:
+ for (pos = 0; nchunks--;)
+ pos += rpcrdma_deregister_external(
+ &req->rl_segments[pos], r_xprt);
+- return 0;
++ return n;
+ }
+
+ /*
+@@ -359,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+ * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
+ * [2] -- optional padding.
+ * [3] -- if padded, header only in [1] and data here.
++ *
++ * Returns zero on success, otherwise a negative errno.
+ */
+
+ int
+@@ -368,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ char *base;
+- size_t hdrlen, rpclen, padlen;
++ size_t rpclen, padlen;
++ ssize_t hdrlen;
+ enum rpcrdma_chunktype rtype, wtype;
+ struct rpcrdma_msg *headerp;
+
+@@ -439,7 +447,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ /* The following simplification is not true forever */
+ if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
+ wtype = rpcrdma_noch;
+- BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch);
++ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) {
++ dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
++ __func__);
++ return -EIO;
++ }
+
+ hdrlen = 28; /*sizeof *headerp;*/
+ padlen = 0;
+@@ -464,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
+ headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
+ hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
+- BUG_ON(wtype != rpcrdma_noch);
+-
++ if (wtype != rpcrdma_noch) {
++ dprintk("RPC: %s: invalid chunk list\n",
++ __func__);
++ return -EIO;
++ }
+ } else {
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+@@ -500,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
+ hdrlen = rpcrdma_create_chunks(rqst,
+ &rqst->rq_rcv_buf, headerp, wtype);
+ }
+-
+- if (hdrlen == 0)
+- return -1;
++ if (hdrlen < 0)
++ return hdrlen;
+
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ " headerp 0x%p base 0x%p lkey 0x%x\n",
+diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
+index 93fe775..66f91f0 100644
+--- a/net/sunrpc/xprtrdma/transport.c
++++ b/net/sunrpc/xprtrdma/transport.c
+@@ -595,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task)
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
++ int rc;
+
+- /* marshal the send itself */
+- if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) {
+- r_xprt->rx_stats.failed_marshal_count++;
+- dprintk("RPC: %s: rpcrdma_marshal_req failed\n",
+- __func__);
+- return -EIO;
++ if (req->rl_niovs == 0) {
++ rc = rpcrdma_marshal_req(rqst);
++ if (rc < 0)
++ goto failed_marshal;
+ }
+
+ if (req->rl_reply == NULL) /* e.g. reconnection */
+@@ -625,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task)
+ rqst->rq_bytes_sent = 0;
+ return 0;
+
++failed_marshal:
++ r_xprt->rx_stats.failed_marshal_count++;
++ dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
++ __func__, rc);
++ if (rc == -EIO)
++ return -EIO;
+ drop_connection:
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN; /* implies disconnect */
+--
+1.7.1
+
--- /dev/null
+From 0bf4828983dff062cd502f27ab8644b32774e72e Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Wed, 28 May 2014 15:12:01 -0500
+Subject: [PATCH 131/132] svcrdma: refactor marshalling logic
+
+This patch refactors the NFSRDMA server marshalling logic to
+remove the intermediary map structures. It also fixes an existing bug
+where the NFSRDMA server was not minding the device fast register page
+list length limitations.
+
+Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 643 +++++++++++++-----------------
+ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 230 +----------
+ net/sunrpc/xprtrdma/svc_rdma_transport.c | 62 ++--
+ 3 files changed, 331 insertions(+), 604 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 8d904e4..52d9f2c 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+@@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+
+ /* Set up the XDR head */
+ rqstp->rq_arg.head[0].iov_base = page_address(page);
+- rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length);
++ rqstp->rq_arg.head[0].iov_len =
++ min_t(size_t, byte_count, ctxt->sge[0].length);
+ rqstp->rq_arg.len = byte_count;
+ rqstp->rq_arg.buflen = byte_count;
+
+@@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ page = ctxt->pages[sge_no];
+ put_page(rqstp->rq_pages[sge_no]);
+ rqstp->rq_pages[sge_no] = page;
+- bc -= min(bc, ctxt->sge[sge_no].length);
++ bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+ rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
+ sge_no++;
+ }
+@@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
+ rqstp->rq_arg.tail[0].iov_len = 0;
+ }
+
+-/* Encode a read-chunk-list as an array of IB SGE
+- *
+- * Assumptions:
+- * - chunk[0]->position points to pages[0] at an offset of 0
+- * - pages[] is not physically or virtually contiguous and consists of
+- * PAGE_SIZE elements.
+- *
+- * Output:
+- * - sge array pointing into pages[] array.
+- * - chunk_sge array specifying sge index and count for each
+- * chunk in the read list
+- *
+- */
+-static int map_read_chunks(struct svcxprt_rdma *xprt,
+- struct svc_rqst *rqstp,
+- struct svc_rdma_op_ctxt *head,
+- struct rpcrdma_msg *rmsgp,
+- struct svc_rdma_req_map *rpl_map,
+- struct svc_rdma_req_map *chl_map,
+- int ch_count,
+- int byte_count)
++static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+ {
+- int sge_no;
+- int sge_bytes;
+- int page_off;
+- int page_no;
+- int ch_bytes;
+- int ch_no;
+- struct rpcrdma_read_chunk *ch;
++ if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
++ RDMA_TRANSPORT_IWARP)
++ return 1;
++ else
++ return min_t(int, sge_count, xprt->sc_max_sge);
++}
+
+- sge_no = 0;
+- page_no = 0;
+- page_off = 0;
+- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+- ch_no = 0;
+- ch_bytes = ntohl(ch->rc_target.rs_length);
+- head->arg.head[0] = rqstp->rq_arg.head[0];
+- head->arg.tail[0] = rqstp->rq_arg.tail[0];
+- head->arg.pages = &head->pages[head->count];
+- head->hdr_count = head->count; /* save count of hdr pages */
+- head->arg.page_base = 0;
+- head->arg.page_len = ch_bytes;
+- head->arg.len = rqstp->rq_arg.len + ch_bytes;
+- head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes;
+- head->count++;
+- chl_map->ch[0].start = 0;
+- while (byte_count) {
+- rpl_map->sge[sge_no].iov_base =
+- page_address(rqstp->rq_arg.pages[page_no]) + page_off;
+- sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes);
+- rpl_map->sge[sge_no].iov_len = sge_bytes;
+- /*
+- * Don't bump head->count here because the same page
+- * may be used by multiple SGE.
+- */
+- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
++typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
++ struct svc_rqst *rqstp,
++ struct svc_rdma_op_ctxt *head,
++ int *page_no,
++ u32 *page_offset,
++ u32 rs_handle,
++ u32 rs_length,
++ u64 rs_offset,
++ int last);
++
++/* Issue an RDMA_READ using the local lkey to map the data sink */
++static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
++ struct svc_rqst *rqstp,
++ struct svc_rdma_op_ctxt *head,
++ int *page_no,
++ u32 *page_offset,
++ u32 rs_handle,
++ u32 rs_length,
++ u64 rs_offset,
++ int last)
++{
++ struct ib_send_wr read_wr;
++ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
++ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
++ int ret, read, pno;
++ u32 pg_off = *page_offset;
++ u32 pg_no = *page_no;
++
++ ctxt->direction = DMA_FROM_DEVICE;
++ ctxt->read_hdr = head;
++ pages_needed =
++ min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
++ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
++
++ for (pno = 0; pno < pages_needed; pno++) {
++ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
++
++ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
++ head->arg.page_len += len;
++ head->arg.len += len;
++ if (!pg_off)
++ head->count++;
++ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++ ctxt->sge[pno].addr =
++ ib_dma_map_page(xprt->sc_cm_id->device,
++ head->arg.pages[pg_no], pg_off,
++ PAGE_SIZE - pg_off,
++ DMA_FROM_DEVICE);
++ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
++ ctxt->sge[pno].addr);
++ if (ret)
++ goto err;
++ atomic_inc(&xprt->sc_dma_used);
+
+- byte_count -= sge_bytes;
+- ch_bytes -= sge_bytes;
+- sge_no++;
+- /*
+- * If all bytes for this chunk have been mapped to an
+- * SGE, move to the next SGE
+- */
+- if (ch_bytes == 0) {
+- chl_map->ch[ch_no].count =
+- sge_no - chl_map->ch[ch_no].start;
+- ch_no++;
+- ch++;
+- chl_map->ch[ch_no].start = sge_no;
+- ch_bytes = ntohl(ch->rc_target.rs_length);
+- /* If bytes remaining account for next chunk */
+- if (byte_count) {
+- head->arg.page_len += ch_bytes;
+- head->arg.len += ch_bytes;
+- head->arg.buflen += ch_bytes;
+- }
++ /* The lkey here is either a local dma lkey or a dma_mr lkey */
++ ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
++ ctxt->sge[pno].length = len;
++ ctxt->count++;
++
++ /* adjust offset and wrap to next page if needed */
++ pg_off += len;
++ if (pg_off == PAGE_SIZE) {
++ pg_off = 0;
++ pg_no++;
+ }
+- /*
+- * If this SGE consumed all of the page, move to the
+- * next page
+- */
+- if ((sge_bytes + page_off) == PAGE_SIZE) {
+- page_no++;
+- page_off = 0;
+- /*
+- * If there are still bytes left to map, bump
+- * the page count
+- */
+- if (byte_count)
+- head->count++;
+- } else
+- page_off += sge_bytes;
++ rs_length -= len;
+ }
+- BUG_ON(byte_count != 0);
+- return sge_no;
++
++ if (last && rs_length == 0)
++ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++ else
++ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++
++ memset(&read_wr, 0, sizeof(read_wr));
++ read_wr.wr_id = (unsigned long)ctxt;
++ read_wr.opcode = IB_WR_RDMA_READ;
++ ctxt->wr_op = read_wr.opcode;
++ read_wr.send_flags = IB_SEND_SIGNALED;
++ read_wr.wr.rdma.rkey = rs_handle;
++ read_wr.wr.rdma.remote_addr = rs_offset;
++ read_wr.sg_list = ctxt->sge;
++ read_wr.num_sge = pages_needed;
++
++ ret = svc_rdma_send(xprt, &read_wr);
++ if (ret) {
++ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++ goto err;
++ }
++
++ /* return current location in page array */
++ *page_no = pg_no;
++ *page_offset = pg_off;
++ ret = read;
++ atomic_inc(&rdma_stat_read);
++ return ret;
++ err:
++ svc_rdma_unmap_dma(ctxt);
++ svc_rdma_put_context(ctxt, 0);
++ return ret;
+ }
+
+-/* Map a read-chunk-list to an XDR and fast register the page-list.
+- *
+- * Assumptions:
+- * - chunk[0] position points to pages[0] at an offset of 0
+- * - pages[] will be made physically contiguous by creating a one-off memory
+- * region using the fastreg verb.
+- * - byte_count is # of bytes in read-chunk-list
+- * - ch_count is # of chunks in read-chunk-list
+- *
+- * Output:
+- * - sge array pointing into pages[] array.
+- * - chunk_sge array specifying sge index and count for each
+- * chunk in the read list
+- */
+-static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
++/* Issue an RDMA_READ using an FRMR to map the data sink */
++static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+ struct svc_rqst *rqstp,
+ struct svc_rdma_op_ctxt *head,
+- struct rpcrdma_msg *rmsgp,
+- struct svc_rdma_req_map *rpl_map,
+- struct svc_rdma_req_map *chl_map,
+- int ch_count,
+- int byte_count)
++ int *page_no,
++ u32 *page_offset,
++ u32 rs_handle,
++ u32 rs_length,
++ u64 rs_offset,
++ int last)
+ {
+- int page_no;
+- int ch_no;
+- u32 offset;
+- struct rpcrdma_read_chunk *ch;
+- struct svc_rdma_fastreg_mr *frmr;
+- int ret = 0;
++ struct ib_send_wr read_wr;
++ struct ib_send_wr inv_wr;
++ struct ib_send_wr fastreg_wr;
++ u8 key;
++ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
++ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
++ struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
++ int ret, read, pno;
++ u32 pg_off = *page_offset;
++ u32 pg_no = *page_no;
+
+- frmr = svc_rdma_get_frmr(xprt);
+ if (IS_ERR(frmr))
+ return -ENOMEM;
+
+- head->frmr = frmr;
+- head->arg.head[0] = rqstp->rq_arg.head[0];
+- head->arg.tail[0] = rqstp->rq_arg.tail[0];
+- head->arg.pages = &head->pages[head->count];
+- head->hdr_count = head->count; /* save count of hdr pages */
+- head->arg.page_base = 0;
+- head->arg.page_len = byte_count;
+- head->arg.len = rqstp->rq_arg.len + byte_count;
+- head->arg.buflen = rqstp->rq_arg.buflen + byte_count;
++ ctxt->direction = DMA_FROM_DEVICE;
++ ctxt->frmr = frmr;
++ pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
++ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
+
+- /* Fast register the page list */
+- frmr->kva = page_address(rqstp->rq_arg.pages[0]);
++ frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
+ frmr->direction = DMA_FROM_DEVICE;
+ frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
+- frmr->map_len = byte_count;
+- frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT;
+- for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
+- frmr->page_list->page_list[page_no] =
++ frmr->map_len = pages_needed << PAGE_SHIFT;
++ frmr->page_list_len = pages_needed;
++
++ for (pno = 0; pno < pages_needed; pno++) {
++ int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
++
++ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
++ head->arg.page_len += len;
++ head->arg.len += len;
++ if (!pg_off)
++ head->count++;
++ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
++ frmr->page_list->page_list[pno] =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+- rqstp->rq_arg.pages[page_no], 0,
++ head->arg.pages[pg_no], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE);
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- frmr->page_list->page_list[page_no]))
+- goto fatal_err;
++ ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
++ frmr->page_list->page_list[pno]);
++ if (ret)
++ goto err;
+ atomic_inc(&xprt->sc_dma_used);
+- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
+- }
+- head->count += page_no;
+-
+- /* rq_respages points one past arg pages */
+- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+- rqstp->rq_next_page = rqstp->rq_respages + 1;
+
+- /* Create the reply and chunk maps */
+- offset = 0;
+- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+- for (ch_no = 0; ch_no < ch_count; ch_no++) {
+- int len = ntohl(ch->rc_target.rs_length);
+- rpl_map->sge[ch_no].iov_base = frmr->kva + offset;
+- rpl_map->sge[ch_no].iov_len = len;
+- chl_map->ch[ch_no].count = 1;
+- chl_map->ch[ch_no].start = ch_no;
+- offset += len;
+- ch++;
++ /* adjust offset and wrap to next page if needed */
++ pg_off += len;
++ if (pg_off == PAGE_SIZE) {
++ pg_off = 0;
++ pg_no++;
++ }
++ rs_length -= len;
+ }
+
+- ret = svc_rdma_fastreg(xprt, frmr);
+- if (ret)
+- goto fatal_err;
+-
+- return ch_no;
+-
+- fatal_err:
+- printk("svcrdma: error fast registering xdr for xprt %p", xprt);
+- svc_rdma_put_frmr(xprt, frmr);
+- return -EIO;
+-}
+-
+-static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt,
+- struct svc_rdma_op_ctxt *ctxt,
+- struct svc_rdma_fastreg_mr *frmr,
+- struct kvec *vec,
+- u64 *sgl_offset,
+- int count)
+-{
+- int i;
+- unsigned long off;
++ if (last && rs_length == 0)
++ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
++ else
++ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+
+- ctxt->count = count;
+- ctxt->direction = DMA_FROM_DEVICE;
+- for (i = 0; i < count; i++) {
+- ctxt->sge[i].length = 0; /* in case map fails */
+- if (!frmr) {
+- BUG_ON(!virt_to_page(vec[i].iov_base));
+- off = (unsigned long)vec[i].iov_base & ~PAGE_MASK;
+- ctxt->sge[i].addr =
+- ib_dma_map_page(xprt->sc_cm_id->device,
+- virt_to_page(vec[i].iov_base),
+- off,
+- vec[i].iov_len,
+- DMA_FROM_DEVICE);
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- ctxt->sge[i].addr))
+- return -EINVAL;
+- ctxt->sge[i].lkey = xprt->sc_dma_lkey;
+- atomic_inc(&xprt->sc_dma_used);
+- } else {
+- ctxt->sge[i].addr = (unsigned long)vec[i].iov_base;
+- ctxt->sge[i].lkey = frmr->mr->lkey;
+- }
+- ctxt->sge[i].length = vec[i].iov_len;
+- *sgl_offset = *sgl_offset + vec[i].iov_len;
++ /* Bump the key */
++ key = (u8)(frmr->mr->lkey & 0x000000FF);
++ ib_update_fast_reg_key(frmr->mr, ++key);
++
++ ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
++ ctxt->sge[0].lkey = frmr->mr->lkey;
++ ctxt->sge[0].length = read;
++ ctxt->count = 1;
++ ctxt->read_hdr = head;
++
++ /* Prepare FASTREG WR */
++ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
++ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
++ fastreg_wr.send_flags = IB_SEND_SIGNALED;
++ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
++ fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
++ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
++ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
++ fastreg_wr.wr.fast_reg.length = frmr->map_len;
++ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
++ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
++ fastreg_wr.next = &read_wr;
++
++ /* Prepare RDMA_READ */
++ memset(&read_wr, 0, sizeof(read_wr));
++ read_wr.send_flags = IB_SEND_SIGNALED;
++ read_wr.wr.rdma.rkey = rs_handle;
++ read_wr.wr.rdma.remote_addr = rs_offset;
++ read_wr.sg_list = ctxt->sge;
++ read_wr.num_sge = 1;
++ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
++ read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
++ read_wr.wr_id = (unsigned long)ctxt;
++ read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
++ } else {
++ read_wr.opcode = IB_WR_RDMA_READ;
++ read_wr.next = &inv_wr;
++ /* Prepare invalidate */
++ memset(&inv_wr, 0, sizeof(inv_wr));
++ inv_wr.wr_id = (unsigned long)ctxt;
++ inv_wr.opcode = IB_WR_LOCAL_INV;
++ inv_wr.send_flags = IB_SEND_SIGNALED;
++ inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
++ }
++ ctxt->wr_op = read_wr.opcode;
++
++ /* Post the chain */
++ ret = svc_rdma_send(xprt, &fastreg_wr);
++ if (ret) {
++ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++ goto err;
+ }
+- return 0;
+-}
+
+-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
+-{
+- if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
+- RDMA_TRANSPORT_IWARP) &&
+- sge_count > 1)
+- return 1;
+- else
+- return min_t(int, sge_count, xprt->sc_max_sge);
++ /* return current location in page array */
++ *page_no = pg_no;
++ *page_offset = pg_off;
++ ret = read;
++ atomic_inc(&rdma_stat_read);
++ return ret;
++ err:
++ svc_rdma_unmap_dma(ctxt);
++ svc_rdma_put_context(ctxt, 0);
++ svc_rdma_put_frmr(xprt, frmr);
++ return ret;
+ }
+
+-/*
+- * Use RDMA_READ to read data from the advertised client buffer into the
+- * XDR stream starting at rq_arg.head[0].iov_base.
+- * Each chunk in the array
+- * contains the following fields:
+- * discrim - '1', This isn't used for data placement
+- * position - The xdr stream offset (the same for every chunk)
+- * handle - RMR for client memory region
+- * length - data transfer length
+- * offset - 64 bit tagged offset in remote memory region
+- *
+- * On our side, we need to read into a pagelist. The first page immediately
+- * follows the RPC header.
+- *
+- * This function returns:
+- * 0 - No error and no read-list found.
+- *
+- * 1 - Successful read-list processing. The data is not yet in
+- * the pagelist and therefore the RPC request must be deferred. The
+- * I/O completion will enqueue the transport again and
+- * svc_rdma_recvfrom will complete the request.
+- *
+- * <0 - Error processing/posting read-list.
+- *
+- * NOTE: The ctxt must not be touched after the last WR has been posted
+- * because the I/O completion processing may occur on another
+- * processor and free / modify the context. Ne touche pas!
+- */
+-static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+- struct rpcrdma_msg *rmsgp,
+- struct svc_rqst *rqstp,
+- struct svc_rdma_op_ctxt *hdr_ctxt)
++static int rdma_read_chunks(struct svcxprt_rdma *xprt,
++ struct rpcrdma_msg *rmsgp,
++ struct svc_rqst *rqstp,
++ struct svc_rdma_op_ctxt *head)
+ {
+- struct ib_send_wr read_wr;
+- struct ib_send_wr inv_wr;
+- int err = 0;
+- int ch_no;
+- int ch_count;
+- int byte_count;
+- int sge_count;
+- u64 sgl_offset;
++ int page_no, ch_count, ret;
+ struct rpcrdma_read_chunk *ch;
+- struct svc_rdma_op_ctxt *ctxt = NULL;
+- struct svc_rdma_req_map *rpl_map;
+- struct svc_rdma_req_map *chl_map;
++ u32 page_offset, byte_count;
++ u64 rs_offset;
++ rdma_reader_fn reader;
+
+ /* If no read list is present, return 0 */
+ ch = svc_rdma_get_read_chunk(rmsgp);
+@@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt,
+ if (ch_count > RPCSVC_MAXPAGES)
+ return -EINVAL;
+
+- /* Allocate temporary reply and chunk maps */
+- rpl_map = svc_rdma_get_req_map();
+- chl_map = svc_rdma_get_req_map();
++ /* The request is completed when the RDMA_READs complete. The
++ * head context keeps all the pages that comprise the
++ * request.
++ */
++ head->arg.head[0] = rqstp->rq_arg.head[0];
++ head->arg.tail[0] = rqstp->rq_arg.tail[0];
++ head->arg.pages = &head->pages[head->count];
++ head->hdr_count = head->count;
++ head->arg.page_base = 0;
++ head->arg.page_len = 0;
++ head->arg.len = rqstp->rq_arg.len;
++ head->arg.buflen = rqstp->rq_arg.buflen;
+
+- if (!xprt->sc_frmr_pg_list_len)
+- sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+- rpl_map, chl_map, ch_count,
+- byte_count);
++ /* Use FRMR if supported */
++ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
++ reader = rdma_read_chunk_frmr;
+ else
+- sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp,
+- rpl_map, chl_map, ch_count,
+- byte_count);
+- if (sge_count < 0) {
+- err = -EIO;
+- goto out;
+- }
+-
+- sgl_offset = 0;
+- ch_no = 0;
++ reader = rdma_read_chunk_lcl;
+
++ page_no = 0; page_offset = 0;
+ for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
+- ch->rc_discrim != 0; ch++, ch_no++) {
+- u64 rs_offset;
+-next_sge:
+- ctxt = svc_rdma_get_context(xprt);
+- ctxt->direction = DMA_FROM_DEVICE;
+- ctxt->frmr = hdr_ctxt->frmr;
+- ctxt->read_hdr = NULL;
+- clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
++ ch->rc_discrim != 0; ch++) {
+
+- /* Prepare READ WR */
+- memset(&read_wr, 0, sizeof read_wr);
+- read_wr.wr_id = (unsigned long)ctxt;
+- read_wr.opcode = IB_WR_RDMA_READ;
+- ctxt->wr_op = read_wr.opcode;
+- read_wr.send_flags = IB_SEND_SIGNALED;
+- read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle);
+ xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
+ &rs_offset);
+- read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset;
+- read_wr.sg_list = ctxt->sge;
+- read_wr.num_sge =
+- rdma_read_max_sge(xprt, chl_map->ch[ch_no].count);
+- err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr,
+- &rpl_map->sge[chl_map->ch[ch_no].start],
+- &sgl_offset,
+- read_wr.num_sge);
+- if (err) {
+- svc_rdma_unmap_dma(ctxt);
+- svc_rdma_put_context(ctxt, 0);
+- goto out;
+- }
+- if (((ch+1)->rc_discrim == 0) &&
+- (read_wr.num_sge == chl_map->ch[ch_no].count)) {
+- /*
+- * Mark the last RDMA_READ with a bit to
+- * indicate all RPC data has been fetched from
+- * the client and the RPC needs to be enqueued.
+- */
+- set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+- if (hdr_ctxt->frmr) {
+- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+- /*
+- * Invalidate the local MR used to map the data
+- * sink.
+- */
+- if (xprt->sc_dev_caps &
+- SVCRDMA_DEVCAP_READ_W_INV) {
+- read_wr.opcode =
+- IB_WR_RDMA_READ_WITH_INV;
+- ctxt->wr_op = read_wr.opcode;
+- read_wr.ex.invalidate_rkey =
+- ctxt->frmr->mr->lkey;
+- } else {
+- /* Prepare INVALIDATE WR */
+- memset(&inv_wr, 0, sizeof inv_wr);
+- inv_wr.opcode = IB_WR_LOCAL_INV;
+- inv_wr.send_flags = IB_SEND_SIGNALED;
+- inv_wr.ex.invalidate_rkey =
+- hdr_ctxt->frmr->mr->lkey;
+- read_wr.next = &inv_wr;
+- }
+- }
+- ctxt->read_hdr = hdr_ctxt;
+- }
+- /* Post the read */
+- err = svc_rdma_send(xprt, &read_wr);
+- if (err) {
+- printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n",
+- err);
+- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+- svc_rdma_unmap_dma(ctxt);
+- svc_rdma_put_context(ctxt, 0);
+- goto out;
++ byte_count = ntohl(ch->rc_target.rs_length);
++
++ while (byte_count > 0) {
++ ret = reader(xprt, rqstp, head,
++ &page_no, &page_offset,
++ ntohl(ch->rc_target.rs_handle),
++ byte_count, rs_offset,
++ ((ch+1)->rc_discrim == 0) /* last */
++ );
++ if (ret < 0)
++ goto err;
++ byte_count -= ret;
++ rs_offset += ret;
++ head->arg.buflen += ret;
+ }
+- atomic_inc(&rdma_stat_read);
+-
+- if (read_wr.num_sge < chl_map->ch[ch_no].count) {
+- chl_map->ch[ch_no].count -= read_wr.num_sge;
+- chl_map->ch[ch_no].start += read_wr.num_sge;
+- goto next_sge;
+- }
+- sgl_offset = 0;
+- err = 1;
+ }
+-
+- out:
+- svc_rdma_put_req_map(rpl_map);
+- svc_rdma_put_req_map(chl_map);
+-
++ ret = 1;
++ err:
+ /* Detach arg pages. svc_recv will replenish them */
+- for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
+- rqstp->rq_pages[ch_no] = NULL;
++ for (page_no = 0;
++ &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
++ rqstp->rq_pages[page_no] = NULL;
+
+- return err;
++ return ret;
+ }
+
+ static int rdma_read_complete(struct svc_rqst *rqstp,
+@@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+ struct svc_rdma_op_ctxt,
+ dto_q);
+ list_del_init(&ctxt->dto_q);
+- }
+- if (ctxt) {
+ spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ return rdma_read_complete(rqstp, ctxt);
+- }
+-
+- if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
++ } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+ ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+ struct svc_rdma_op_ctxt,
+ dto_q);
+@@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+ if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+ goto close_out;
+
+- BUG_ON(ret);
+ goto out;
+ }
+ dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+@@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
+ }
+
+ /* Read read-list data. */
+- ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt);
++ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
+ if (ret > 0) {
+ /* read-list posted, defer until data received from client. */
+ goto defer;
+- }
+- if (ret < 0) {
++ } else if (ret < 0) {
+ /* Post of read-list failed, free context. */
+ svc_rdma_put_context(ctxt, 1);
+ return 0;
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 7e024a5..49fd21a 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+@@ -49,152 +50,6 @@
+
+ #define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+-/* Encode an XDR as an array of IB SGE
+- *
+- * Assumptions:
+- * - head[0] is physically contiguous.
+- * - tail[0] is physically contiguous.
+- * - pages[] is not physically or virtually contiguous and consists of
+- * PAGE_SIZE elements.
+- *
+- * Output:
+- * SGE[0] reserved for RCPRDMA header
+- * SGE[1] data from xdr->head[]
+- * SGE[2..sge_count-2] data from xdr->pages[]
+- * SGE[sge_count-1] data from xdr->tail.
+- *
+- * The max SGE we need is the length of the XDR / pagesize + one for
+- * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES
+- * reserves a page for both the request and the reply header, and this
+- * array is only concerned with the reply we are assured that we have
+- * on extra page for the RPCRMDA header.
+- */
+-static int fast_reg_xdr(struct svcxprt_rdma *xprt,
+- struct xdr_buf *xdr,
+- struct svc_rdma_req_map *vec)
+-{
+- int sge_no;
+- u32 sge_bytes;
+- u32 page_bytes;
+- u32 page_off;
+- int page_no = 0;
+- u8 *frva;
+- struct svc_rdma_fastreg_mr *frmr;
+-
+- frmr = svc_rdma_get_frmr(xprt);
+- if (IS_ERR(frmr))
+- return -ENOMEM;
+- vec->frmr = frmr;
+-
+- /* Skip the RPCRDMA header */
+- sge_no = 1;
+-
+- /* Map the head. */
+- frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK);
+- vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
+- vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
+- vec->count = 2;
+- sge_no++;
+-
+- /* Map the XDR head */
+- frmr->kva = frva;
+- frmr->direction = DMA_TO_DEVICE;
+- frmr->access_flags = 0;
+- frmr->map_len = PAGE_SIZE;
+- frmr->page_list_len = 1;
+- page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
+- frmr->page_list->page_list[page_no] =
+- ib_dma_map_page(xprt->sc_cm_id->device,
+- virt_to_page(xdr->head[0].iov_base),
+- page_off,
+- PAGE_SIZE - page_off,
+- DMA_TO_DEVICE);
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- frmr->page_list->page_list[page_no]))
+- goto fatal_err;
+- atomic_inc(&xprt->sc_dma_used);
+-
+- /* Map the XDR page list */
+- page_off = xdr->page_base;
+- page_bytes = xdr->page_len + page_off;
+- if (!page_bytes)
+- goto encode_tail;
+-
+- /* Map the pages */
+- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+- vec->sge[sge_no].iov_len = page_bytes;
+- sge_no++;
+- while (page_bytes) {
+- struct page *page;
+-
+- page = xdr->pages[page_no++];
+- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
+- page_bytes -= sge_bytes;
+-
+- frmr->page_list->page_list[page_no] =
+- ib_dma_map_page(xprt->sc_cm_id->device,
+- page, page_off,
+- sge_bytes, DMA_TO_DEVICE);
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- frmr->page_list->page_list[page_no]))
+- goto fatal_err;
+-
+- atomic_inc(&xprt->sc_dma_used);
+- page_off = 0; /* reset for next time through loop */
+- frmr->map_len += PAGE_SIZE;
+- frmr->page_list_len++;
+- }
+- vec->count++;
+-
+- encode_tail:
+- /* Map tail */
+- if (0 == xdr->tail[0].iov_len)
+- goto done;
+-
+- vec->count++;
+- vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
+-
+- if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) ==
+- ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) {
+- /*
+- * If head and tail use the same page, we don't need
+- * to map it again.
+- */
+- vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
+- } else {
+- void *va;
+-
+- /* Map another page for the tail */
+- page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+- va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK);
+- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off;
+-
+- frmr->page_list->page_list[page_no] =
+- ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va),
+- page_off,
+- PAGE_SIZE,
+- DMA_TO_DEVICE);
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- frmr->page_list->page_list[page_no]))
+- goto fatal_err;
+- atomic_inc(&xprt->sc_dma_used);
+- frmr->map_len += PAGE_SIZE;
+- frmr->page_list_len++;
+- }
+-
+- done:
+- if (svc_rdma_fastreg(xprt, frmr))
+- goto fatal_err;
+-
+- return 0;
+-
+- fatal_err:
+- printk("svcrdma: Error fast registering memory for xprt %p\n", xprt);
+- vec->frmr = NULL;
+- svc_rdma_put_frmr(xprt, frmr);
+- return -EIO;
+-}
+-
+ static int map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec)
+@@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
+ BUG_ON(xdr->len !=
+ (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
+
+- if (xprt->sc_frmr_pg_list_len)
+- return fast_reg_xdr(xprt, xdr, vec);
+-
+ /* Skip the first sge, this is for the RPCRDMA header */
+ sge_no = 1;
+
+@@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
+ }
+
+ /* Assumptions:
+- * - We are using FRMR
+- * - or -
+ * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
+ */
+ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+@@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ sge_bytes = min_t(size_t,
+ bc, vec->sge[xdr_sge_no].iov_len-sge_off);
+ sge[sge_no].length = sge_bytes;
+- if (!vec->frmr) {
+- sge[sge_no].addr =
+- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
+- sge_bytes, DMA_TO_DEVICE);
+- xdr_off += sge_bytes;
+- if (ib_dma_mapping_error(xprt->sc_cm_id->device,
+- sge[sge_no].addr))
+- goto err;
+- atomic_inc(&xprt->sc_dma_used);
+- sge[sge_no].lkey = xprt->sc_dma_lkey;
+- } else {
+- sge[sge_no].addr = (unsigned long)
+- vec->sge[xdr_sge_no].iov_base + sge_off;
+- sge[sge_no].lkey = vec->frmr->mr->lkey;
+- }
++ sge[sge_no].addr =
++ dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
++ sge_bytes, DMA_TO_DEVICE);
++ xdr_off += sge_bytes;
++ if (ib_dma_mapping_error(xprt->sc_cm_id->device,
++ sge[sge_no].addr))
++ goto err;
++ atomic_inc(&xprt->sc_dma_used);
++ sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->count++;
+- ctxt->frmr = vec->frmr;
+ sge_off = 0;
+ sge_no++;
+ xdr_sge_no++;
+@@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ return 0;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+- svc_rdma_put_frmr(xprt, vec->frmr);
+ svc_rdma_put_context(ctxt, 0);
+ /* Fatal error, close transport */
+ return -EIO;
+@@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[1];
+
+- if (vec->frmr)
+- max_write = vec->frmr->map_len;
+- else
+- max_write = xprt->sc_max_sge * PAGE_SIZE;
++ max_write = xprt->sc_max_sge * PAGE_SIZE;
+
+ /* Write chunks start at the pagelist */
+ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+@@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[2];
+
+- if (vec->frmr)
+- max_write = vec->frmr->map_len;
+- else
+- max_write = xprt->sc_max_sge * PAGE_SIZE;
++ max_write = xprt->sc_max_sge * PAGE_SIZE;
+
+ /* xdr offset starts at RPC message */
+ nchunks = ntohl(arg_ary->wc_nchunks);
+@@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ int byte_count)
+ {
+ struct ib_send_wr send_wr;
+- struct ib_send_wr inv_wr;
+ int sge_no;
+ int sge_bytes;
+ int page_no;
+@@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ "svcrdma: could not post a receive buffer, err=%d."
+ "Closing transport %p.\n", ret, rdma);
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+- svc_rdma_put_frmr(rdma, vec->frmr);
+ svc_rdma_put_context(ctxt, 0);
+ return -ENOTCONN;
+ }
+@@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ /* Prepare the context */
+ ctxt->pages[0] = page;
+ ctxt->count = 1;
+- ctxt->frmr = vec->frmr;
+- if (vec->frmr)
+- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+- else
+- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags);
+
+ /* Prepare the SGE for the RPCRDMA Header */
+ ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+@@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ int xdr_off = 0;
+ sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
+ byte_count -= sge_bytes;
+- if (!vec->frmr) {
+- ctxt->sge[sge_no].addr =
+- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
+- sge_bytes, DMA_TO_DEVICE);
+- xdr_off += sge_bytes;
+- if (ib_dma_mapping_error(rdma->sc_cm_id->device,
+- ctxt->sge[sge_no].addr))
+- goto err;
+- atomic_inc(&rdma->sc_dma_used);
+- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+- } else {
+- ctxt->sge[sge_no].addr = (unsigned long)
+- vec->sge[sge_no].iov_base;
+- ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey;
+- }
++ ctxt->sge[sge_no].addr =
++ dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
++ sge_bytes, DMA_TO_DEVICE);
++ xdr_off += sge_bytes;
++ if (ib_dma_mapping_error(rdma->sc_cm_id->device,
++ ctxt->sge[sge_no].addr))
++ goto err;
++ atomic_inc(&rdma->sc_dma_used);
++ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[sge_no].length = sge_bytes;
+ }
+ BUG_ON(byte_count != 0);
+@@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ ctxt->sge[page_no+1].length = 0;
+ }
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++
+ BUG_ON(sge_no > rdma->sc_max_sge);
+ memset(&send_wr, 0, sizeof send_wr);
+ ctxt->wr_op = IB_WR_SEND;
+@@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+ send_wr.num_sge = sge_no;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+- if (vec->frmr) {
+- /* Prepare INVALIDATE WR */
+- memset(&inv_wr, 0, sizeof inv_wr);
+- inv_wr.opcode = IB_WR_LOCAL_INV;
+- inv_wr.send_flags = IB_SEND_SIGNALED;
+- inv_wr.ex.invalidate_rkey =
+- vec->frmr->mr->lkey;
+- send_wr.next = &inv_wr;
+- }
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret)
+@@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
+
+ err:
+ svc_rdma_unmap_dma(ctxt);
+- svc_rdma_put_frmr(rdma, vec->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ return -EIO;
+ }
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+index 02db8d9..e7323fb 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+@@ -162,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+ schedule_timeout_uninterruptible(msecs_to_jiffies(500));
+ }
+ map->count = 0;
+- map->frmr = NULL;
+ return map;
+ }
+
+@@ -338,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt,
+
+ switch (ctxt->wr_op) {
+ case IB_WR_SEND:
+- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+- svc_rdma_put_frmr(xprt, ctxt->frmr);
++ BUG_ON(ctxt->frmr);
+ svc_rdma_put_context(ctxt, 1);
+ break;
+
+ case IB_WR_RDMA_WRITE:
++ BUG_ON(ctxt->frmr);
+ svc_rdma_put_context(ctxt, 0);
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
++ svc_rdma_put_frmr(xprt, ctxt->frmr);
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
+ BUG_ON(!read_hdr);
+- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags))
+- svc_rdma_put_frmr(xprt, ctxt->frmr);
+ spin_lock_bh(&xprt->sc_rq_dto_lock);
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ list_add_tail(&read_hdr->dto_q,
+@@ -365,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt,
+ break;
+
+ default:
++ BUG_ON(1);
+ printk(KERN_ERR "svcrdma: unexpected completion type, "
+ "opcode=%d\n",
+ ctxt->wr_op);
+@@ -380,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt,
+ static void sq_cq_reap(struct svcxprt_rdma *xprt)
+ {
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+- struct ib_wc wc;
++ struct ib_wc wc_a[6];
++ struct ib_wc *wc;
+ struct ib_cq *cq = xprt->sc_sq_cq;
+ int ret;
+
++ memset(wc_a, 0, sizeof(wc_a));
++
+ if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
+ return;
+
+ ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
+ atomic_inc(&rdma_stat_sq_poll);
+- while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
+- if (wc.status != IB_WC_SUCCESS)
+- /* Close the transport */
+- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++ while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
++ int i;
+
+- /* Decrement used SQ WR count */
+- atomic_dec(&xprt->sc_sq_count);
+- wake_up(&xprt->sc_send_wait);
++ for (i = 0; i < ret; i++) {
++ wc = &wc_a[i];
++ if (wc->status != IB_WC_SUCCESS) {
++ dprintk("svcrdma: sq wc err status %d\n",
++ wc->status);
+
+- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
+- if (ctxt)
+- process_context(xprt, ctxt);
++ /* Close the transport */
++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
++ }
+
+- svc_xprt_put(&xprt->sc_xprt);
++ /* Decrement used SQ WR count */
++ atomic_dec(&xprt->sc_sq_count);
++ wake_up(&xprt->sc_send_wait);
++
++ ctxt = (struct svc_rdma_op_ctxt *)
++ (unsigned long)wc->wr_id;
++ if (ctxt)
++ process_context(xprt, ctxt);
++
++ svc_xprt_put(&xprt->sc_xprt);
++ }
+ }
+
+ if (ctxt)
+@@ -995,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
+ need_dma_mr = 0;
+ break;
+ case RDMA_TRANSPORT_IB:
+- if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
++ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) {
++ need_dma_mr = 1;
++ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
++ } else if (!(devattr.device_cap_flags &
++ IB_DEVICE_LOCAL_DMA_LKEY)) {
+ need_dma_mr = 1;
+ dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
+ } else
+@@ -1192,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+ /*
+- * If there are fewer SQ WR available than required to send a
+- * simple response, return false.
+- */
+- if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3))
+- return 0;
+-
+- /*
+- * ...or there are already waiters on the SQ,
++ * If there are already waiters on the SQ,
+ * return false.
+ */
+ if (waitqueue_active(&rdma->sc_send_wait))
+--
+1.7.1
+
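The refactored read path above picks one reader callback up front (rdma_read_chunk_frmr when SVCRDMA_DEVCAP_FAST_REG is set, rdma_read_chunk_lcl otherwise) and then, for each read chunk, keeps calling it until the chunk's byte count is consumed, advancing by however many bytes the callback reports it posted. The following is a minimal standalone C sketch of that dispatch-and-retry pattern only; the names (read_chunk_fn, read_small, read_large, read_chunks) are illustrative and are not kernel symbols.

/*
 * Standalone illustration of the reader-callback pattern: choose one
 * reader, then loop per chunk until the reader has consumed it.
 */
#include <stdio.h>

typedef int (*read_chunk_fn)(unsigned int offset, unsigned int length);

/* Pretend-reader that can only post up to 4 KB per call. */
static int read_small(unsigned int offset, unsigned int length)
{
	unsigned int n = length > 4096 ? 4096 : length;

	printf("small read: off=%u len=%u\n", offset, n);
	return (int)n;			/* bytes actually posted */
}

/* Pretend-reader that handles a whole chunk at once. */
static int read_large(unsigned int offset, unsigned int length)
{
	printf("large read: off=%u len=%u\n", offset, length);
	return (int)length;
}

static int read_chunks(const unsigned int *lengths, int nchunks, int use_large)
{
	read_chunk_fn reader = use_large ? read_large : read_small;
	int i, ret;

	for (i = 0; i < nchunks; i++) {
		unsigned int remaining = lengths[i];
		unsigned int offset = 0;	/* per-chunk offset */

		while (remaining > 0) {
			ret = reader(offset, remaining);
			if (ret < 0)
				return ret;	/* caller unwinds on error */
			remaining -= (unsigned int)ret;
			offset += (unsigned int)ret;
		}
	}
	return 0;
}

int main(void)
{
	unsigned int chunks[] = { 10000, 512 };

	return read_chunks(chunks, 2, 0);
}
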
--- /dev/null
+From 83710fc753d2ae158aa3cb7a7966d9c1bd05b792 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Thu, 5 Jun 2014 09:54:31 -0500
+Subject: [PATCH 132/132] svcrdma: Fence LOCAL_INV work requests
+
+Fencing forces the invalidate to only happen after all prior send
+work requests have been completed.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Reported-by: Devesh Sharma <Devesh.Sharma@Emulex.Com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+---
+ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +-
+ 1 files changed, 1 insertions(+), 1 deletions(-)
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+index 52d9f2c..8f92a61 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -338,7 +338,7 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
+ memset(&inv_wr, 0, sizeof(inv_wr));
+ inv_wr.wr_id = (unsigned long)ctxt;
+ inv_wr.opcode = IB_WR_LOCAL_INV;
+- inv_wr.send_flags = IB_SEND_SIGNALED;
++ inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
+ inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
+ }
+ ctxt->wr_op = read_wr.opcode;
+--
+1.7.1
+
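The one-line change above sets IB_SEND_FENCE on the LOCAL_INV work request so the MR is not invalidated while the preceding RDMA_READ that references it may still be in flight. The toy program below models only that ordering rule (a fenced request waits for all previously posted requests to complete); it is not verbs code and its names are invented for the example.

/*
 * Toy model of a fenced work request: an entry flagged WR_FENCE must
 * not start until every earlier entry has completed.
 */
#include <stdio.h>

#define WR_FENCE 0x1

struct wr {
	const char *name;
	int flags;
};

int main(void)
{
	struct wr queue[] = {
		{ "FAST_REG_MR", 0 },
		{ "RDMA_READ",   0 },
		{ "LOCAL_INV",   WR_FENCE },	/* invalidate only after the read finishes */
	};
	int outstanding = 0;	/* previously started, not yet completed */
	int i;

	for (i = 0; i < 3; i++) {
		if ((queue[i].flags & WR_FENCE) && outstanding) {
			printf("fence: waiting for %d outstanding request(s) before %s\n",
			       outstanding, queue[i].name);
			outstanding = 0;	/* their completions arrive first */
		}
		printf("start %s\n", queue[i].name);
		outstanding++;			/* completes asynchronously later */
	}
	return 0;
}
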
--- /dev/null
+commit 255942907e7ff498ab1545b5edce5690833ff640
+Author: Steve Wise <swise@opengridcomputing.com>
+Date: Wed Jul 9 13:49:15 2014 -0500
+
+ svcrdma: send_write() must not overflow the device's max sge
+
+ Function send_write() must stop creating SGEs when it reaches the
+ device maximum and return the number of bytes sent in the RDMA Write
+ to the caller.
+
+ Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+ Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index 49fd21a..9f1b506 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ xdr_sge_no++;
+ BUG_ON(xdr_sge_no > vec->count);
+ bc -= sge_bytes;
++ if (sge_no == xprt->sc_max_sge)
++ break;
+ }
+
+ /* Prepare WRITE WR */
+@@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
+ atomic_inc(&rdma_stat_write);
+ if (svc_rdma_send(xprt, &write_wr))
+ goto err;
+- return 0;
++ return write_len - bc;
+ err:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
+@@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+ {
+ u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+ int write_len;
+- int max_write;
+ u32 xdr_off;
+ int chunk_off;
+ int chunk_no;
+@@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[1];
+
+- max_write = xprt->sc_max_sge * PAGE_SIZE;
+-
+ /* Write chunks start at the pagelist */
+ for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
+ xfer_len && chunk_no < arg_ary->wc_nchunks;
+@@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
+ write_len);
+ chunk_off = 0;
+ while (write_len) {
+- int this_write;
+- this_write = min(write_len, max_write);
+ ret = send_write(xprt, rqstp,
+ ntohl(arg_ch->rs_handle),
+ rs_offset + chunk_off,
+ xdr_off,
+- this_write,
++ write_len,
+ vec);
+- if (ret) {
++ if (ret <= 0) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+- chunk_off += this_write;
+- xdr_off += this_write;
+- xfer_len -= this_write;
+- write_len -= this_write;
++ chunk_off += ret;
++ xdr_off += ret;
++ xfer_len -= ret;
++ write_len -= ret;
+ }
+ }
+ /* Update the req with the number of chunks actually used */
+@@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ {
+ u32 xfer_len = rqstp->rq_res.len;
+ int write_len;
+- int max_write;
+ u32 xdr_off;
+ int chunk_no;
+ int chunk_off;
+@@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ res_ary = (struct rpcrdma_write_array *)
+ &rdma_resp->rm_body.rm_chunks[2];
+
+- max_write = xprt->sc_max_sge * PAGE_SIZE;
+-
+ /* xdr offset starts at RPC message */
+ nchunks = ntohl(arg_ary->wc_nchunks);
+ for (xdr_off = 0, chunk_no = 0;
+@@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
+ write_len);
+ chunk_off = 0;
+ while (write_len) {
+- int this_write;
+-
+- this_write = min(write_len, max_write);
+ ret = send_write(xprt, rqstp,
+ ntohl(ch->rs_handle),
+ rs_offset + chunk_off,
+ xdr_off,
+- this_write,
++ write_len,
+ vec);
+- if (ret) {
++ if (ret <= 0) {
+ dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
+ ret);
+ return -EIO;
+ }
+- chunk_off += this_write;
+- xdr_off += this_write;
+- xfer_len -= this_write;
+- write_len -= this_write;
++ chunk_off += ret;
++ xdr_off += ret;
++ xfer_len -= ret;
++ write_len -= ret;
+ }
+ }
+ /* Update the req with the number of chunks actually used */
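
The hunks above change send_write() to stop at sc_max_sge entries and return the number of bytes it actually posted, and make send_write_chunks()/send_reply_chunks() loop on that return value instead of pre-splitting by max_write. Below is a minimal standalone sketch of that calling convention; the per-write cap and all names are made up for the example.

/*
 * Sketch of the partial-write calling convention: the helper may send
 * less than requested and returns how much it sent; the caller loops.
 */
#include <stdio.h>

#define FAKE_MAX_PER_WRITE (2 * 4096)	/* stand-in for sc_max_sge * PAGE_SIZE */

static int send_write_capped(unsigned int offset, unsigned int len)
{
	unsigned int sent = len > FAKE_MAX_PER_WRITE ? FAKE_MAX_PER_WRITE : len;

	printf("RDMA_WRITE off=%u len=%u\n", offset, sent);
	return (int)sent;		/* <= 0 would indicate failure */
}

int main(void)
{
	unsigned int write_len = 20000;	/* one write chunk */
	unsigned int chunk_off = 0;

	while (write_len) {
		int ret = send_write_capped(chunk_off, write_len);

		if (ret <= 0)
			return 1;	/* the caller treats this as -EIO */
		chunk_off += (unsigned int)ret;
		write_len -= (unsigned int)ret;
	}
	return 0;
}
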
--- /dev/null
+Index: compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+===================================================================
+--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+@@ -92,7 +92,9 @@ static void rdma_build_arg_xdr(struct sv
+ sge_no++;
+ }
+ rqstp->rq_respages = &rqstp->rq_pages[sge_no];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+
+ /* We should never run out of SGE because the limit is defined to
+ * support the max allowed RPC data length
+@@ -167,7 +169,9 @@ static int rdma_read_chunk_lcl(struct sv
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+ ctxt->sge[pno].addr =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ head->arg.pages[pg_no], pg_off,
+@@ -272,7 +276,9 @@ static int rdma_read_chunk_frmr(struct s
+ if (!pg_off)
+ head->count++;
+ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+ frmr->page_list->page_list[pno] =
+ ib_dma_map_page(xprt->sc_cm_id->device,
+ head->arg.pages[pg_no], 0,
+Index: compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+===================================================================
+--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -446,7 +446,9 @@ static int send_reply(struct svcxprt_rdm
+ if (page_no+1 >= sge_no)
+ ctxt->sge[page_no+1].length = 0;
+ }
++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0))
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
++#endif
+
+ BUG_ON(sge_no > rdma->sc_max_sge);
+ memset(&send_wr, 0, sizeof send_wr);
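
The compat-rdma hunks above guard every rq_next_page update with a LINUX_VERSION_CODE check so the backport still builds on kernels that predate that field. A minimal sketch of the guard idiom follows, assuming the usual <linux/version.h> encoding of (major, minor, patch); LINUX_VERSION_CODE is hard-coded here only so the example compiles standalone, whereas the kernel build normally supplies it.

/*
 * Version-guard idiom: KERNEL_VERSION packs a release into one integer
 * so the preprocessor can compare against LINUX_VERSION_CODE.
 */
#include <stdio.h>

#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))
#define LINUX_VERSION_CODE KERNEL_VERSION(3, 10, 0)	/* demo target only */

int main(void)
{
#if (LINUX_VERSION_CODE > KERNEL_VERSION(3, 7, 0))
	printf("rq_next_page path compiled in\n");
#else
	printf("legacy rq_resused path compiled in\n");
#endif
	return 0;
}
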
+++ /dev/null
-commit 2b7bbc963da8d076f263574af4138b5df2e1581f
-Author: Chuck Lever <chuck.lever@oracle.com>
-Date: Wed Mar 12 12:51:30 2014 -0400
-
- SUNRPC: Fix large reads on NFS/RDMA
-
- After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves
- in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a
- few hundred bytes via NFS/RDMA no longer work. This commit exposed
- a long-standing bug in rpcrdma_inline_fixup().
-
- I reproduce this with an rsize=4096 mount using the cthon04 basic
- tests. Test 5 fails with an EIO error.
-
- For my reproducer, kernel log shows:
-
- NFS: server cheating in read reply: count 4096 > recvd 0
-
- rpcrdma_inline_fixup() is zeroing the xdr_stream::page_len field,
- and xdr_align_pages() is now returning that value to the READ XDR
- decoder function.
-
- That field is set up by xdr_inline_pages() by the READ XDR encoder
- function. As far as I can tell, it is supposed to be left alone
- after that, as it describes the dimensions of the reply xdr_stream,
- not the contents of that stream.
-
- Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391
- Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
- Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
-
-diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
-index e03725b..96ead52 100644
---- a/net/sunrpc/xprtrdma/rpc_rdma.c
-+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
-@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
- break;
- page_base = 0;
- }
-- rqst->rq_rcv_buf.page_len = olen - copy_len;
-- } else
-- rqst->rq_rcv_buf.page_len = 0;
-+ }
-
- if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
- curlen = copy_len;
+++ /dev/null
-Fix regression in NFSRDMA server
-
-From: Tom Tucker <tom@ogc.us>
-
-The server regression was caused by the addition of rq_next_page
-(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). There were a few places that
-were missed with the update of the rq_respages array.
-
-NOTE: Patch modified to apply against OFED.
-
-Signed-off-by: Tom Tucker <tom@ogc.us>
-Tested-by: Steve Wise <swise@ogc.us>
-
----
-
---- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2014-03-31 15:31:05.214903226 -0500
-+++ a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2014-03-31 15:34:40.042047141 -0500
-@@ -90,6 +90,9 @@ static void rdma_build_arg_xdr(struct sv
- sge_no++;
- }
- rqstp->rq_respages = &rqstp->rq_pages[sge_no];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+ rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-
- /* We should never run out of SGE because the limit is defined to
- * support the max allowed RPC data length
-@@ -169,6 +172,9 @@ static int map_read_chunks(struct svcxpr
- */
- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+ rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-
- byte_count -= sge_bytes;
- ch_bytes -= sge_bytes;
-@@ -276,6 +282,9 @@ static int fast_reg_read_chunks(struct s
-
- /* rq_respages points one past arg pages */
- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0))
-+ rqstp->rq_next_page = rqstp->rq_respages + 1;
-+#endif
-
- /* Create the reply and chunk maps */
- offset = 0;
-@@ -527,9 +536,6 @@ next_sge:
- #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
- while (rqstp->rq_resused)
- rqstp->rq_respages[--rqstp->rq_resused] = NULL;
--#else
-- while (rqstp->rq_next_page != rqstp->rq_respages)
-- *(--rqstp->rq_next_page) = NULL;
- #endif
-
- return err;
-@@ -558,7 +564,7 @@ static int rdma_read_complete(struct svc
- #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
- rqstp->rq_resused = 0;
- #else
-- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
-+ rqstp->rq_next_page = rqstp->rq_respages + 1;
- #endif
-
- /* Rebuild rq_arg head and tail. */
index xxxxxxx..xxxxxxx xxxxxx
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
-@@ -524,8 +524,13 @@ next_sge:
- * Detach res pages. If svc_release sees any it will attempt to
- * put them.
- */
-+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
-+ while (rqstp->rq_resused)
-+ rqstp->rq_respages[--rqstp->rq_resused] = NULL;
-+#else
- while (rqstp->rq_next_page != rqstp->rq_respages)
- *(--rqstp->rq_next_page) = NULL;
-+#endif
-
- return err;
- }
-@@ -550,7 +555,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
+@@ -550,7 +556,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
/* rq_respages starts after the last arg page */
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
+- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0))
+ rqstp->rq_resused = 0;
+#else
- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no];
++ rqstp->rq_next_page = rqstp->rq_respages + 1;
+#endif
/* Rebuild rq_arg head and tail. */
xprt_rdma_slot_table_entries);
if (xprt == NULL) {
dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
-@@ -450,8 +452,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
- }
-
- static int
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
- xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
-+#else
-+xprt_rdma_reserve_xprt(struct rpc_task *task)
-+#endif
- {
-+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) && !defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
-+ struct rpc_xprt *xprt = task->tk_xprt;
-+#endif
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- int credits = atomic_read(&r_xprt->rx_buf.rb_credits);
-
-@@ -463,7 +472,11 @@ xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task)
- BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0);
- }
- xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale;
-+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS)
- return xprt_reserve_xprt_cong(xprt, task);
-+#else
-+ return xprt_reserve_xprt_cong(task);
-+#endif
- }
-
- /*