From 20c0cf89971d35f7ccbd547204180da906808f48 Mon Sep 17 00:00:00 2001 From: "Jeffrey C. Becker" Date: Fri, 15 Aug 2014 17:10:44 -0700 Subject: [PATCH] NFSoRDMA: fixes for 3.12 and RHEL7, RHEL6.5, SLES11SP3 backports Signed-off-by: Jeff Becker --- ...1-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch | 51 + ...ve-KERN_INFO-from-dprintk-call-sites.patch | 49 + ...-printk-when-memory-allocation-fails.patch | 30 + ...104-Fix-regression-in-NFSRDMA-server.patch | 74 + ...et-calculation-for-non-page-aligned-.patch | 33 + ...a-Backport-RPC_CWNDSHIFT-from-sunrpc.patch | 12 + ...e-device-s-max-fast-register-page-li.patch | 146 ++ .../0108-nfs-rdma-Fix-for-FMR-leaks.patch | 140 ++ ...A-must-invoke-xprt_wake_pending_task.patch | 112 ++ ...BOUNCEBUFFERS-memory-registration-mo.patch | 104 ++ ...Remove-MEMWINDOWS-registration-modes.patch | 455 ++++++ ...ve-REGISTER-memory-registration-mode.patch | 191 +++ ...ck-to-MTHCAFMR-when-FRMR-is-not-supp.patch | 73 + ...eports-Invalid-mount-option-if-memre.patch | 46 + ...y-rpcrdma_deregister_external-synops.patch | 86 ++ ...-Make-rpcrdma_ep_destroy-return-void.patch | 95 ++ ...-xprtrdma-Split-the-completion-queue.patch | 395 ++++++ ...lock-contention-in-completion-handle.patch | 50 + ...calls-to-ib_poll_cq-in-completion-ha.patch | 165 +++ ...imit-work-done-by-completion-handler.patch | 79 ++ ...the-number-of-hardway-buffer-allocat.patch | 128 ++ ...ia-ri_id-qp-is-not-NULL-when-reconne.patch | 94 ++ ...23-xprtrdma-Remove-Tavor-MTU-setting.patch | 55 + ...4-xprtrdma-Allocate-missing-pagelist.patch | 38 + ...ros-for-reconnection-timeout-constan.patch | 61 + ...onnection-timeout-after-successful-r.patch | 33 + ...deadlock-when-credit-window-is-reset.patch | 104 ++ ...28-xprtrdma-Remove-BUG_ON-call-sites.patch | 83 ++ ...a-Disconnect-on-registration-failure.patch | 215 +++ ...0-svcrdma-refactor-marshalling-logic.patch | 1243 +++++++++++++++++ ...vcrdma-Fence-LOCAL_INV-work-requests.patch | 31 + ...svcrdma-send_write-must-not-overflow.patch | 129 ++ .../0133-nfsrdma-backport-fixes.patch | 48 + ...4-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch | 46 - ...DMA-Fix-regression-in-NFSRDMA-server.patch | 66 - .../0023-nfsrdma-Backport-for-rhel6.5.patch | 19 +- .../0026-nfsrdma-Backport-for-sles11sp3.patch | 28 - 37 files changed, 4651 insertions(+), 156 deletions(-) create mode 100644 linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch create mode 100644 linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch create mode 100644 linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch create mode 100644 linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch create mode 100644 linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch create mode 100644 linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch create mode 100644 linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch create mode 100644 linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch create mode 100644 linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch create mode 100644 linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch create mode 100644 linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch create mode 100644 linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch create mode 100644 
linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch create mode 100644 linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch create mode 100644 linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch create mode 100644 linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch create mode 100644 linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch create mode 100644 linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch create mode 100644 linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch create mode 100644 linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch create mode 100644 linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch create mode 100644 linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch create mode 100644 linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch create mode 100644 linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch create mode 100644 linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch create mode 100644 linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch create mode 100644 linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch create mode 100644 linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch create mode 100644 linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch create mode 100644 linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch create mode 100644 linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch create mode 100644 linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch create mode 100644 linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch delete mode 100644 linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch delete mode 100644 linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch diff --git a/linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch b/linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch new file mode 100644 index 0000000..20a476d --- /dev/null +++ b/linux-next-cherry-picks/0101-SUNRPC-Fix-large-reads-on-NFS-RDMA.patch @@ -0,0 +1,51 @@ +From 2b7bbc963da8d076f263574af4138b5df2e1581f Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 12 Mar 2014 12:51:30 -0400 +Subject: [PATCH 101/132] SUNRPC: Fix large reads on NFS/RDMA + +After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves +in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a +few hundred bytes via NFS/RDMA no longer work. This commit exposed +a long-standing bug in rpcrdma_inline_fixup(). + +I reproduce this with an rsize=4096 mount using the cthon04 basic +tests. Test 5 fails with an EIO error. + +For my reproducer, kernel log shows: + + NFS: server cheating in read reply: count 4096 > recvd 0 + +rpcrdma_inline_fixup() is zeroing the xdr_stream::page_len field, +and xdr_align_pages() is now returning that value to the READ XDR +decoder function. + +That field is set up by xdr_inline_pages() by the READ XDR encoder +function. 
As far as I can tell, it is supposed to be left alone +after that, as it describes the dimensions of the reply xdr_stream, +not the contents of that stream. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391 +Signed-off-by: Chuck Lever +Signed-off-by: Trond Myklebust +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 4 +--- + 1 files changed, 1 insertions(+), 3 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index e03725b..96ead52 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) + break; + page_base = 0; + } +- rqst->rq_rcv_buf.page_len = olen - copy_len; +- } else +- rqst->rq_rcv_buf.page_len = 0; ++ } + + if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { + curlen = copy_len; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch b/linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch new file mode 100644 index 0000000..54a2345 --- /dev/null +++ b/linux-next-cherry-picks/0102-SUNRPC-remove-KERN_INFO-from-dprintk-call-sites.patch @@ -0,0 +1,49 @@ +From 3a0799a94c0384a3b275a73267aaa10517b1bf7d Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 12 Mar 2014 12:51:39 -0400 +Subject: [PATCH 102/132] SUNRPC: remove KERN_INFO from dprintk() call sites + +The use of KERN_INFO causes garbage characters to appear when +debugging is enabled. + +Signed-off-by: Chuck Lever +Signed-off-by: Trond Myklebust +--- + net/sunrpc/xprtrdma/transport.c | 10 +++++----- + 1 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 285dc08..1eb9c46 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void) + { + int rc; + +- dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); ++ dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); + #ifdef RPC_DEBUG + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); +@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void) + if (rc) + return rc; + +- dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); ++ dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); + +- dprintk(KERN_INFO "Defaults:\n"); +- dprintk(KERN_INFO "\tSlots %d\n" ++ dprintk("Defaults:\n"); ++ dprintk("\tSlots %d\n" + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", + xprt_rdma_slot_table_entries, + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); +- dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", ++ dprintk("\tPadding %d\n\tMemreg %d\n", + xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); + + #ifdef RPC_DEBUG +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch b/linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch new file mode 100644 index 0000000..9efe81c --- /dev/null +++ b/linux-next-cherry-picks/0103-svcrdma-fix-printk-when-memory-allocation-fails.patch @@ -0,0 +1,30 @@ +From c42a01eee74dfd9ba8f8abb7cb81dd9a8839dc7b Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Mon, 10 Mar 2014 11:33:48 -0400 +Subject: [PATCH 103/132] svcrdma: fix printk when memory allocation fails + +It retries in 1s, not 1000 jiffies. + +Signed-off-by: Jeff Layton +Signed-off-by: J. 
Bruce Fields +--- + net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +-- + 1 files changed, 1 insertions(+), 2 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c +index 62e4f9b..25688fa 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c +@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void) + + while ((page = alloc_page(GFP_KERNEL)) == NULL) { + /* If we can't get memory, wait a bit and try again */ +- printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " +- "jiffies.\n"); ++ printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n"); + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); + } + return page; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch b/linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch new file mode 100644 index 0000000..172f9a2 --- /dev/null +++ b/linux-next-cherry-picks/0104-Fix-regression-in-NFSRDMA-server.patch @@ -0,0 +1,74 @@ +From 7e4359e2611f95a97037e2b6905eab52f28afbeb Mon Sep 17 00:00:00 2001 +From: Tom Tucker +Date: Tue, 25 Mar 2014 15:14:57 -0500 +Subject: [PATCH 104/132] Fix regression in NFSRDMA server + +The server regression was caused by the addition of rq_next_page +(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). There were a few places that +were missed with the update of the rq_respages array. + +Signed-off-by: Tom Tucker +Tested-by: Steve Wise +Signed-off-by: J. Bruce Fields +--- + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 12 ++++-------- + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1 + + 2 files changed, 5 insertions(+), 8 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +index 0ce7552..8d904e4 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length +@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt, + */ + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; + + byte_count -= sge_bytes; + ch_bytes -= sge_bytes; +@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, + + /* rq_respages points one past arg pages */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; + + /* Create the reply and chunk maps */ + offset = 0; +@@ -520,13 +523,6 @@ next_sge: + for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) + rqstp->rq_pages[ch_no] = NULL; + +- /* +- * Detach res pages. If svc_release sees any it will attempt to +- * put them. 
+- */ +- while (rqstp->rq_next_page != rqstp->rq_respages) +- *(--rqstp->rq_next_page) = NULL; +- + return err; + } + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +index c1d124d..11e90f8 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -625,6 +625,7 @@ static int send_reply(struct svcxprt_rdma *rdma, + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; + } ++ rqstp->rq_next_page = rqstp->rq_respages + 1; + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch b/linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch new file mode 100644 index 0000000..8e3e81f --- /dev/null +++ b/linux-next-cherry-picks/0105-svcrdma-fix-offset-calculation-for-non-page-aligned-.patch @@ -0,0 +1,33 @@ +From 3cbe01a94c7b369f943f8a9d40394198d757cdd4 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Mon, 17 Mar 2014 13:10:05 -0400 +Subject: [PATCH 106/132] svcrdma: fix offset calculation for non-page aligned sge entries + +The xdr_off value in dma_map_xdr gets passed to ib_dma_map_page as the +offset into the page to be mapped. This calculation does not correctly +take into account the case where the data starts at some offset into +the page. Increment the xdr_off by the page_base to ensure that it is +respected. + +Cc: Tom Tucker +Signed-off-by: Jeff Layton +Signed-off-by: J. Bruce Fields +--- + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 1 + + 1 files changed, 1 insertions(+), 0 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +index 11e90f8..7e024a5 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, + xdr_off -= xdr->head[0].iov_len; + if (xdr_off < xdr->page_len) { + /* This offset is in the page list */ ++ xdr_off += xdr->page_base; + page = xdr->pages[xdr_off >> PAGE_SHIFT]; + xdr_off &= ~PAGE_MASK; + } else { +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch b/linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch new file mode 100644 index 0000000..11d70f2 --- /dev/null +++ b/linux-next-cherry-picks/0106-xprtrdma-Backport-RPC_CWNDSHIFT-from-sunrpc.patch @@ -0,0 +1,12 @@ +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index cc1445d..f1cd3d3 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -53,6 +53,7 @@ + + #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ + #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ ++#define RPC_CWNDSHIFT (8U) /* backported from linux/sunrpc/xprt.h */ + + /* + * Interface Adapter -- one per transport instance diff --git a/linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch b/linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch new file mode 100644 index 0000000..935efdb --- /dev/null +++ b/linux-next-cherry-picks/0107-xprtrdma-mind-the-device-s-max-fast-register-page-li.patch @@ -0,0 +1,146 @@ +From 0fc6c4e7bb287148eb5e949efd89327929d4841d Mon Sep 17 00:00:00 2001 +From: Steve Wise +Date: Wed, 28 May 2014 10:32:00 -0400 +Subject: [PATCH 108/132] xprtrdma: mind the 
device's max fast register page list depth + +Some rdma devices don't support a fast register page list depth of +at least RPCRDMA_MAX_DATA_SEGS. So xprtrdma needs to chunk its fast +register regions according to the minimum of the device max supported +depth or RPCRDMA_MAX_DATA_SEGS. + +Signed-off-by: Steve Wise +Reviewed-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 4 --- + net/sunrpc/xprtrdma/verbs.c | 47 +++++++++++++++++++++++++++++---------- + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + + 3 files changed, 36 insertions(+), 16 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 96ead52..400aa1b 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -248,10 +248,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + /* success. all failures return above */ + req->rl_nchunks = nchunks; + +- BUG_ON(nchunks == 0); +- BUG_ON((r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) +- && (nchunks > 3)); +- + /* + * finish off header. If write, marshal discrim and nchunks. + */ +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 9372656..55fb09a 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -539,6 +539,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + __func__); + memreg = RPCRDMA_REGISTER; + #endif ++ } else { ++ /* Mind the ia limit on FRMR page list depth */ ++ ia->ri_max_frmr_depth = min_t(unsigned int, ++ RPCRDMA_MAX_DATA_SEGS, ++ devattr.max_fast_reg_page_list_len); + } + break; + } +@@ -659,24 +664,42 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + ep->rep_attr.srq = NULL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + switch (ia->ri_memreg_strategy) { +- case RPCRDMA_FRMR: ++ case RPCRDMA_FRMR: { ++ int depth = 7; ++ + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head +- * 3. FRMR reg WR for pagelist +- * 4. FRMR invalidate WR for pagelist ++ * 3. N FRMR reg WRs for pagelist ++ * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ +- ep->rep_attr.cap.max_send_wr *= 7; ++ ++ /* Calculate N if the device max FRMR depth is smaller than ++ * RPCRDMA_MAX_DATA_SEGS. ++ */ ++ if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { ++ int delta = RPCRDMA_MAX_DATA_SEGS - ++ ia->ri_max_frmr_depth; ++ ++ do { ++ depth += 2; /* FRMR reg + invalidate */ ++ delta -= ia->ri_max_frmr_depth; ++ } while (delta > 0); ++ ++ } ++ ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) { +- cdata->max_requests = devattr.max_qp_wr / 7; ++ cdata->max_requests = devattr.max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; +- ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7; ++ ep->rep_attr.cap.max_send_wr = cdata->max_requests * ++ depth; + } + break; ++ } + case RPCRDMA_MEMWINDOWS_ASYNC: + case RPCRDMA_MEMWINDOWS: + /* Add room for mw_binds+unbinds - overkill! 
*/ +@@ -1043,16 +1066,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + case RPCRDMA_FRMR: + for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, +- RPCRDMA_MAX_SEGS); ++ ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + goto out; + } +- r->r.frmr.fr_pgl = +- ib_alloc_fast_reg_page_list(ia->ri_id->device, +- RPCRDMA_MAX_SEGS); ++ r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( ++ ia->ri_id->device, ++ ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " +@@ -1498,8 +1521,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; +- if (*nsegs > RPCRDMA_MAX_DATA_SEGS) +- *nsegs = RPCRDMA_MAX_DATA_SEGS; ++ if (*nsegs > ia->ri_max_frmr_depth) ++ *nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < *nsegs;) { + rpcrdma_map_one(ia, seg, writing); + pa = seg->mr_dma; +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index cc1445d..98340a3 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -66,6 +66,7 @@ struct rpcrdma_ia { + struct completion ri_done; + int ri_async_rc; + enum rpcrdma_memreg ri_memreg_strategy; ++ unsigned int ri_max_frmr_depth; + }; + + /* +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch b/linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch new file mode 100644 index 0000000..5e01e25 --- /dev/null +++ b/linux-next-cherry-picks/0108-nfs-rdma-Fix-for-FMR-leaks.patch @@ -0,0 +1,140 @@ +From 4034ba04231f554abb97ad8900a4c1af03f8e21d Mon Sep 17 00:00:00 2001 +From: Allen Andrews +Date: Wed, 28 May 2014 10:32:09 -0400 +Subject: [PATCH 109/132] nfs-rdma: Fix for FMR leaks + +Two memory region leaks were found during testing: + +1. rpcrdma_buffer_create: While allocating RPCRDMA_FRMR's +ib_alloc_fast_reg_mr is called and then ib_alloc_fast_reg_page_list is +called. If ib_alloc_fast_reg_page_list returns an error it bails out of +the routine dropping the last ib_alloc_fast_reg_mr frmr region creating a +memory leak. Added code to dereg the last frmr if +ib_alloc_fast_reg_page_list fails. + +2. rpcrdma_buffer_destroy: While cleaning up, the routine will only free +the MR's on the rb_mws list if there are rb_send_bufs present. However, in +rpcrdma_buffer_create while the rb_mws list is being built if one of the MR +allocation requests fail after some MR's have been allocated on the rb_mws +list the routine never gets to create any rb_send_bufs but instead jumps to +the rpcrdma_buffer_destroy routine which will never free the MR's on rb_mws +list because the rb_send_bufs were never created. This leaks all the MR's +on the rb_mws list that were created prior to one of the MR allocations +failing. + +Issue(2) was seen during testing. Our adapter had a finite number of MR's +available and we created enough connections to where we saw an MR +allocation failure on our Nth NFS connection request. After the kernel +cleaned up the resources it had allocated for the Nth connection we noticed +that FMR's had been leaked due to the coding error described above. + +Issue(1) was seen during a code review while debugging issue(2). 
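[Editor's note] Both leaks described above come down to the same cleanup discipline. As a point of reference only — this is a standalone illustration with invented names and plain malloc/free, not the xprtrdma code — a minimal sketch of the error-path rule that fix (1) enforces:

/* Standalone illustration (invented names) of the rule behind fix (1):
 * once the first allocation in a pair has succeeded, a failure of the
 * second must release the first, or that resource is leaked exactly as
 * the FRMR was here. */
#include <stdlib.h>

struct fake_frmr {
	void *mr;        /* stands in for the ib_alloc_fast_reg_mr() result */
	void *page_list; /* stands in for the fast-reg page list */
};

static int fake_frmr_create(struct fake_frmr *f)
{
	f->mr = malloc(64);
	if (!f->mr)
		return -1;

	f->page_list = malloc(64);
	if (!f->page_list) {
		free(f->mr);   /* the step fix (1) adds: undo the earlier allocation */
		f->mr = NULL;
		return -1;
	}
	return 0;
}

Fix (2) is the same idea applied to teardown: the rb_mws cleanup must not be gated on an unrelated structure (rb_send_bufs) having been created.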
+ +Signed-off-by: Allen Andrews +Reviewed-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 73 ++++++++++++++++++++++-------------------- + 1 files changed, 38 insertions(+), 35 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 55fb09a..8f9704e 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -1081,6 +1081,8 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); ++ ++ ib_dereg_mr(r->r.frmr.fr_mr); + goto out; + } + list_add(&r->mw_list, &buf->rb_mws); +@@ -1217,41 +1219,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) + kfree(buf->rb_recv_bufs[i]); + } + if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { +- while (!list_empty(&buf->rb_mws)) { +- r = list_entry(buf->rb_mws.next, +- struct rpcrdma_mw, mw_list); +- list_del(&r->mw_list); +- switch (ia->ri_memreg_strategy) { +- case RPCRDMA_FRMR: +- rc = ib_dereg_mr(r->r.frmr.fr_mr); +- if (rc) +- dprintk("RPC: %s:" +- " ib_dereg_mr" +- " failed %i\n", +- __func__, rc); +- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +- break; +- case RPCRDMA_MTHCAFMR: +- rc = ib_dealloc_fmr(r->r.fmr); +- if (rc) +- dprintk("RPC: %s:" +- " ib_dealloc_fmr" +- " failed %i\n", +- __func__, rc); +- break; +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- rc = ib_dealloc_mw(r->r.mw); +- if (rc) +- dprintk("RPC: %s:" +- " ib_dealloc_mw" +- " failed %i\n", +- __func__, rc); +- break; +- default: +- break; +- } +- } + rpcrdma_deregister_internal(ia, + buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); +@@ -1259,6 +1226,42 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) + } + } + ++ while (!list_empty(&buf->rb_mws)) { ++ r = list_entry(buf->rb_mws.next, ++ struct rpcrdma_mw, mw_list); ++ list_del(&r->mw_list); ++ switch (ia->ri_memreg_strategy) { ++ case RPCRDMA_FRMR: ++ rc = ib_dereg_mr(r->r.frmr.fr_mr); ++ if (rc) ++ dprintk("RPC: %s:" ++ " ib_dereg_mr" ++ " failed %i\n", ++ __func__, rc); ++ ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); ++ break; ++ case RPCRDMA_MTHCAFMR: ++ rc = ib_dealloc_fmr(r->r.fmr); ++ if (rc) ++ dprintk("RPC: %s:" ++ " ib_dealloc_fmr" ++ " failed %i\n", ++ __func__, rc); ++ break; ++ case RPCRDMA_MEMWINDOWS_ASYNC: ++ case RPCRDMA_MEMWINDOWS: ++ rc = ib_dealloc_mw(r->r.mw); ++ if (rc) ++ dprintk("RPC: %s:" ++ " ib_dealloc_mw" ++ " failed %i\n", ++ __func__, rc); ++ break; ++ default: ++ break; ++ } ++ } ++ + kfree(buf->rb_pool); + } + +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch b/linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch new file mode 100644 index 0000000..53ca2a6 --- /dev/null +++ b/linux-next-cherry-picks/0109-xprtrdma-RPC-RDMA-must-invoke-xprt_wake_pending_task.patch @@ -0,0 +1,112 @@ +From 254f91e2fa1f4cc18fd2eb9d5481888ffe126d5b Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:32:17 -0400 +Subject: [PATCH 110/132] xprtrdma: RPC/RDMA must invoke xprt_wake_pending_tasks() in process context + +An IB provider can invoke rpcrdma_conn_func() in an IRQ context, +thus rpcrdma_conn_func() cannot be allowed to directly invoke +generic RPC functions like xprt_wake_pending_tasks(). 
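[Editor's note] The patch below does this by bouncing the connection-state change to a delayed work item that runs in process context. For orientation only — the names here are placeholders, not the xprtrdma symbols — the general shape of that deferral pattern in kernel C is roughly:

/* Illustrative sketch of the IRQ-to-process-context deferral pattern;
 * my_ep and my_conn_worker are placeholder names, not the real symbols. */
#include <linux/workqueue.h>

struct my_ep {
	struct delayed_work connect_worker;
	/* ...connection state... */
};

/* Runs later in process context, where it is safe to take sleeping
 * locks and wake pending RPC tasks. */
static void my_conn_worker(struct work_struct *work)
{
	struct my_ep *ep = container_of(work, struct my_ep,
					connect_worker.work);
	(void)ep;
}

/* Called from the provider upcall, possibly in IRQ context: do no real
 * work here, only schedule it. */
static void my_conn_func(struct my_ep *ep)
{
	schedule_delayed_work(&ep->connect_worker, 0);
}

/* At setup time: INIT_DELAYED_WORK(&ep->connect_worker, my_conn_worker); */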
+ +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 22 +++++++++++++++------- + net/sunrpc/xprtrdma/verbs.c | 3 +++ + net/sunrpc/xprtrdma/xprt_rdma.h | 3 +++ + 3 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 400aa1b..c296468 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -676,15 +676,11 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) + rqst->rq_private_buf = rqst->rq_rcv_buf; + } + +-/* +- * This function is called when an async event is posted to +- * the connection which changes the connection state. All it +- * does at this point is mark the connection up/down, the rpc +- * timers do the rest. +- */ + void +-rpcrdma_conn_func(struct rpcrdma_ep *ep) ++rpcrdma_connect_worker(struct work_struct *work) + { ++ struct rpcrdma_ep *ep = ++ container_of(work, struct rpcrdma_ep, rep_connect_worker.work); + struct rpc_xprt *xprt = ep->rep_xprt; + + spin_lock_bh(&xprt->transport_lock); +@@ -701,6 +697,18 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) + } + + /* ++ * This function is called when an async event is posted to ++ * the connection which changes the connection state. All it ++ * does at this point is mark the connection up/down, the rpc ++ * timers do the rest. ++ */ ++void ++rpcrdma_conn_func(struct rpcrdma_ep *ep) ++{ ++ schedule_delayed_work(&ep->rep_connect_worker, 0); ++} ++ ++/* + * This function is called when memory window unbind which we are waiting + * for completes. Just use rr_func (zeroed by upcall) to signal completion. + */ +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 8f9704e..9cb88f3 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -742,6 +742,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + INIT_CQCOUNT(ep); + ep->rep_ia = ia; + init_waitqueue_head(&ep->rep_connect_wait); ++ INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + + /* + * Create a single cq for receive dto and mw_bind (only ever +@@ -817,6 +818,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + dprintk("RPC: %s: entering, connected is %d\n", + __func__, ep->rep_connected); + ++ cancel_delayed_work_sync(&ep->rep_connect_worker); ++ + if (ia->ri_id->qp) { + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc) +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index 98340a3..c620d13 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -43,6 +43,7 @@ + #include /* wait_queue_head_t, etc */ + #include /* spinlock_t, etc */ + #include /* atomic_t, etc */ ++#include /* struct work_struct */ + + #include /* RDMA connection api */ + #include /* RDMA verbs api */ +@@ -87,6 +88,7 @@ struct rpcrdma_ep { + struct rpc_xprt *rep_xprt; /* for rep_func */ + struct rdma_conn_param rep_remote_cma; + struct sockaddr_storage rep_remote_addr; ++ struct delayed_work rep_connect_worker; + }; + + #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) +@@ -336,6 +338,7 @@ int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, + /* + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c + */ ++void rpcrdma_connect_worker(struct work_struct *); + void rpcrdma_conn_func(struct rpcrdma_ep *); + void rpcrdma_reply_handler(struct rpcrdma_rep *); + +-- +1.7.1 + diff --git 
a/linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch b/linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch new file mode 100644 index 0000000..30307fd --- /dev/null +++ b/linux-next-cherry-picks/0110-xprtrdma-Remove-BOUNCEBUFFERS-memory-registration-mo.patch @@ -0,0 +1,104 @@ +From 03ff8821eb5ed168792667cfc3ddff903e97af99 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:32:26 -0400 +Subject: [PATCH 111/132] xprtrdma: Remove BOUNCEBUFFERS memory registration mode + +Clean up: This memory registration mode is slow and was never +meant for use in production environments. Remove it to reduce +implementation complexity. + +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 11 ----------- + net/sunrpc/xprtrdma/transport.c | 13 ------------- + net/sunrpc/xprtrdma/verbs.c | 5 +---- + 3 files changed, 1 insertions(+), 28 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index c296468..02b2941 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -77,9 +77,6 @@ static const char transfertypes[][12] = { + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk + * elements. Segments are then coalesced when registered, if possible + * within the selected memreg mode. +- * +- * Note, this routine is never called if the connection's memory +- * registration strategy is 0 (bounce buffers). + */ + + static int +@@ -439,14 +436,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + wtype = rpcrdma_noch; + BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); + +- if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && +- (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { +- /* forced to "pure inline"? */ +- dprintk("RPC: %s: too much data (%d/%d) for inline\n", +- __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); +- return -1; +- } +- + hdrlen = 28; /*sizeof *headerp;*/ + padlen = 0; + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 1eb9c46..8c5035a 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -503,18 +503,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) + * If the allocation or registration fails, the RPC framework + * will (doggedly) retry. + */ +- if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == +- RPCRDMA_BOUNCEBUFFERS) { +- /* forced to "pure inline" */ +- dprintk("RPC: %s: too much data (%zd) for inline " +- "(r/w max %d/%d)\n", __func__, size, +- rpcx_to_rdmad(xprt).inline_rsize, +- rpcx_to_rdmad(xprt).inline_wsize); +- size = req->rl_size; +- rpc_exit(task, -EIO); /* fail the operation */ +- rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; +- goto out; +- } + if (task->tk_flags & RPC_TASK_SWAPPER) + nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); + else +@@ -543,7 +531,6 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) + req = nreq; + } + dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); +-out: + req->rl_connect_cookie = 0; /* our reserved value */ + return req->rl_xdr_buf; + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 9cb88f3..4a4e4ea 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -557,7 +557,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + * adapter. 
+ */ + switch (memreg) { +- case RPCRDMA_BOUNCEBUFFERS: + case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; +@@ -778,9 +777,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + + /* Client offers RDMA Read but does not initiate */ + ep->rep_remote_cma.initiator_depth = 0; +- if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS) +- ep->rep_remote_cma.responder_resources = 0; +- else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ++ if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + ep->rep_remote_cma.responder_resources = 32; + else + ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch b/linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch new file mode 100644 index 0000000..fb7158d --- /dev/null +++ b/linux-next-cherry-picks/0111-xprtrdma-Remove-MEMWINDOWS-registration-modes.patch @@ -0,0 +1,455 @@ +From b45ccfd25d506e83d9ecf93d0ac7edf031d35d2f Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:32:34 -0400 +Subject: [PATCH 112/132] xprtrdma: Remove MEMWINDOWS registration modes + +The MEMWINDOWS and MEMWINDOWS_ASYNC memory registration modes were +intended as stop-gap modes before the introduction of FRMR. They +are now considered obsolete. + +MEMWINDOWS_ASYNC is also considered unsafe because it can leave +client memory registered and exposed for an indeterminant time after +each I/O. + +At this point, the MEMWINDOWS modes add needless complexity, so +remove them. + +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 34 +-------- + net/sunrpc/xprtrdma/transport.c | 9 +-- + net/sunrpc/xprtrdma/verbs.c | 165 +------------------------------------- + net/sunrpc/xprtrdma/xprt_rdma.h | 2 - + 4 files changed, 7 insertions(+), 203 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 02b2941..46b5172 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -199,7 +199,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + return 0; + + do { +- /* bind/register the memory, then build chunk from result. */ + int n = rpcrdma_register_external(seg, nsegs, + cur_wchunk != NULL, r_xprt); + if (n <= 0) +@@ -698,16 +697,6 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) + } + + /* +- * This function is called when memory window unbind which we are waiting +- * for completes. Just use rr_func (zeroed by upcall) to signal completion. +- */ +-static void +-rpcrdma_unbind_func(struct rpcrdma_rep *rep) +-{ +- wake_up(&rep->rr_unbind); +-} +- +-/* + * Called as a tasklet to do req/reply match and complete a request + * Errors must result in the RPC task either being awakened, or + * allowed to timeout, to discover the errors at that time. +@@ -721,7 +710,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) + struct rpc_xprt *xprt = rep->rr_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + __be32 *iptr; +- int i, rdmalen, status; ++ int rdmalen, status; + + /* Check status. If bad, signal disconnect and return rep to pool */ + if (rep->rr_len == ~0U) { +@@ -850,27 +839,6 @@ badheader: + break; + } + +- /* If using mw bind, start the deregister process now. 
*/ +- /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ +- if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { +- case RPCRDMA_MEMWINDOWS: +- for (i = 0; req->rl_nchunks-- > 1;) +- i += rpcrdma_deregister_external( +- &req->rl_segments[i], r_xprt, NULL); +- /* Optionally wait (not here) for unbinds to complete */ +- rep->rr_func = rpcrdma_unbind_func; +- (void) rpcrdma_deregister_external(&req->rl_segments[i], +- r_xprt, rep); +- break; +- case RPCRDMA_MEMWINDOWS_ASYNC: +- for (i = 0; req->rl_nchunks--;) +- i += rpcrdma_deregister_external(&req->rl_segments[i], +- r_xprt, NULL); +- break; +- default: +- break; +- } +- + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", + __func__, xprt, rqst, status); + xprt_complete_rqst(rqst->rq_task, status); +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 8c5035a..c23b0c1 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -566,9 +566,7 @@ xprt_rdma_free(void *buffer) + __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); + + /* +- * Finish the deregistration. When using mw bind, this was +- * begun in rpcrdma_reply_handler(). In all other modes, we +- * do it here, in thread context. The process is considered ++ * Finish the deregistration. The process is considered + * complete when the rr_func vector becomes NULL - this + * was put in place during rpcrdma_reply_handler() - the wait + * call below will not block if the dereg is "done". If +@@ -580,11 +578,6 @@ xprt_rdma_free(void *buffer) + &req->rl_segments[i], r_xprt, NULL); + } + +- if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { +- rep->rr_func = NULL; /* abandon the callback */ +- req->rl_reply = NULL; +- } +- + if (req->rl_iov.length == 0) { /* see allocate above */ + struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; + oreq->rl_reply = req->rl_reply; +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 4a4e4ea..304c7ad 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -152,7 +152,7 @@ void rpcrdma_event_process(struct ib_wc *wc) + dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", + __func__, rep, wc->status, wc->opcode, wc->byte_len); + +- if (!rep) /* send or bind completion that we don't care about */ ++ if (!rep) /* send completion that we don't care about */ + return; + + if (IB_WC_SUCCESS != wc->status) { +@@ -197,8 +197,6 @@ void rpcrdma_event_process(struct ib_wc *wc) + } + atomic_set(&rep->rr_buffer->rb_credits, credits); + } +- /* fall through */ +- case IB_WC_BIND_MW: + rpcrdma_schedule_tasklet(rep); + break; + default: +@@ -233,7 +231,7 @@ rpcrdma_cq_poll(struct ib_cq *cq) + /* + * rpcrdma_cq_event_upcall + * +- * This upcall handles recv, send, bind and unbind events. ++ * This upcall handles recv and send events. + * It is reentrant but processes single events in order to maintain + * ordering of receives to keep server credits. 
+ * +@@ -494,16 +492,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + } + + switch (memreg) { +- case RPCRDMA_MEMWINDOWS: +- case RPCRDMA_MEMWINDOWS_ASYNC: +- if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) { +- dprintk("RPC: %s: MEMWINDOWS registration " +- "specified but not supported by adapter, " +- "using slower RPCRDMA_REGISTER\n", +- __func__); +- memreg = RPCRDMA_REGISTER; +- } +- break; + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { + #if RPCRDMA_PERSISTENT_REGISTRATION +@@ -567,16 +555,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + IB_ACCESS_REMOTE_READ; + goto register_setup; + #endif +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- mem_priv = IB_ACCESS_LOCAL_WRITE | +- IB_ACCESS_MW_BIND; +- goto register_setup; + case RPCRDMA_MTHCAFMR: + if (ia->ri_have_dma_lkey) + break; + mem_priv = IB_ACCESS_LOCAL_WRITE; ++#if RPCRDMA_PERSISTENT_REGISTRATION + register_setup: ++#endif + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); + if (IS_ERR(ia->ri_bind_mem)) { + printk(KERN_ALERT "%s: ib_get_dma_mr for " +@@ -699,14 +684,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + } + break; + } +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- /* Add room for mw_binds+unbinds - overkill! */ +- ep->rep_attr.cap.max_send_wr++; +- ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); +- if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) +- return -EINVAL; +- break; + default: + break; + } +@@ -728,14 +705,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + + /* set trigger for requesting send completion */ + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; +- switch (ia->ri_memreg_strategy) { +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- ep->rep_cqinit -= RPCRDMA_MAX_SEGS; +- break; +- default: +- break; +- } + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; + INIT_CQCOUNT(ep); +@@ -743,11 +712,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + +- /* +- * Create a single cq for receive dto and mw_bind (only ever +- * care about unbind, really). Send completions are suppressed. +- * Use single threaded tasklet upcalls to maintain ordering. +- */ + ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, + rpcrdma_cq_async_error_upcall, NULL, + ep->rep_attr.cap.max_recv_wr + +@@ -1020,11 +984,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * + sizeof(struct rpcrdma_mw); + break; +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * +- sizeof(struct rpcrdma_mw); +- break; + default: + break; + } +@@ -1055,11 +1014,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + } + p += cdata->padding; + +- /* +- * Allocate the fmr's, or mw's for mw_bind chunk registration. +- * We "cycle" the mw's in order to minimize rkey reuse, +- * and also reduce unbind-to-bind collision. 
+- */ + INIT_LIST_HEAD(&buf->rb_mws); + r = (struct rpcrdma_mw *)p; + switch (ia->ri_memreg_strategy) { +@@ -1107,21 +1061,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + ++r; + } + break; +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- /* Allocate one extra request's worth, for full cycling */ +- for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { +- r->r.mw = ib_alloc_mw(ia->ri_pd, IB_MW_TYPE_1); +- if (IS_ERR(r->r.mw)) { +- rc = PTR_ERR(r->r.mw); +- dprintk("RPC: %s: ib_alloc_mw" +- " failed %i\n", __func__, rc); +- goto out; +- } +- list_add(&r->mw_list, &buf->rb_mws); +- ++r; +- } +- break; + default: + break; + } +@@ -1170,7 +1109,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + memset(rep, 0, sizeof(struct rpcrdma_rep)); + buf->rb_recv_bufs[i] = rep; + buf->rb_recv_bufs[i]->rr_buffer = buf; +- init_waitqueue_head(&rep->rr_unbind); + + rc = rpcrdma_register_internal(ia, rep->rr_base, + len - offsetof(struct rpcrdma_rep, rr_base), +@@ -1204,7 +1142,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) + + /* clean up in reverse order from create + * 1. recv mr memory (mr free, then kfree) +- * 1a. bind mw memory + * 2. send mr memory (mr free, then kfree) + * 3. padding (if any) [moved to rpcrdma_ep_destroy] + * 4. arrays +@@ -1248,15 +1185,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) + " failed %i\n", + __func__, rc); + break; +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- rc = ib_dealloc_mw(r->r.mw); +- if (rc) +- dprintk("RPC: %s:" +- " ib_dealloc_mw" +- " failed %i\n", +- __func__, rc); +- break; + default: + break; + } +@@ -1331,15 +1259,12 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) + req->rl_niovs = 0; + if (req->rl_reply) { + buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; +- init_waitqueue_head(&req->rl_reply->rr_unbind); + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + case RPCRDMA_MTHCAFMR: +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: + /* + * Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. +@@ -1384,8 +1309,7 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req) + + /* + * Put reply buffers back into pool when not attached to +- * request. This happens in error conditions, and when +- * aborting unbinds. Pre-decrement counter/array index. ++ * request. This happens in error conditions. + */ + void + rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) +@@ -1688,74 +1612,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + } + + static int +-rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg, +- int *nsegs, int writing, struct rpcrdma_ia *ia, +- struct rpcrdma_xprt *r_xprt) +-{ +- int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : +- IB_ACCESS_REMOTE_READ); +- struct ib_mw_bind param; +- int rc; +- +- *nsegs = 1; +- rpcrdma_map_one(ia, seg, writing); +- param.bind_info.mr = ia->ri_bind_mem; +- param.wr_id = 0ULL; /* no send cookie */ +- param.bind_info.addr = seg->mr_dma; +- param.bind_info.length = seg->mr_len; +- param.send_flags = 0; +- param.bind_info.mw_access_flags = mem_priv; +- +- DECR_CQCOUNT(&r_xprt->rx_ep); +- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); +- if (rc) { +- dprintk("RPC: %s: failed ib_bind_mw " +- "%u@0x%llx status %i\n", +- __func__, seg->mr_len, +- (unsigned long long)seg->mr_dma, rc); +- rpcrdma_unmap_one(ia, seg); +- } else { +- seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; +- seg->mr_base = param.bind_info.addr; +- seg->mr_nsegs = 1; +- } +- return rc; +-} +- +-static int +-rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg, +- struct rpcrdma_ia *ia, +- struct rpcrdma_xprt *r_xprt, void **r) +-{ +- struct ib_mw_bind param; +- LIST_HEAD(l); +- int rc; +- +- BUG_ON(seg->mr_nsegs != 1); +- param.bind_info.mr = ia->ri_bind_mem; +- param.bind_info.addr = 0ULL; /* unbind */ +- param.bind_info.length = 0; +- param.bind_info.mw_access_flags = 0; +- if (*r) { +- param.wr_id = (u64) (unsigned long) *r; +- param.send_flags = IB_SEND_SIGNALED; +- INIT_CQCOUNT(&r_xprt->rx_ep); +- } else { +- param.wr_id = 0ULL; +- param.send_flags = 0; +- DECR_CQCOUNT(&r_xprt->rx_ep); +- } +- rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, ¶m); +- rpcrdma_unmap_one(ia, seg); +- if (rc) +- dprintk("RPC: %s: failed ib_(un)bind_mw," +- " status %i\n", __func__, rc); +- else +- *r = NULL; /* will upcall on completion */ +- return rc; +-} +- +-static int + rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, + int *nsegs, int writing, struct rpcrdma_ia *ia) + { +@@ -1845,12 +1701,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); + break; + +- /* Registration using memory windows */ +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt); +- break; +- + /* Default registration each time */ + default: + rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); +@@ -1887,11 +1737,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + rc = rpcrdma_deregister_fmr_external(seg, ia); + break; + +- case RPCRDMA_MEMWINDOWS_ASYNC: +- case RPCRDMA_MEMWINDOWS: +- rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r); +- break; +- + default: + rc = rpcrdma_deregister_default_external(seg, ia); + break; +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index c620d13..bf08ee0 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -127,7 +127,6 @@ struct rpcrdma_rep { + struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ + void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ + struct list_head rr_list; /* tasklet list */ +- wait_queue_head_t rr_unbind; /* optional unbind wait */ + struct ib_sge rr_iov; /* for posting */ + struct ib_mr *rr_handle; /* handle for mem in rr_iov */ + char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ +@@ -162,7 +161,6 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ + struct ib_mr *rl_mr; /* if registered directly */ + struct rpcrdma_mw { /* if registered from region */ + union { +- struct ib_mw *mw; + struct ib_fmr *fmr; + struct { + struct 
ib_fast_reg_page_list *fr_pgl; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch b/linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch new file mode 100644 index 0000000..06c8809 --- /dev/null +++ b/linux-next-cherry-picks/0112-xprtrdma-Remove-REGISTER-memory-registration-mode.patch @@ -0,0 +1,191 @@ +From 0ac531c1832318efa3dc3d723e356a7e09330e80 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:32:43 -0400 +Subject: [PATCH 113/132] xprtrdma: Remove REGISTER memory registration mode + +All kernel RDMA providers except amso1100 support either MTHCAFMR +or FRMR, both of which are faster than REGISTER. amso1100 can +continue to use ALLPHYSICAL. + +The only other ULP consumer in the kernel that uses the reg_phys_mr +verb is Lustre. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 3 +- + net/sunrpc/xprtrdma/verbs.c | 90 ++-------------------------------------- + 2 files changed, 5 insertions(+), 88 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 46b5172..aae1726 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -476,8 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + * on receive. Therefore, we request a reply chunk + * for non-writes wherever feasible and efficient. + */ +- if (wtype == rpcrdma_noch && +- r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) ++ if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; + } + } +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 304c7ad..6bb9a07 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -494,19 +494,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + switch (memreg) { + case RPCRDMA_MTHCAFMR: + if (!ia->ri_id->device->alloc_fmr) { +-#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: MTHCAFMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +-#else +- dprintk("RPC: %s: MTHCAFMR registration " +- "specified but not supported by adapter, " +- "using slower RPCRDMA_REGISTER\n", +- __func__); +- memreg = RPCRDMA_REGISTER; +-#endif + } + break; + case RPCRDMA_FRMR: +@@ -514,19 +506,11 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { +-#if RPCRDMA_PERSISTENT_REGISTRATION + dprintk("RPC: %s: FRMR registration " + "specified but not supported by adapter, " + "using riskier RPCRDMA_ALLPHYSICAL\n", + __func__); + memreg = RPCRDMA_ALLPHYSICAL; +-#else +- dprintk("RPC: %s: FRMR registration " +- "specified but not supported by adapter, " +- "using slower RPCRDMA_REGISTER\n", +- __func__); +- memreg = RPCRDMA_REGISTER; +-#endif + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, +@@ -545,7 +529,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + * adapter. 
+ */ + switch (memreg) { +- case RPCRDMA_REGISTER: + case RPCRDMA_FRMR: + break; + #if RPCRDMA_PERSISTENT_REGISTRATION +@@ -565,11 +548,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); + if (IS_ERR(ia->ri_bind_mem)) { + printk(KERN_ALERT "%s: ib_get_dma_mr for " +- "phys register failed with %lX\n\t" +- "Will continue with degraded performance\n", ++ "phys register failed with %lX\n", + __func__, PTR_ERR(ia->ri_bind_mem)); +- memreg = RPCRDMA_REGISTER; +- ia->ri_bind_mem = NULL; ++ rc = -ENOMEM; ++ goto out2; + } + break; + default: +@@ -1611,67 +1593,6 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, + return rc; + } + +-static int +-rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg, +- int *nsegs, int writing, struct rpcrdma_ia *ia) +-{ +- int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE : +- IB_ACCESS_REMOTE_READ); +- struct rpcrdma_mr_seg *seg1 = seg; +- struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; +- int len, i, rc = 0; +- +- if (*nsegs > RPCRDMA_MAX_DATA_SEGS) +- *nsegs = RPCRDMA_MAX_DATA_SEGS; +- for (len = 0, i = 0; i < *nsegs;) { +- rpcrdma_map_one(ia, seg, writing); +- ipb[i].addr = seg->mr_dma; +- ipb[i].size = seg->mr_len; +- len += seg->mr_len; +- ++seg; +- ++i; +- /* Check for holes */ +- if ((i < *nsegs && offset_in_page(seg->mr_offset)) || +- offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) +- break; +- } +- seg1->mr_base = seg1->mr_dma; +- seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, +- ipb, i, mem_priv, &seg1->mr_base); +- if (IS_ERR(seg1->mr_chunk.rl_mr)) { +- rc = PTR_ERR(seg1->mr_chunk.rl_mr); +- dprintk("RPC: %s: failed ib_reg_phys_mr " +- "%u@0x%llx (%d)... status %i\n", +- __func__, len, +- (unsigned long long)seg1->mr_dma, i, rc); +- while (i--) +- rpcrdma_unmap_one(ia, --seg); +- } else { +- seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; +- seg1->mr_nsegs = i; +- seg1->mr_len = len; +- } +- *nsegs = i; +- return rc; +-} +- +-static int +-rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg, +- struct rpcrdma_ia *ia) +-{ +- struct rpcrdma_mr_seg *seg1 = seg; +- int rc; +- +- rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); +- seg1->mr_chunk.rl_mr = NULL; +- while (seg1->mr_nsegs--) +- rpcrdma_unmap_one(ia, seg++); +- if (rc) +- dprintk("RPC: %s: failed ib_dereg_mr," +- " status %i\n", __func__, rc); +- return rc; +-} +- + int + rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + int nsegs, int writing, struct rpcrdma_xprt *r_xprt) +@@ -1701,10 +1622,8 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); + break; + +- /* Default registration each time */ + default: +- rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia); +- break; ++ return -1; + } + if (rc) + return -1; +@@ -1738,7 +1657,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + break; + + default: +- rc = rpcrdma_deregister_default_external(seg, ia); + break; + } + if (r) { +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch b/linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch new file mode 100644 index 0000000..0743d58 --- /dev/null +++ b/linux-next-cherry-picks/0113-xprtrdma-Fall-back-to-MTHCAFMR-when-FRMR-is-not-supp.patch @@ -0,0 +1,73 @@ +From f10eafd3a6ce9da7e96999c124b643ea6c4921f3 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:32:51 -0400 +Subject: [PATCH 
114/132] xprtrdma: Fall back to MTHCAFMR when FRMR is not supported + +An audit of in-kernel RDMA providers that do not support the FRMR +memory registration shows that several of them support MTHCAFMR. +Prefer MTHCAFMR when FRMR is not supported. + +If MTHCAFMR is not supported, only then choose ALLPHYSICAL. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 31 +++++++++++++++---------------- + 1 files changed, 15 insertions(+), 16 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 6bb9a07..a352798 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -491,33 +491,32 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey; + } + +- switch (memreg) { +- case RPCRDMA_MTHCAFMR: +- if (!ia->ri_id->device->alloc_fmr) { +- dprintk("RPC: %s: MTHCAFMR registration " +- "specified but not supported by adapter, " +- "using riskier RPCRDMA_ALLPHYSICAL\n", +- __func__); +- memreg = RPCRDMA_ALLPHYSICAL; +- } +- break; +- case RPCRDMA_FRMR: ++ if (memreg == RPCRDMA_FRMR) { + /* Requires both frmr reg and local dma lkey */ + if ((devattr.device_cap_flags & + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + dprintk("RPC: %s: FRMR registration " +- "specified but not supported by adapter, " +- "using riskier RPCRDMA_ALLPHYSICAL\n", +- __func__); +- memreg = RPCRDMA_ALLPHYSICAL; ++ "not supported by HCA\n", __func__); ++ memreg = RPCRDMA_MTHCAFMR; + } else { + /* Mind the ia limit on FRMR page list depth */ + ia->ri_max_frmr_depth = min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + devattr.max_fast_reg_page_list_len); + } +- break; ++ } ++ if (memreg == RPCRDMA_MTHCAFMR) { ++ if (!ia->ri_id->device->alloc_fmr) { ++ dprintk("RPC: %s: MTHCAFMR registration " ++ "not supported by HCA\n", __func__); ++#if RPCRDMA_PERSISTENT_REGISTRATION ++ memreg = RPCRDMA_ALLPHYSICAL; ++#else ++ rc = -EINVAL; ++ goto out2; ++#endif ++ } + } + + /* +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch b/linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch new file mode 100644 index 0000000..8052dfa --- /dev/null +++ b/linux-next-cherry-picks/0114-xprtrdma-mount-reports-Invalid-mount-option-if-memre.patch @@ -0,0 +1,46 @@ +From cdd9ade711599e7672a635add0406080856f8b92 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:00 -0400 +Subject: [PATCH 115/132] xprtrdma: mount reports "Invalid mount option" if memreg mode not supported + +If the selected memory registration mode is not supported by the +underlying provider/HCA, the NFS mount command reports that there was +an invalid mount option, and fails. This is misleading. + +Reporting a problem allocating memory is a lot closer to the truth. 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 8 ++++---- + 1 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index a352798..7c7e9b4 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -513,7 +513,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + #if RPCRDMA_PERSISTENT_REGISTRATION + memreg = RPCRDMA_ALLPHYSICAL; + #else +- rc = -EINVAL; ++ rc = -ENOMEM; + goto out2; + #endif + } +@@ -554,9 +554,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) + } + break; + default: +- printk(KERN_ERR "%s: invalid memory registration mode %d\n", +- __func__, memreg); +- rc = -EINVAL; ++ printk(KERN_ERR "RPC: Unsupported memory " ++ "registration mode: %d\n", memreg); ++ rc = -ENOMEM; + goto out2; + } + dprintk("RPC: %s: memory registration strategy is %d\n", +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch b/linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch new file mode 100644 index 0000000..883ab79 --- /dev/null +++ b/linux-next-cherry-picks/0115-xprtrdma-Simplify-rpcrdma_deregister_external-synops.patch @@ -0,0 +1,86 @@ +From 13c9ff8f673862b69e795ea99a237b461c557eb3 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:08 -0400 +Subject: [PATCH 116/132] xprtrdma: Simplify rpcrdma_deregister_external() synopsis + +Clean up: All remaining callers of rpcrdma_deregister_external() +pass NULL as the last argument, so remove that argument. + +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- + net/sunrpc/xprtrdma/transport.c | 2 +- + net/sunrpc/xprtrdma/verbs.c | 8 +------- + net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- + 4 files changed, 4 insertions(+), 10 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index aae1726..436d229 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -270,7 +270,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + out: + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( +- &req->rl_segments[pos], r_xprt, NULL); ++ &req->rl_segments[pos], r_xprt); + return 0; + } + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index c23b0c1..430cabb 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -575,7 +575,7 @@ xprt_rdma_free(void *buffer) + for (i = 0; req->rl_nchunks;) { + --req->rl_nchunks; + i += rpcrdma_deregister_external( +- &req->rl_segments[i], r_xprt, NULL); ++ &req->rl_segments[i], r_xprt); + } + + if (req->rl_iov.length == 0) { /* see allocate above */ +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 7c7e9b4..0cbc83c 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -1632,7 +1632,7 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, + + int + rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, +- struct rpcrdma_xprt *r_xprt, void *r) ++ struct rpcrdma_xprt *r_xprt) + { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + int nsegs = seg->mr_nsegs, rc; +@@ -1658,12 +1658,6 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + default: + break; + } +- if (r) { +- struct rpcrdma_rep *rep = r; +- void (*func)(struct 
rpcrdma_rep *) = rep->rr_func; +- rep->rr_func = NULL; +- func(rep); /* dereg done, callback now */ +- } + return nsegs; + } + +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index bf08ee0..3f44d6a 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -331,7 +331,7 @@ int rpcrdma_deregister_internal(struct rpcrdma_ia *, + int rpcrdma_register_external(struct rpcrdma_mr_seg *, + int, int, struct rpcrdma_xprt *); + int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, +- struct rpcrdma_xprt *, void *); ++ struct rpcrdma_xprt *); + + /* + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch b/linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch new file mode 100644 index 0000000..bf5a979 --- /dev/null +++ b/linux-next-cherry-picks/0116-xprtrdma-Make-rpcrdma_ep_destroy-return-void.patch @@ -0,0 +1,95 @@ +From 7f1d54191ed6fa0f79f584fe3ebf6519738e817f Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:16 -0400 +Subject: [PATCH 117/132] xprtrdma: Make rpcrdma_ep_destroy() return void + +Clean up: rpcrdma_ep_destroy() returns a value that is used +only to print a debugging message. rpcrdma_ep_destroy() already +prints debugging messages in all error cases. + +Make rpcrdma_ep_destroy() return void instead. + +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/transport.c | 8 ++------ + net/sunrpc/xprtrdma/verbs.c | 7 +------ + net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- + 3 files changed, 4 insertions(+), 13 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 430cabb..d18b2a3 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -229,7 +229,6 @@ static void + xprt_rdma_destroy(struct rpc_xprt *xprt) + { + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); +- int rc; + + dprintk("RPC: %s: called\n", __func__); + +@@ -238,10 +237,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) + xprt_clear_connected(xprt); + + rpcrdma_buffer_destroy(&r_xprt->rx_buf); +- rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); +- if (rc) +- dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", +- __func__, rc); ++ rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ia_close(&r_xprt->rx_ia); + + xprt_rdma_free_addresses(xprt); +@@ -391,7 +387,7 @@ out4: + xprt_rdma_free_addresses(xprt); + rc = -EINVAL; + out3: +- (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); ++ rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); + out2: + rpcrdma_ia_close(&new_xprt->rx_ia); + out1: +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 0cbc83c..edc951e 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -748,11 +748,8 @@ out1: + * Disconnect and destroy endpoint. After this, the only + * valid operations on the ep are to free it (if dynamically + * allocated) or re-create it. +- * +- * The caller's error handling must be sure to not leak the endpoint +- * if this function fails. 
+ */ +-int ++void + rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + { + int rc; +@@ -782,8 +779,6 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, rc); +- +- return rc; + } + + /* +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index 3f44d6a..362a19d 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -301,7 +301,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); + */ + int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *); +-int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); ++void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); + int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); + int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); + +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch b/linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch new file mode 100644 index 0000000..8c813af --- /dev/null +++ b/linux-next-cherry-picks/0117-xprtrdma-Split-the-completion-queue.patch @@ -0,0 +1,395 @@ +From fc66448549bbb77f2f1a38b270ab2d6b6a22da33 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:25 -0400 +Subject: [PATCH 118/132] xprtrdma: Split the completion queue + +The current CQ handler uses the ib_wc.opcode field to distinguish +between event types. However, the contents of that field are not +reliable if the completion status is not IB_WC_SUCCESS. + +When an error completion occurs on a send event, the CQ handler +schedules a tasklet with something that is not a struct rpcrdma_rep. +This is never correct behavior, and sometimes it results in a panic. + +To resolve this issue, split the completion queue into a send CQ and +a receive CQ. The send CQ handler now handles only struct rpcrdma_mw +wr_id's, and the receive CQ handler now handles only struct +rpcrdma_rep wr_id's. 
+ +Fix suggested by Shirley Ma + +Reported-by: Rafael Reiter +Fixes: 5c635e09cec0feeeb310968e51dad01040244851 +BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=73211 +Signed-off-by: Chuck Lever +Tested-by: Klemens Senn +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 228 +++++++++++++++++++++++---------------- + net/sunrpc/xprtrdma/xprt_rdma.h | 1 - + 2 files changed, 137 insertions(+), 92 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index edc951e..af2d097 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -142,96 +142,115 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) + } + } + +-static inline +-void rpcrdma_event_process(struct ib_wc *wc) ++static void ++rpcrdma_sendcq_process_wc(struct ib_wc *wc) + { +- struct rpcrdma_mw *frmr; +- struct rpcrdma_rep *rep = +- (struct rpcrdma_rep *)(unsigned long) wc->wr_id; ++ struct rpcrdma_mw *frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + +- dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", +- __func__, rep, wc->status, wc->opcode, wc->byte_len); ++ dprintk("RPC: %s: frmr %p status %X opcode %d\n", ++ __func__, frmr, wc->status, wc->opcode); + +- if (!rep) /* send completion that we don't care about */ ++ if (wc->wr_id == 0ULL) + return; +- +- if (IB_WC_SUCCESS != wc->status) { +- dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", +- __func__, wc->opcode, wc->status); +- rep->rr_len = ~0U; +- if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) +- rpcrdma_schedule_tasklet(rep); ++ if (wc->status != IB_WC_SUCCESS) + return; +- } + +- switch (wc->opcode) { +- case IB_WC_FAST_REG_MR: +- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; ++ if (wc->opcode == IB_WC_FAST_REG_MR) + frmr->r.frmr.state = FRMR_IS_VALID; +- break; +- case IB_WC_LOCAL_INV: +- frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; ++ else if (wc->opcode == IB_WC_LOCAL_INV) + frmr->r.frmr.state = FRMR_IS_INVALID; +- break; +- case IB_WC_RECV: +- rep->rr_len = wc->byte_len; +- ib_dma_sync_single_for_cpu( +- rdmab_to_ia(rep->rr_buffer)->ri_id->device, +- rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); +- /* Keep (only) the most recent credits, after check validity */ +- if (rep->rr_len >= 16) { +- struct rpcrdma_msg *p = +- (struct rpcrdma_msg *) rep->rr_base; +- unsigned int credits = ntohl(p->rm_credit); +- if (credits == 0) { +- dprintk("RPC: %s: server" +- " dropped credits to 0!\n", __func__); +- /* don't deadlock */ +- credits = 1; +- } else if (credits > rep->rr_buffer->rb_max_requests) { +- dprintk("RPC: %s: server" +- " over-crediting: %d (%d)\n", +- __func__, credits, +- rep->rr_buffer->rb_max_requests); +- credits = rep->rr_buffer->rb_max_requests; +- } +- atomic_set(&rep->rr_buffer->rb_credits, credits); +- } +- rpcrdma_schedule_tasklet(rep); +- break; +- default: +- dprintk("RPC: %s: unexpected WC event %X\n", +- __func__, wc->opcode); +- break; +- } + } + +-static inline int +-rpcrdma_cq_poll(struct ib_cq *cq) ++static int ++rpcrdma_sendcq_poll(struct ib_cq *cq) + { + struct ib_wc wc; + int rc; + +- for (;;) { +- rc = ib_poll_cq(cq, 1, &wc); +- if (rc < 0) { +- dprintk("RPC: %s: ib_poll_cq failed %i\n", +- __func__, rc); +- return rc; +- } +- if (rc == 0) +- break; ++ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) ++ rpcrdma_sendcq_process_wc(&wc); ++ return rc; ++} + +- rpcrdma_event_process(&wc); ++/* ++ * Handle send, fast_reg_mr, and local_inv completions. 
++ * ++ * Send events are typically suppressed and thus do not result ++ * in an upcall. Occasionally one is signaled, however. This ++ * prevents the provider's completion queue from wrapping and ++ * losing a completion. ++ */ ++static void ++rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) ++{ ++ int rc; ++ ++ rc = rpcrdma_sendcq_poll(cq); ++ if (rc) { ++ dprintk("RPC: %s: ib_poll_cq failed: %i\n", ++ __func__, rc); ++ return; + } + +- return 0; ++ rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); ++ if (rc) { ++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", ++ __func__, rc); ++ return; ++ } ++ ++ rpcrdma_sendcq_poll(cq); ++} ++ ++static void ++rpcrdma_recvcq_process_wc(struct ib_wc *wc) ++{ ++ struct rpcrdma_rep *rep = ++ (struct rpcrdma_rep *)(unsigned long)wc->wr_id; ++ ++ dprintk("RPC: %s: rep %p status %X opcode %X length %u\n", ++ __func__, rep, wc->status, wc->opcode, wc->byte_len); ++ ++ if (wc->status != IB_WC_SUCCESS) { ++ rep->rr_len = ~0U; ++ goto out_schedule; ++ } ++ if (wc->opcode != IB_WC_RECV) ++ return; ++ ++ rep->rr_len = wc->byte_len; ++ ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device, ++ rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); ++ ++ if (rep->rr_len >= 16) { ++ struct rpcrdma_msg *p = (struct rpcrdma_msg *)rep->rr_base; ++ unsigned int credits = ntohl(p->rm_credit); ++ ++ if (credits == 0) ++ credits = 1; /* don't deadlock */ ++ else if (credits > rep->rr_buffer->rb_max_requests) ++ credits = rep->rr_buffer->rb_max_requests; ++ atomic_set(&rep->rr_buffer->rb_credits, credits); ++ } ++ ++out_schedule: ++ rpcrdma_schedule_tasklet(rep); ++} ++ ++static int ++rpcrdma_recvcq_poll(struct ib_cq *cq) ++{ ++ struct ib_wc wc; ++ int rc; ++ ++ while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) ++ rpcrdma_recvcq_process_wc(&wc); ++ return rc; + } + + /* +- * rpcrdma_cq_event_upcall ++ * Handle receive completions. + * +- * This upcall handles recv and send events. + * It is reentrant but processes single events in order to maintain + * ordering of receives to keep server credits. + * +@@ -240,26 +259,27 @@ rpcrdma_cq_poll(struct ib_cq *cq) + * connection shutdown. That is, the structures required for + * the completion of the reply handler must remain intact until + * all memory has been reclaimed. +- * +- * Note that send events are suppressed and do not result in an upcall. 
+ */ + static void +-rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) ++rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) + { + int rc; + +- rc = rpcrdma_cq_poll(cq); +- if (rc) ++ rc = rpcrdma_recvcq_poll(cq); ++ if (rc) { ++ dprintk("RPC: %s: ib_poll_cq failed: %i\n", ++ __func__, rc); + return; ++ } + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc) { +- dprintk("RPC: %s: ib_req_notify_cq failed %i\n", ++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; + } + +- rpcrdma_cq_poll(cq); ++ rpcrdma_recvcq_poll(cq); + } + + #ifdef RPC_DEBUG +@@ -610,6 +630,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata) + { + struct ib_device_attr devattr; ++ struct ib_cq *sendcq, *recvcq; + int rc, err; + + rc = ib_query_device(ia->ri_id->device, &devattr); +@@ -685,7 +706,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + ep->rep_attr.cap.max_recv_sge); + + /* set trigger for requesting send completion */ +- ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; ++ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; + INIT_CQCOUNT(ep); +@@ -693,26 +714,43 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + init_waitqueue_head(&ep->rep_connect_wait); + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + +- ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, ++ sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, + rpcrdma_cq_async_error_upcall, NULL, +- ep->rep_attr.cap.max_recv_wr + + ep->rep_attr.cap.max_send_wr + 1, 0); +- if (IS_ERR(ep->rep_cq)) { +- rc = PTR_ERR(ep->rep_cq); +- dprintk("RPC: %s: ib_create_cq failed: %i\n", ++ if (IS_ERR(sendcq)) { ++ rc = PTR_ERR(sendcq); ++ dprintk("RPC: %s: failed to create send CQ: %i\n", + __func__, rc); + goto out1; + } + +- rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); ++ rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP); + if (rc) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + goto out2; + } + +- ep->rep_attr.send_cq = ep->rep_cq; +- ep->rep_attr.recv_cq = ep->rep_cq; ++ recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, ++ rpcrdma_cq_async_error_upcall, NULL, ++ ep->rep_attr.cap.max_recv_wr + 1, 0); ++ if (IS_ERR(recvcq)) { ++ rc = PTR_ERR(recvcq); ++ dprintk("RPC: %s: failed to create recv CQ: %i\n", ++ __func__, rc); ++ goto out2; ++ } ++ ++ rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); ++ if (rc) { ++ dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", ++ __func__, rc); ++ ib_destroy_cq(recvcq); ++ goto out2; ++ } ++ ++ ep->rep_attr.send_cq = sendcq; ++ ep->rep_attr.recv_cq = recvcq; + + /* Initialize cma parameters */ + +@@ -734,7 +772,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + return 0; + + out2: +- err = ib_destroy_cq(ep->rep_cq); ++ err = ib_destroy_cq(sendcq); + if (err) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + __func__, err); +@@ -774,8 +812,14 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + ep->rep_pad_mr = NULL; + } + +- rpcrdma_clean_cq(ep->rep_cq); +- rc = ib_destroy_cq(ep->rep_cq); ++ rpcrdma_clean_cq(ep->rep_attr.recv_cq); ++ rc = ib_destroy_cq(ep->rep_attr.recv_cq); ++ if (rc) ++ dprintk("RPC: %s: ib_destroy_cq returned %i\n", ++ __func__, rc); ++ ++ rpcrdma_clean_cq(ep->rep_attr.send_cq); ++ rc = ib_destroy_cq(ep->rep_attr.send_cq); + if (rc) + dprintk("RPC: %s: ib_destroy_cq returned %i\n", + 
__func__, rc); +@@ -798,7 +842,9 @@ retry: + if (rc && rc != -ENOTCONN) + dprintk("RPC: %s: rpcrdma_ep_disconnect" + " status %i\n", __func__, rc); +- rpcrdma_clean_cq(ep->rep_cq); ++ ++ rpcrdma_clean_cq(ep->rep_attr.recv_cq); ++ rpcrdma_clean_cq(ep->rep_attr.send_cq); + + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + id = rpcrdma_create_id(xprt, ia, +@@ -907,7 +953,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + { + int rc; + +- rpcrdma_clean_cq(ep->rep_cq); ++ rpcrdma_clean_cq(ep->rep_attr.recv_cq); ++ rpcrdma_clean_cq(ep->rep_attr.send_cq); + rc = rdma_disconnect(ia->ri_id); + if (!rc) { + /* returns without wait if not connected */ +@@ -1727,7 +1774,6 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, + ib_dma_sync_single_for_cpu(ia->ri_id->device, + rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); + +- DECR_CQCOUNT(ep); + rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); + + if (rc) +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index 362a19d..334ab6e 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -79,7 +79,6 @@ struct rpcrdma_ep { + int rep_cqinit; + int rep_connected; + struct rpcrdma_ia *rep_ia; +- struct ib_cq *rep_cq; + struct ib_qp_init_attr rep_attr; + wait_queue_head_t rep_connect_wait; + struct ib_sge rep_pad; /* holds zeroed pad */ +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch b/linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch new file mode 100644 index 0000000..115ab68 --- /dev/null +++ b/linux-next-cherry-picks/0118-xprtrmda-Reduce-lock-contention-in-completion-handle.patch @@ -0,0 +1,50 @@ +From 7f23f6f6e388d2003c4ecf5d558f3c2191e12530 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:34 -0400 +Subject: [PATCH 119/132] xprtrmda: Reduce lock contention in completion handlers + +Skip the ib_poll_cq() after re-arming, if the provider knows there +are no additional items waiting. (Have a look at commit ed23a727 for +more details). 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 14 ++++++++++---- + 1 files changed, 10 insertions(+), 4 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index af2d097..c7d5281 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -192,8 +192,11 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) + return; + } + +- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +- if (rc) { ++ rc = ib_req_notify_cq(cq, ++ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); ++ if (rc == 0) ++ return; ++ if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; +@@ -272,8 +275,11 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) + return; + } + +- rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); +- if (rc) { ++ rc = ib_req_notify_cq(cq, ++ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); ++ if (rc == 0) ++ return; ++ if (rc < 0) { + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", + __func__, rc); + return; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch b/linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch new file mode 100644 index 0000000..49703e6 --- /dev/null +++ b/linux-next-cherry-picks/0119-xprtrmda-Reduce-calls-to-ib_poll_cq-in-completion-ha.patch @@ -0,0 +1,165 @@ +From 1c00dd0776543608e13c74a527660cb8cd28a74f Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:42 -0400 +Subject: [PATCH 120/132] xprtrmda: Reduce calls to ib_poll_cq() in completion handlers + +Change the completion handlers to grab up to 16 items per +ib_poll_cq() call. No extra ib_poll_cq() is needed if fewer than 16 +items are returned. 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 56 ++++++++++++++++++++++++++------------ + net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++ + 2 files changed, 42 insertions(+), 18 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index c7d5281..b8caee9 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -162,14 +162,23 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) + } + + static int +-rpcrdma_sendcq_poll(struct ib_cq *cq) ++rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + { +- struct ib_wc wc; +- int rc; ++ struct ib_wc *wcs; ++ int count, rc; + +- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) +- rpcrdma_sendcq_process_wc(&wc); +- return rc; ++ do { ++ wcs = ep->rep_send_wcs; ++ ++ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); ++ if (rc <= 0) ++ return rc; ++ ++ count = rc; ++ while (count-- > 0) ++ rpcrdma_sendcq_process_wc(wcs++); ++ } while (rc == RPCRDMA_POLLSIZE); ++ return 0; + } + + /* +@@ -183,9 +192,10 @@ rpcrdma_sendcq_poll(struct ib_cq *cq) + static void + rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) + { ++ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + +- rc = rpcrdma_sendcq_poll(cq); ++ rc = rpcrdma_sendcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); +@@ -202,7 +212,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) + return; + } + +- rpcrdma_sendcq_poll(cq); ++ rpcrdma_sendcq_poll(cq, ep); + } + + static void +@@ -241,14 +251,23 @@ out_schedule: + } + + static int +-rpcrdma_recvcq_poll(struct ib_cq *cq) ++rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + { +- struct ib_wc wc; +- int rc; ++ struct ib_wc *wcs; ++ int count, rc; + +- while ((rc = ib_poll_cq(cq, 1, &wc)) == 1) +- rpcrdma_recvcq_process_wc(&wc); +- return rc; ++ do { ++ wcs = ep->rep_recv_wcs; ++ ++ rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); ++ if (rc <= 0) ++ return rc; ++ ++ count = rc; ++ while (count-- > 0) ++ rpcrdma_recvcq_process_wc(wcs++); ++ } while (rc == RPCRDMA_POLLSIZE); ++ return 0; + } + + /* +@@ -266,9 +285,10 @@ rpcrdma_recvcq_poll(struct ib_cq *cq) + static void + rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) + { ++ struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; + int rc; + +- rc = rpcrdma_recvcq_poll(cq); ++ rc = rpcrdma_recvcq_poll(cq, ep); + if (rc) { + dprintk("RPC: %s: ib_poll_cq failed: %i\n", + __func__, rc); +@@ -285,7 +305,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) + return; + } + +- rpcrdma_recvcq_poll(cq); ++ rpcrdma_recvcq_poll(cq, ep); + } + + #ifdef RPC_DEBUG +@@ -721,7 +741,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + + sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall, +- rpcrdma_cq_async_error_upcall, NULL, ++ rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_send_wr + 1, 0); + if (IS_ERR(sendcq)) { + rc = PTR_ERR(sendcq); +@@ -738,7 +758,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, + } + + recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall, +- rpcrdma_cq_async_error_upcall, NULL, ++ rpcrdma_cq_async_error_upcall, ep, + ep->rep_attr.cap.max_recv_wr + 1, 0); + if (IS_ERR(recvcq)) { + rc = PTR_ERR(recvcq); +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index 334ab6e..cb4c882 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ 
b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -74,6 +74,8 @@ struct rpcrdma_ia { + * RDMA Endpoint -- one per transport instance + */ + ++#define RPCRDMA_POLLSIZE (16) ++ + struct rpcrdma_ep { + atomic_t rep_cqcount; + int rep_cqinit; +@@ -88,6 +90,8 @@ struct rpcrdma_ep { + struct rdma_conn_param rep_remote_cma; + struct sockaddr_storage rep_remote_addr; + struct delayed_work rep_connect_worker; ++ struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; ++ struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; + }; + + #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch b/linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch new file mode 100644 index 0000000..9b7c86e --- /dev/null +++ b/linux-next-cherry-picks/0120-xprtrdma-Limit-work-done-by-completion-handler.patch @@ -0,0 +1,79 @@ +From 8301a2c047cc25dabd645e5590c1db0ead4c5af4 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:51 -0400 +Subject: [PATCH 121/132] xprtrdma: Limit work done by completion handler + +Sagi Grimberg points out that a steady +stream of CQ events could starve other work because of the boundless +loop pooling in rpcrdma_{send,recv}_poll(). + +Instead of a (potentially infinite) while loop, return after +collecting a budgeted number of completions. + +Signed-off-by: Chuck Lever +Acked-by: Sagi Grimberg +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 10 ++++++---- + net/sunrpc/xprtrdma/xprt_rdma.h | 1 + + 2 files changed, 7 insertions(+), 4 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index b8caee9..1d08366 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -165,8 +165,9 @@ static int + rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + { + struct ib_wc *wcs; +- int count, rc; ++ int budget, count, rc; + ++ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_send_wcs; + +@@ -177,7 +178,7 @@ rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + count = rc; + while (count-- > 0) + rpcrdma_sendcq_process_wc(wcs++); +- } while (rc == RPCRDMA_POLLSIZE); ++ } while (rc == RPCRDMA_POLLSIZE && --budget); + return 0; + } + +@@ -254,8 +255,9 @@ static int + rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + { + struct ib_wc *wcs; +- int count, rc; ++ int budget, count, rc; + ++ budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; + do { + wcs = ep->rep_recv_wcs; + +@@ -266,7 +268,7 @@ rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) + count = rc; + while (count-- > 0) + rpcrdma_recvcq_process_wc(wcs++); +- } while (rc == RPCRDMA_POLLSIZE); ++ } while (rc == RPCRDMA_POLLSIZE && --budget); + return 0; + } + +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index cb4c882..0c3b88e 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -74,6 +74,7 @@ struct rpcrdma_ia { + * RDMA Endpoint -- one per transport instance + */ + ++#define RPCRDMA_WC_BUDGET (128) + #define RPCRDMA_POLLSIZE (16) + + struct rpcrdma_ep { +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch b/linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch new file mode 100644 index 0000000..cb74e73 --- /dev/null +++ b/linux-next-cherry-picks/0121-xprtrdma-Reduce-the-number-of-hardway-buffer-allocat.patch @@ -0,0 
+1,128 @@ +From 65866f8259851cea5e356d2fd46fc37a4e26330e Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:33:59 -0400 +Subject: [PATCH 122/132] xprtrdma: Reduce the number of hardway buffer allocations + +While marshaling an RPC/RDMA request, the inline_{rsize,wsize} +settings determine whether an inline request is used, or whether +read or write chunks lists are built. The current default value of +these settings is 1024. Any RPC request smaller than 1024 bytes is +sent to the NFS server completely inline. + +rpcrdma_buffer_create() allocates and pre-registers a set of RPC +buffers for each transport instance, also based on the inline rsize +and wsize settings. + +RPC/RDMA requests and replies are built in these buffers. However, +if an RPC/RDMA request is expected to be larger than 1024, a buffer +has to be allocated and registered for that RPC, and deregistered +and released when the RPC is complete. This is known has a +"hardway allocation." + +Since the introduction of NFSv4, the size of RPC requests has become +larger, and hardway allocations are thus more frequent. Hardway +allocations are significant overhead, and they waste the existing +RPC buffers pre-allocated by rpcrdma_buffer_create(). + +We'd like fewer hardway allocations. + +Increasing the size of the pre-registered buffers is the most direct +way to do this. However, a blanket increase of the inline thresholds +has interoperability consequences. + +On my 64-bit system, rpcrdma_buffer_create() requests roughly 7000 +bytes for each RPC request buffer, using kmalloc(). Due to internal +fragmentation, this wastes nearly 1200 bytes because kmalloc() +already returns an 8192-byte piece of memory for a 7000-byte +allocation request, though the extra space remains unused. + +So let's round up the size of the pre-allocated buffers, and make +use of the unused space in the kmalloc'd memory. + +This change reduces the amount of hardway allocated memory for an +NFSv4 general connectathon run from 1322092 to 9472 bytes (99%). + +Signed-off-by: Chuck Lever +Tested-by: Steve Wise +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 25 +++++++++++++------------ + 1 files changed, 13 insertions(+), 12 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 1d08366..c80995a 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -50,6 +50,7 @@ + #include + #include /* for Tavor hack below */ + #include ++#include + + #include "xprt_rdma.h" + +@@ -1005,7 +1006,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) + { + char *p; +- size_t len; ++ size_t len, rlen, wlen; + int i, rc; + struct rpcrdma_mw *r; + +@@ -1120,16 +1121,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + * Allocate/init the request/reply buffers. Doing this + * using kmalloc for now -- one for each buf. + */ ++ wlen = 1 << fls(cdata->inline_wsize + sizeof(struct rpcrdma_req)); ++ rlen = 1 << fls(cdata->inline_rsize + sizeof(struct rpcrdma_rep)); ++ dprintk("RPC: %s: wlen = %zu, rlen = %zu\n", ++ __func__, wlen, rlen); ++ + for (i = 0; i < buf->rb_max_requests; i++) { + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + +- len = cdata->inline_wsize + sizeof(struct rpcrdma_req); +- /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! 
*/ +- /* Typical ~2400b, so rounding up saves work later */ +- if (len < 4096) +- len = 4096; +- req = kmalloc(len, GFP_KERNEL); ++ req = kmalloc(wlen, GFP_KERNEL); + if (req == NULL) { + dprintk("RPC: %s: request buffer %d alloc" + " failed\n", __func__, i); +@@ -1141,16 +1142,16 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + buf->rb_send_bufs[i]->rl_buffer = buf; + + rc = rpcrdma_register_internal(ia, req->rl_base, +- len - offsetof(struct rpcrdma_req, rl_base), ++ wlen - offsetof(struct rpcrdma_req, rl_base), + &buf->rb_send_bufs[i]->rl_handle, + &buf->rb_send_bufs[i]->rl_iov); + if (rc) + goto out; + +- buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); ++ buf->rb_send_bufs[i]->rl_size = wlen - ++ sizeof(struct rpcrdma_req); + +- len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); +- rep = kmalloc(len, GFP_KERNEL); ++ rep = kmalloc(rlen, GFP_KERNEL); + if (rep == NULL) { + dprintk("RPC: %s: reply buffer %d alloc failed\n", + __func__, i); +@@ -1162,7 +1163,7 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, + buf->rb_recv_bufs[i]->rr_buffer = buf; + + rc = rpcrdma_register_internal(ia, rep->rr_base, +- len - offsetof(struct rpcrdma_rep, rr_base), ++ rlen - offsetof(struct rpcrdma_rep, rr_base), + &buf->rb_recv_bufs[i]->rr_handle, + &buf->rb_recv_bufs[i]->rr_iov); + if (rc) +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch b/linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch new file mode 100644 index 0000000..3581bee --- /dev/null +++ b/linux-next-cherry-picks/0122-xprtrdma-Ensure-ia-ri_id-qp-is-not-NULL-when-reconne.patch @@ -0,0 +1,94 @@ +From ec62f40d3505a643497d105c297093bb90afd44e Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:34:07 -0400 +Subject: [PATCH 123/132] xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting + +Devesh Sharma reports that after a +disconnect, his HCA is failing to create a fresh QP, leaving +ia_ri->ri_id->qp set to NULL. But xprtrdma still allows RPCs to +wake up and post LOCAL_INV as they exit, causing an oops. + +rpcrdma_ep_connect() is allowing the wake-up by leaking the QP +creation error code (-EPERM in this case) to the RPC client's +generic layer. xprt_connect_status() does not recognize -EPERM, so +it kills pending RPC tasks immediately rather than retrying the +connect. + +Re-arrange the QP creation logic so that when it fails on reconnect, +it leaves ->qp with the old QP rather than NULL. If pending RPC +tasks wake and exit, LOCAL_INV work requests will flush rather than +oops. + +On initial connect, leaving ->qp == NULL is OK, since there are no +pending RPCs that might use ->qp. But be sure not to try to destroy +a NULL QP when rpcrdma_ep_connect() is retried. 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 29 ++++++++++++++++++++--------- + 1 files changed, 20 insertions(+), 9 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index c80995a..54edf2a 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -867,6 +867,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) + if (ep->rep_connected != 0) { + struct rpcrdma_xprt *xprt; + retry: ++ dprintk("RPC: %s: reconnecting...\n", __func__); + rc = rpcrdma_ep_disconnect(ep, ia); + if (rc && rc != -ENOTCONN) + dprintk("RPC: %s: rpcrdma_ep_disconnect" +@@ -879,7 +880,7 @@ retry: + id = rpcrdma_create_id(xprt, ia, + (struct sockaddr *)&xprt->rx_data.addr); + if (IS_ERR(id)) { +- rc = PTR_ERR(id); ++ rc = -EHOSTUNREACH; + goto out; + } + /* TEMP TEMP TEMP - fail if new device: +@@ -893,20 +894,30 @@ retry: + printk("RPC: %s: can't reconnect on " + "different device!\n", __func__); + rdma_destroy_id(id); +- rc = -ENETDOWN; ++ rc = -ENETUNREACH; + goto out; + } + /* END TEMP */ ++ rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); ++ if (rc) { ++ dprintk("RPC: %s: rdma_create_qp failed %i\n", ++ __func__, rc); ++ rdma_destroy_id(id); ++ rc = -ENETUNREACH; ++ goto out; ++ } + rdma_destroy_qp(ia->ri_id); + rdma_destroy_id(ia->ri_id); + ia->ri_id = id; +- } +- +- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); +- if (rc) { +- dprintk("RPC: %s: rdma_create_qp failed %i\n", +- __func__, rc); +- goto out; ++ } else { ++ dprintk("RPC: %s: connecting...\n", __func__); ++ rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); ++ if (rc) { ++ dprintk("RPC: %s: rdma_create_qp failed %i\n", ++ __func__, rc); ++ /* do not update ep->rep_connected */ ++ return -ENETUNREACH; ++ } + } + + /* XXX Tavor device performs badly with 2K MTU! */ +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch b/linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch new file mode 100644 index 0000000..4fde4bf --- /dev/null +++ b/linux-next-cherry-picks/0123-xprtrdma-Remove-Tavor-MTU-setting.patch @@ -0,0 +1,55 @@ +From 5bc4bc729275a0bfc2bfd04466e8ab7c85af2f6e Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:34:16 -0400 +Subject: [PATCH 124/132] xprtrdma: Remove Tavor MTU setting + +Clean up. Remove HCA-specific clutter in xprtrdma, which is +supposed to be device-independent. + +Hal Rosenstock observes: +> Note that there is OpenSM option (enable_quirks) to return 1K MTU +> in SA PathRecord responses for Tavor so that can be used for this. +> The default setting for enable_quirks is FALSE so that would need +> changing. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/verbs.c | 14 -------------- + 1 files changed, 0 insertions(+), 14 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 54edf2a..515dfc1 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -48,7 +48,6 @@ + */ + + #include +-#include /* for Tavor hack below */ + #include + #include + +@@ -920,19 +919,6 @@ retry: + } + } + +-/* XXX Tavor device performs badly with 2K MTU! 
*/ +-if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { +- struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); +- if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && +- (pcid->vendor == PCI_VENDOR_ID_MELLANOX || +- pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { +- struct ib_qp_attr attr = { +- .path_mtu = IB_MTU_1024 +- }; +- rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); +- } +-} +- + ep->rep_connected = 0; + + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch b/linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch new file mode 100644 index 0000000..cdf6409 --- /dev/null +++ b/linux-next-cherry-picks/0124-xprtrdma-Allocate-missing-pagelist.patch @@ -0,0 +1,38 @@ +From 196c69989d84ab902bbe545f7bd8ce78ee74dac4 Mon Sep 17 00:00:00 2001 +From: Shirley Ma +Date: Wed, 28 May 2014 10:34:24 -0400 +Subject: [PATCH 125/132] xprtrdma: Allocate missing pagelist + +GETACL relies on transport layer to alloc memory for reply buffer. +However xprtrdma assumes that the reply buffer (pagelist) has been +pre-allocated in upper layer. This problem was reported by IOL OFA lab +test on PPC. + +Signed-off-by: Shirley Ma +Reviewed-by: Chuck Lever +Tested-by: Edward Mossman +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++ + 1 files changed, 6 insertions(+), 0 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 436d229..dc4a826 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -99,6 +99,12 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + page_base = xdrbuf->page_base & ~PAGE_MASK; + p = 0; + while (len && n < nsegs) { ++ if (!ppages[p]) { ++ /* alloc the pagelist for receiving buffer */ ++ ppages[p] = alloc_page(GFP_ATOMIC); ++ if (!ppages[p]) ++ return 0; ++ } + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch b/linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch new file mode 100644 index 0000000..79de6d2 --- /dev/null +++ b/linux-next-cherry-picks/0125-xprtrdma-Use-macros-for-reconnection-timeout-constan.patch @@ -0,0 +1,61 @@ +From bfaee096deaa680195df5491eb650f81051c145d Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:34:32 -0400 +Subject: [PATCH 126/132] xprtrdma: Use macros for reconnection timeout constants + +Clean up: Ensure the same max and min constant values are used +everywhere when setting reconnect timeouts. 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/transport.c | 19 ++++++++++++------- + 1 files changed, 12 insertions(+), 7 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index d18b2a3..6b84d7d 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -149,6 +149,11 @@ static struct ctl_table sunrpc_table[] = { + + #endif + ++#define RPCRDMA_BIND_TO (60U * HZ) ++#define RPCRDMA_INIT_REEST_TO (5U * HZ) ++#define RPCRDMA_MAX_REEST_TO (30U * HZ) ++#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) ++ + static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ + + static void +@@ -285,9 +290,9 @@ xprt_setup_rdma(struct xprt_create *args) + + /* 60 second timeout, no retries */ + xprt->timeout = &xprt_rdma_default_timeout; +- xprt->bind_timeout = (60U * HZ); +- xprt->reestablish_timeout = (5U * HZ); +- xprt->idle_timeout = (5U * 60 * HZ); ++ xprt->bind_timeout = RPCRDMA_BIND_TO; ++ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; ++ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->resvport = 0; /* privileged port not needed */ + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ +@@ -432,10 +437,10 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) + schedule_delayed_work(&r_xprt->rdma_connect, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; +- if (xprt->reestablish_timeout > (30 * HZ)) +- xprt->reestablish_timeout = (30 * HZ); +- else if (xprt->reestablish_timeout < (5 * HZ)) +- xprt->reestablish_timeout = (5 * HZ); ++ if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO) ++ xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO; ++ else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) ++ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + } else { + schedule_delayed_work(&r_xprt->rdma_connect, 0); + if (!RPC_IS_ASYNC(task)) +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch b/linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch new file mode 100644 index 0000000..8a678dc --- /dev/null +++ b/linux-next-cherry-picks/0126-xprtrdma-Reset-connection-timeout-after-successful-r.patch @@ -0,0 +1,33 @@ +From 18906972aa1103c07869c9b43860a52e0e27e8e5 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:34:41 -0400 +Subject: [PATCH 127/132] xprtrdma: Reset connection timeout after successful reconnect + +If the new connection is able to make forward progress, reset the +re-establish timeout. Otherwise it keeps growing even if disconnect +events are rare. + +The same behavior as TCP is adopted: reconnect immediately if the +transport instance has been able to make some forward progress. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 1 + + 1 files changed, 1 insertions(+), 0 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index dc4a826..ac65b0c 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -770,6 +770,7 @@ repost: + + /* from here on, the reply is no longer an orphan */ + req->rl_reply = rep; ++ xprt->reestablish_timeout = 0; + + /* check for expected message types */ + /* The order of some of these tests is important. 
*/ +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch b/linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch new file mode 100644 index 0000000..1578dbb --- /dev/null +++ b/linux-next-cherry-picks/0127-xprtrdma-Avoid-deadlock-when-credit-window-is-reset.patch @@ -0,0 +1,104 @@ +From e7ce710a8802351bd4118c5d6136c1d850f67cf9 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:34:57 -0400 +Subject: [PATCH 128/132] xprtrdma: Avoid deadlock when credit window is reset + +Update the cwnd while processing the server's reply. Otherwise the +next task on the xprt_sending queue is still subject to the old +credit window. Currently, no task is awoken if the old congestion +window is still exceeded, even if the new window is larger, and a +deadlock results. + +This is an issue during a transport reconnect. Servers don't +normally shrink the credit window, but the client does reset it to +1 when reconnecting so the server can safely grow it again. + +As a minor optimization, remove the hack of grabbing the initial +cwnd size (which happens to be RPC_CWNDSCALE) and using that value +as the congestion scaling factor. The scaling value is invariant, +and we are better off without the multiplication operation. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 6 ++++++ + net/sunrpc/xprtrdma/transport.c | 19 +------------------ + net/sunrpc/xprtrdma/xprt_rdma.h | 1 - + 3 files changed, 7 insertions(+), 19 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index ac65b0c..77b84cf 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -716,6 +716,7 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + __be32 *iptr; + int rdmalen, status; ++ unsigned long cwnd; + + /* Check status. 
If bad, signal disconnect and return rep to pool */ + if (rep->rr_len == ~0U) { +@@ -845,6 +846,11 @@ badheader: + break; + } + ++ cwnd = xprt->cwnd; ++ xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; ++ if (xprt->cwnd > cwnd) ++ xprt_release_rqst_cong(rqst->rq_task); ++ + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", + __func__, xprt, rqst, status); + xprt_complete_rqst(rqst->rq_task, status); +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 6b84d7d..187894b 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -448,23 +448,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) + } + } + +-static int +-xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) +-{ +- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); +- int credits = atomic_read(&r_xprt->rx_buf.rb_credits); +- +- /* == RPC_CWNDSCALE @ init, but *after* setup */ +- if (r_xprt->rx_buf.rb_cwndscale == 0UL) { +- r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; +- dprintk("RPC: %s: cwndscale %lu\n", __func__, +- r_xprt->rx_buf.rb_cwndscale); +- BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); +- } +- xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; +- return xprt_reserve_xprt_cong(xprt, task); +-} +- + /* + * The RDMA allocate/free functions need the task structure as a place + * to hide the struct rpcrdma_req, which is necessary for the actual send/recv +@@ -686,7 +669,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) + */ + + static struct rpc_xprt_ops xprt_rdma_procs = { +- .reserve_xprt = xprt_rdma_reserve_xprt, ++ .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, /* ditto */ +diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h +index 0c3b88e..89e7cd4 100644 +--- a/net/sunrpc/xprtrdma/xprt_rdma.h ++++ b/net/sunrpc/xprtrdma/xprt_rdma.h +@@ -212,7 +212,6 @@ struct rpcrdma_req { + struct rpcrdma_buffer { + spinlock_t rb_lock; /* protects indexes */ + atomic_t rb_credits; /* most recent server credits */ +- unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */ + int rb_max_requests;/* client max requests */ + struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + int rb_send_index; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch b/linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch new file mode 100644 index 0000000..809584e --- /dev/null +++ b/linux-next-cherry-picks/0128-xprtrdma-Remove-BUG_ON-call-sites.patch @@ -0,0 +1,83 @@ +From c977dea22708688eae31774f70126c97aa4dfe83 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:35:06 -0400 +Subject: [PATCH 129/132] xprtrdma: Remove BUG_ON() call sites + +If an error occurs in the marshaling logic, fail the RPC request +being processed, but leave the client running. 
+ +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/transport.c | 3 ++- + net/sunrpc/xprtrdma/verbs.c | 18 ++++++++++-------- + 2 files changed, 12 insertions(+), 9 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 187894b..93fe775 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -463,7 +463,8 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) + struct rpcrdma_req *req, *nreq; + + req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); +- BUG_ON(NULL == req); ++ if (req == NULL) ++ return NULL; + + if (size > req->rl_size) { + dprintk("RPC: %s: size %zd too large for buffer[%zd]: " +diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c +index 515dfc1..13dbd1c 100644 +--- a/net/sunrpc/xprtrdma/verbs.c ++++ b/net/sunrpc/xprtrdma/verbs.c +@@ -1302,7 +1302,6 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) + int i; + unsigned long flags; + +- BUG_ON(req->rl_nchunks != 0); + spin_lock_irqsave(&buffers->rb_lock, flags); + buffers->rb_send_bufs[--buffers->rb_send_index] = req; + req->rl_niovs = 0; +@@ -1535,10 +1534,6 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + } else + post_wr = &frmr_wr; + +- /* Bump the key */ +- key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); +- ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); +- + /* Prepare FRMR WR */ + memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; +@@ -1549,7 +1544,16 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, + frmr_wr.wr.fast_reg.page_list_len = page_no; + frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; +- BUG_ON(frmr_wr.wr.fast_reg.length < len); ++ if (frmr_wr.wr.fast_reg.length < len) { ++ while (seg1->mr_nsegs--) ++ rpcrdma_unmap_one(ia, seg++); ++ return -EIO; ++ } ++ ++ /* Bump the key */ ++ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); ++ ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); ++ + frmr_wr.wr.fast_reg.access_flags = (writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ); +@@ -1709,9 +1713,7 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, + + #if RPCRDMA_PERSISTENT_REGISTRATION + case RPCRDMA_ALLPHYSICAL: +- BUG_ON(nsegs != 1); + rpcrdma_unmap_one(ia, seg); +- rc = 0; + break; + #endif + +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch b/linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch new file mode 100644 index 0000000..e943208 --- /dev/null +++ b/linux-next-cherry-picks/0129-xprtrdma-Disconnect-on-registration-failure.patch @@ -0,0 +1,215 @@ +From c93c62231cf55df4a26bd08937efeea97e6fc5e8 Mon Sep 17 00:00:00 2001 +From: Chuck Lever +Date: Wed, 28 May 2014 10:35:14 -0400 +Subject: [PATCH 130/132] xprtrdma: Disconnect on registration failure + +If rpcrdma_register_external() fails during request marshaling, the +current RPC request is killed. Instead, this RPC should be retried +after reconnecting the transport instance. + +The most likely reason for registration failure with FRMR is a +failed post_send, which would be due to a remote transport +disconnect or memory exhaustion. These issues can be recovered +by a retry. + +Problems encountered in the marshaling logic itself will not be +corrected by trying again, so these should still kill a request. 
+ +Now that we've added a clean exit for marshaling errors, take the +opportunity to defang some BUG_ON's. + +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +--- + net/sunrpc/xprtrdma/rpc_rdma.c | 48 +++++++++++++++++++++++++------------- + net/sunrpc/xprtrdma/transport.c | 17 +++++++++----- + 2 files changed, 42 insertions(+), 23 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c +index 77b84cf..693966d 100644 +--- a/net/sunrpc/xprtrdma/rpc_rdma.c ++++ b/net/sunrpc/xprtrdma/rpc_rdma.c +@@ -77,6 +77,8 @@ static const char transfertypes[][12] = { + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk + * elements. Segments are then coalesced when registered, if possible + * within the selected memreg mode. ++ * ++ * Returns positive number of segments converted, or a negative errno. + */ + + static int +@@ -103,12 +105,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + /* alloc the pagelist for receiving buffer */ + ppages[p] = alloc_page(GFP_ATOMIC); + if (!ppages[p]) +- return 0; ++ return -ENOMEM; + } + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); +- BUG_ON(seg[n].mr_len > PAGE_SIZE); ++ if (seg[n].mr_len > PAGE_SIZE) ++ return -EIO; + len -= seg[n].mr_len; + ++n; + ++p; +@@ -117,7 +120,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + + /* Message overflows the seg array */ + if (len && n == nsegs) +- return 0; ++ return -EIO; + + if (xdrbuf->tail[0].iov_len) { + /* the rpcrdma protocol allows us to omit any trailing +@@ -126,7 +129,7 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + return n; + if (n == nsegs) + /* Tail remains, but we're out of segments */ +- return 0; ++ return -EIO; + seg[n].mr_page = NULL; + seg[n].mr_offset = xdrbuf->tail[0].iov_base; + seg[n].mr_len = xdrbuf->tail[0].iov_len; +@@ -167,15 +170,17 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, + * Reply chunk (a counted array): + * N elements: + * 1 - N - HLOO - HLOO - ... - HLOO ++ * ++ * Returns positive RPC/RDMA header size, or negative errno. + */ + +-static unsigned int ++static ssize_t + rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) + { + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); +- int nsegs, nchunks = 0; ++ int n, nsegs, nchunks = 0; + unsigned int pos; + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_read_chunk *cur_rchunk = NULL; +@@ -201,11 +206,11 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, + pos = target->head[0].iov_len; + + nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); +- if (nsegs == 0) +- return 0; ++ if (nsegs < 0) ++ return nsegs; + + do { +- int n = rpcrdma_register_external(seg, nsegs, ++ n = rpcrdma_register_external(seg, nsegs, + cur_wchunk != NULL, r_xprt); + if (n <= 0) + goto out; +@@ -277,7 +282,7 @@ out: + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); +- return 0; ++ return n; + } + + /* +@@ -359,6 +364,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) + * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. + * [2] -- optional padding. + * [3] -- if padded, header only in [1] and data here. ++ * ++ * Returns zero on success, otherwise a negative errno. 
+ */ + + int +@@ -368,7 +375,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + char *base; +- size_t hdrlen, rpclen, padlen; ++ size_t rpclen, padlen; ++ ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; + struct rpcrdma_msg *headerp; + +@@ -439,7 +447,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + /* The following simplification is not true forever */ + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; +- BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); ++ if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { ++ dprintk("RPC: %s: cannot marshal multiple chunk lists\n", ++ __func__); ++ return -EIO; ++ } + + hdrlen = 28; /*sizeof *headerp;*/ + padlen = 0; +@@ -464,8 +476,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; + headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; + hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ +- BUG_ON(wtype != rpcrdma_noch); +- ++ if (wtype != rpcrdma_noch) { ++ dprintk("RPC: %s: invalid chunk list\n", ++ __func__); ++ return -EIO; ++ } + } else { + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; +@@ -500,9 +515,8 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) + hdrlen = rpcrdma_create_chunks(rqst, + &rqst->rq_rcv_buf, headerp, wtype); + } +- +- if (hdrlen == 0) +- return -1; ++ if (hdrlen < 0) ++ return hdrlen; + + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" + " headerp 0x%p base 0x%p lkey 0x%x\n", +diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c +index 93fe775..66f91f0 100644 +--- a/net/sunrpc/xprtrdma/transport.c ++++ b/net/sunrpc/xprtrdma/transport.c +@@ -595,13 +595,12 @@ xprt_rdma_send_request(struct rpc_task *task) + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); ++ int rc; + +- /* marshal the send itself */ +- if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { +- r_xprt->rx_stats.failed_marshal_count++; +- dprintk("RPC: %s: rpcrdma_marshal_req failed\n", +- __func__); +- return -EIO; ++ if (req->rl_niovs == 0) { ++ rc = rpcrdma_marshal_req(rqst); ++ if (rc < 0) ++ goto failed_marshal; + } + + if (req->rl_reply == NULL) /* e.g. reconnection */ +@@ -625,6 +624,12 @@ xprt_rdma_send_request(struct rpc_task *task) + rqst->rq_bytes_sent = 0; + return 0; + ++failed_marshal: ++ r_xprt->rx_stats.failed_marshal_count++; ++ dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", ++ __func__, rc); ++ if (rc == -EIO) ++ return -EIO; + drop_connection: + xprt_disconnect_done(xprt); + return -ENOTCONN; /* implies disconnect */ +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch b/linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch new file mode 100644 index 0000000..aac5be8 --- /dev/null +++ b/linux-next-cherry-picks/0130-svcrdma-refactor-marshalling-logic.patch @@ -0,0 +1,1243 @@ +From 0bf4828983dff062cd502f27ab8644b32774e72e Mon Sep 17 00:00:00 2001 +From: Steve Wise +Date: Wed, 28 May 2014 15:12:01 -0500 +Subject: [PATCH 131/132] svcrdma: refactor marshalling logic + +This patch refactors the NFSRDMA server marshalling logic to +remove the intermediary map structures. It also fixes an existing bug +where the NFSRDMA server was not minding the device fast register page +list length limitations. 
+ +Signed-off-by: Tom Tucker +Signed-off-by: Steve Wise +--- + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 643 +++++++++++++----------------- + net/sunrpc/xprtrdma/svc_rdma_sendto.c | 230 +---------- + net/sunrpc/xprtrdma/svc_rdma_transport.c | 62 ++-- + 3 files changed, 331 insertions(+), 604 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +index 8d904e4..52d9f2c 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two +@@ -69,7 +70,8 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); +- rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); ++ rqstp->rq_arg.head[0].iov_len = ++ min_t(size_t, byte_count, ctxt->sge[0].length); + rqstp->rq_arg.len = byte_count; + rqstp->rq_arg.buflen = byte_count; + +@@ -85,7 +87,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; +- bc -= min(bc, ctxt->sge[sge_no].length); ++ bc -= min_t(u32, bc, ctxt->sge[sge_no].length); + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; + sge_no++; + } +@@ -113,291 +115,265 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + rqstp->rq_arg.tail[0].iov_len = 0; + } + +-/* Encode a read-chunk-list as an array of IB SGE +- * +- * Assumptions: +- * - chunk[0]->position points to pages[0] at an offset of 0 +- * - pages[] is not physically or virtually contiguous and consists of +- * PAGE_SIZE elements. +- * +- * Output: +- * - sge array pointing into pages[] array. 
+- * - chunk_sge array specifying sge index and count for each +- * chunk in the read list +- * +- */ +-static int map_read_chunks(struct svcxprt_rdma *xprt, +- struct svc_rqst *rqstp, +- struct svc_rdma_op_ctxt *head, +- struct rpcrdma_msg *rmsgp, +- struct svc_rdma_req_map *rpl_map, +- struct svc_rdma_req_map *chl_map, +- int ch_count, +- int byte_count) ++static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) + { +- int sge_no; +- int sge_bytes; +- int page_off; +- int page_no; +- int ch_bytes; +- int ch_no; +- struct rpcrdma_read_chunk *ch; ++ if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == ++ RDMA_TRANSPORT_IWARP) ++ return 1; ++ else ++ return min_t(int, sge_count, xprt->sc_max_sge); ++} + +- sge_no = 0; +- page_no = 0; +- page_off = 0; +- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; +- ch_no = 0; +- ch_bytes = ntohl(ch->rc_target.rs_length); +- head->arg.head[0] = rqstp->rq_arg.head[0]; +- head->arg.tail[0] = rqstp->rq_arg.tail[0]; +- head->arg.pages = &head->pages[head->count]; +- head->hdr_count = head->count; /* save count of hdr pages */ +- head->arg.page_base = 0; +- head->arg.page_len = ch_bytes; +- head->arg.len = rqstp->rq_arg.len + ch_bytes; +- head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; +- head->count++; +- chl_map->ch[0].start = 0; +- while (byte_count) { +- rpl_map->sge[sge_no].iov_base = +- page_address(rqstp->rq_arg.pages[page_no]) + page_off; +- sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); +- rpl_map->sge[sge_no].iov_len = sge_bytes; +- /* +- * Don't bump head->count here because the same page +- * may be used by multiple SGE. +- */ +- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; +- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; ++typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, ++ struct svc_rqst *rqstp, ++ struct svc_rdma_op_ctxt *head, ++ int *page_no, ++ u32 *page_offset, ++ u32 rs_handle, ++ u32 rs_length, ++ u64 rs_offset, ++ int last); ++ ++/* Issue an RDMA_READ using the local lkey to map the data sink */ ++static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, ++ struct svc_rqst *rqstp, ++ struct svc_rdma_op_ctxt *head, ++ int *page_no, ++ u32 *page_offset, ++ u32 rs_handle, ++ u32 rs_length, ++ u64 rs_offset, ++ int last) ++{ ++ struct ib_send_wr read_wr; ++ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; ++ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); ++ int ret, read, pno; ++ u32 pg_off = *page_offset; ++ u32 pg_no = *page_no; ++ ++ ctxt->direction = DMA_FROM_DEVICE; ++ ctxt->read_hdr = head; ++ pages_needed = ++ min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed)); ++ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); ++ ++ for (pno = 0; pno < pages_needed; pno++) { ++ int len = min_t(int, rs_length, PAGE_SIZE - pg_off); ++ ++ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; ++ head->arg.page_len += len; ++ head->arg.len += len; ++ if (!pg_off) ++ head->count++; ++ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; + rqstp->rq_next_page = rqstp->rq_respages + 1; ++ ctxt->sge[pno].addr = ++ ib_dma_map_page(xprt->sc_cm_id->device, ++ head->arg.pages[pg_no], pg_off, ++ PAGE_SIZE - pg_off, ++ DMA_FROM_DEVICE); ++ ret = ib_dma_mapping_error(xprt->sc_cm_id->device, ++ ctxt->sge[pno].addr); ++ if (ret) ++ goto err; ++ atomic_inc(&xprt->sc_dma_used); + +- byte_count -= sge_bytes; +- ch_bytes -= sge_bytes; +- sge_no++; +- /* +- * If all bytes for this chunk have been mapped to an +- * SGE, move to the next 
SGE +- */ +- if (ch_bytes == 0) { +- chl_map->ch[ch_no].count = +- sge_no - chl_map->ch[ch_no].start; +- ch_no++; +- ch++; +- chl_map->ch[ch_no].start = sge_no; +- ch_bytes = ntohl(ch->rc_target.rs_length); +- /* If bytes remaining account for next chunk */ +- if (byte_count) { +- head->arg.page_len += ch_bytes; +- head->arg.len += ch_bytes; +- head->arg.buflen += ch_bytes; +- } ++ /* The lkey here is either a local dma lkey or a dma_mr lkey */ ++ ctxt->sge[pno].lkey = xprt->sc_dma_lkey; ++ ctxt->sge[pno].length = len; ++ ctxt->count++; ++ ++ /* adjust offset and wrap to next page if needed */ ++ pg_off += len; ++ if (pg_off == PAGE_SIZE) { ++ pg_off = 0; ++ pg_no++; + } +- /* +- * If this SGE consumed all of the page, move to the +- * next page +- */ +- if ((sge_bytes + page_off) == PAGE_SIZE) { +- page_no++; +- page_off = 0; +- /* +- * If there are still bytes left to map, bump +- * the page count +- */ +- if (byte_count) +- head->count++; +- } else +- page_off += sge_bytes; ++ rs_length -= len; + } +- BUG_ON(byte_count != 0); +- return sge_no; ++ ++ if (last && rs_length == 0) ++ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); ++ else ++ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); ++ ++ memset(&read_wr, 0, sizeof(read_wr)); ++ read_wr.wr_id = (unsigned long)ctxt; ++ read_wr.opcode = IB_WR_RDMA_READ; ++ ctxt->wr_op = read_wr.opcode; ++ read_wr.send_flags = IB_SEND_SIGNALED; ++ read_wr.wr.rdma.rkey = rs_handle; ++ read_wr.wr.rdma.remote_addr = rs_offset; ++ read_wr.sg_list = ctxt->sge; ++ read_wr.num_sge = pages_needed; ++ ++ ret = svc_rdma_send(xprt, &read_wr); ++ if (ret) { ++ pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); ++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); ++ goto err; ++ } ++ ++ /* return current location in page array */ ++ *page_no = pg_no; ++ *page_offset = pg_off; ++ ret = read; ++ atomic_inc(&rdma_stat_read); ++ return ret; ++ err: ++ svc_rdma_unmap_dma(ctxt); ++ svc_rdma_put_context(ctxt, 0); ++ return ret; + } + +-/* Map a read-chunk-list to an XDR and fast register the page-list. +- * +- * Assumptions: +- * - chunk[0] position points to pages[0] at an offset of 0 +- * - pages[] will be made physically contiguous by creating a one-off memory +- * region using the fastreg verb. +- * - byte_count is # of bytes in read-chunk-list +- * - ch_count is # of chunks in read-chunk-list +- * +- * Output: +- * - sge array pointing into pages[] array. 
+- * - chunk_sge array specifying sge index and count for each +- * chunk in the read list +- */ +-static int fast_reg_read_chunks(struct svcxprt_rdma *xprt, ++/* Issue an RDMA_READ using an FRMR to map the data sink */ ++static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, +- struct rpcrdma_msg *rmsgp, +- struct svc_rdma_req_map *rpl_map, +- struct svc_rdma_req_map *chl_map, +- int ch_count, +- int byte_count) ++ int *page_no, ++ u32 *page_offset, ++ u32 rs_handle, ++ u32 rs_length, ++ u64 rs_offset, ++ int last) + { +- int page_no; +- int ch_no; +- u32 offset; +- struct rpcrdma_read_chunk *ch; +- struct svc_rdma_fastreg_mr *frmr; +- int ret = 0; ++ struct ib_send_wr read_wr; ++ struct ib_send_wr inv_wr; ++ struct ib_send_wr fastreg_wr; ++ u8 key; ++ int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; ++ struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); ++ struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); ++ int ret, read, pno; ++ u32 pg_off = *page_offset; ++ u32 pg_no = *page_no; + +- frmr = svc_rdma_get_frmr(xprt); + if (IS_ERR(frmr)) + return -ENOMEM; + +- head->frmr = frmr; +- head->arg.head[0] = rqstp->rq_arg.head[0]; +- head->arg.tail[0] = rqstp->rq_arg.tail[0]; +- head->arg.pages = &head->pages[head->count]; +- head->hdr_count = head->count; /* save count of hdr pages */ +- head->arg.page_base = 0; +- head->arg.page_len = byte_count; +- head->arg.len = rqstp->rq_arg.len + byte_count; +- head->arg.buflen = rqstp->rq_arg.buflen + byte_count; ++ ctxt->direction = DMA_FROM_DEVICE; ++ ctxt->frmr = frmr; ++ pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); ++ read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); + +- /* Fast register the page list */ +- frmr->kva = page_address(rqstp->rq_arg.pages[0]); ++ frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]); + frmr->direction = DMA_FROM_DEVICE; + frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); +- frmr->map_len = byte_count; +- frmr->page_list_len = PAGE_ALIGN(byte_count) >> PAGE_SHIFT; +- for (page_no = 0; page_no < frmr->page_list_len; page_no++) { +- frmr->page_list->page_list[page_no] = ++ frmr->map_len = pages_needed << PAGE_SHIFT; ++ frmr->page_list_len = pages_needed; ++ ++ for (pno = 0; pno < pages_needed; pno++) { ++ int len = min_t(int, rs_length, PAGE_SIZE - pg_off); ++ ++ head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; ++ head->arg.page_len += len; ++ head->arg.len += len; ++ if (!pg_off) ++ head->count++; ++ rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; ++ frmr->page_list->page_list[pno] = + ib_dma_map_page(xprt->sc_cm_id->device, +- rqstp->rq_arg.pages[page_no], 0, ++ head->arg.pages[pg_no], 0, + PAGE_SIZE, DMA_FROM_DEVICE); +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- frmr->page_list->page_list[page_no])) +- goto fatal_err; ++ ret = ib_dma_mapping_error(xprt->sc_cm_id->device, ++ frmr->page_list->page_list[pno]); ++ if (ret) ++ goto err; + atomic_inc(&xprt->sc_dma_used); +- head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; +- } +- head->count += page_no; +- +- /* rq_respages points one past arg pages */ +- rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; +- rqstp->rq_next_page = rqstp->rq_respages + 1; + +- /* Create the reply and chunk maps */ +- offset = 0; +- ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; +- for (ch_no = 0; ch_no < ch_count; ch_no++) { +- int len = 
ntohl(ch->rc_target.rs_length); +- rpl_map->sge[ch_no].iov_base = frmr->kva + offset; +- rpl_map->sge[ch_no].iov_len = len; +- chl_map->ch[ch_no].count = 1; +- chl_map->ch[ch_no].start = ch_no; +- offset += len; +- ch++; ++ /* adjust offset and wrap to next page if needed */ ++ pg_off += len; ++ if (pg_off == PAGE_SIZE) { ++ pg_off = 0; ++ pg_no++; ++ } ++ rs_length -= len; + } + +- ret = svc_rdma_fastreg(xprt, frmr); +- if (ret) +- goto fatal_err; +- +- return ch_no; +- +- fatal_err: +- printk("svcrdma: error fast registering xdr for xprt %p", xprt); +- svc_rdma_put_frmr(xprt, frmr); +- return -EIO; +-} +- +-static int rdma_set_ctxt_sge(struct svcxprt_rdma *xprt, +- struct svc_rdma_op_ctxt *ctxt, +- struct svc_rdma_fastreg_mr *frmr, +- struct kvec *vec, +- u64 *sgl_offset, +- int count) +-{ +- int i; +- unsigned long off; ++ if (last && rs_length == 0) ++ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); ++ else ++ clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + +- ctxt->count = count; +- ctxt->direction = DMA_FROM_DEVICE; +- for (i = 0; i < count; i++) { +- ctxt->sge[i].length = 0; /* in case map fails */ +- if (!frmr) { +- BUG_ON(!virt_to_page(vec[i].iov_base)); +- off = (unsigned long)vec[i].iov_base & ~PAGE_MASK; +- ctxt->sge[i].addr = +- ib_dma_map_page(xprt->sc_cm_id->device, +- virt_to_page(vec[i].iov_base), +- off, +- vec[i].iov_len, +- DMA_FROM_DEVICE); +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- ctxt->sge[i].addr)) +- return -EINVAL; +- ctxt->sge[i].lkey = xprt->sc_dma_lkey; +- atomic_inc(&xprt->sc_dma_used); +- } else { +- ctxt->sge[i].addr = (unsigned long)vec[i].iov_base; +- ctxt->sge[i].lkey = frmr->mr->lkey; +- } +- ctxt->sge[i].length = vec[i].iov_len; +- *sgl_offset = *sgl_offset + vec[i].iov_len; ++ /* Bump the key */ ++ key = (u8)(frmr->mr->lkey & 0x000000FF); ++ ib_update_fast_reg_key(frmr->mr, ++key); ++ ++ ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset; ++ ctxt->sge[0].lkey = frmr->mr->lkey; ++ ctxt->sge[0].length = read; ++ ctxt->count = 1; ++ ctxt->read_hdr = head; ++ ++ /* Prepare FASTREG WR */ ++ memset(&fastreg_wr, 0, sizeof(fastreg_wr)); ++ fastreg_wr.opcode = IB_WR_FAST_REG_MR; ++ fastreg_wr.send_flags = IB_SEND_SIGNALED; ++ fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; ++ fastreg_wr.wr.fast_reg.page_list = frmr->page_list; ++ fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; ++ fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; ++ fastreg_wr.wr.fast_reg.length = frmr->map_len; ++ fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; ++ fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; ++ fastreg_wr.next = &read_wr; ++ ++ /* Prepare RDMA_READ */ ++ memset(&read_wr, 0, sizeof(read_wr)); ++ read_wr.send_flags = IB_SEND_SIGNALED; ++ read_wr.wr.rdma.rkey = rs_handle; ++ read_wr.wr.rdma.remote_addr = rs_offset; ++ read_wr.sg_list = ctxt->sge; ++ read_wr.num_sge = 1; ++ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { ++ read_wr.opcode = IB_WR_RDMA_READ_WITH_INV; ++ read_wr.wr_id = (unsigned long)ctxt; ++ read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; ++ } else { ++ read_wr.opcode = IB_WR_RDMA_READ; ++ read_wr.next = &inv_wr; ++ /* Prepare invalidate */ ++ memset(&inv_wr, 0, sizeof(inv_wr)); ++ inv_wr.wr_id = (unsigned long)ctxt; ++ inv_wr.opcode = IB_WR_LOCAL_INV; ++ inv_wr.send_flags = IB_SEND_SIGNALED; ++ inv_wr.ex.invalidate_rkey = frmr->mr->lkey; ++ } ++ ctxt->wr_op = read_wr.opcode; ++ ++ /* Post the chain */ ++ ret = svc_rdma_send(xprt, &fastreg_wr); ++ if (ret) { ++ pr_err("svcrdma: Error %d posting 
RDMA_READ\n", ret); ++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); ++ goto err; + } +- return 0; +-} + +-static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) +-{ +- if ((rdma_node_get_transport(xprt->sc_cm_id->device->node_type) == +- RDMA_TRANSPORT_IWARP) && +- sge_count > 1) +- return 1; +- else +- return min_t(int, sge_count, xprt->sc_max_sge); ++ /* return current location in page array */ ++ *page_no = pg_no; ++ *page_offset = pg_off; ++ ret = read; ++ atomic_inc(&rdma_stat_read); ++ return ret; ++ err: ++ svc_rdma_unmap_dma(ctxt); ++ svc_rdma_put_context(ctxt, 0); ++ svc_rdma_put_frmr(xprt, frmr); ++ return ret; + } + +-/* +- * Use RDMA_READ to read data from the advertised client buffer into the +- * XDR stream starting at rq_arg.head[0].iov_base. +- * Each chunk in the array +- * contains the following fields: +- * discrim - '1', This isn't used for data placement +- * position - The xdr stream offset (the same for every chunk) +- * handle - RMR for client memory region +- * length - data transfer length +- * offset - 64 bit tagged offset in remote memory region +- * +- * On our side, we need to read into a pagelist. The first page immediately +- * follows the RPC header. +- * +- * This function returns: +- * 0 - No error and no read-list found. +- * +- * 1 - Successful read-list processing. The data is not yet in +- * the pagelist and therefore the RPC request must be deferred. The +- * I/O completion will enqueue the transport again and +- * svc_rdma_recvfrom will complete the request. +- * +- * <0 - Error processing/posting read-list. +- * +- * NOTE: The ctxt must not be touched after the last WR has been posted +- * because the I/O completion processing may occur on another +- * processor and free / modify the context. Ne touche pas! +- */ +-static int rdma_read_xdr(struct svcxprt_rdma *xprt, +- struct rpcrdma_msg *rmsgp, +- struct svc_rqst *rqstp, +- struct svc_rdma_op_ctxt *hdr_ctxt) ++static int rdma_read_chunks(struct svcxprt_rdma *xprt, ++ struct rpcrdma_msg *rmsgp, ++ struct svc_rqst *rqstp, ++ struct svc_rdma_op_ctxt *head) + { +- struct ib_send_wr read_wr; +- struct ib_send_wr inv_wr; +- int err = 0; +- int ch_no; +- int ch_count; +- int byte_count; +- int sge_count; +- u64 sgl_offset; ++ int page_no, ch_count, ret; + struct rpcrdma_read_chunk *ch; +- struct svc_rdma_op_ctxt *ctxt = NULL; +- struct svc_rdma_req_map *rpl_map; +- struct svc_rdma_req_map *chl_map; ++ u32 page_offset, byte_count; ++ u64 rs_offset; ++ rdma_reader_fn reader; + + /* If no read list is present, return 0 */ + ch = svc_rdma_get_read_chunk(rmsgp); +@@ -408,122 +384,55 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, + if (ch_count > RPCSVC_MAXPAGES) + return -EINVAL; + +- /* Allocate temporary reply and chunk maps */ +- rpl_map = svc_rdma_get_req_map(); +- chl_map = svc_rdma_get_req_map(); ++ /* The request is completed when the RDMA_READs complete. The ++ * head context keeps all the pages that comprise the ++ * request. 
++ */ ++ head->arg.head[0] = rqstp->rq_arg.head[0]; ++ head->arg.tail[0] = rqstp->rq_arg.tail[0]; ++ head->arg.pages = &head->pages[head->count]; ++ head->hdr_count = head->count; ++ head->arg.page_base = 0; ++ head->arg.page_len = 0; ++ head->arg.len = rqstp->rq_arg.len; ++ head->arg.buflen = rqstp->rq_arg.buflen; + +- if (!xprt->sc_frmr_pg_list_len) +- sge_count = map_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, +- rpl_map, chl_map, ch_count, +- byte_count); ++ /* Use FRMR if supported */ ++ if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ++ reader = rdma_read_chunk_frmr; + else +- sge_count = fast_reg_read_chunks(xprt, rqstp, hdr_ctxt, rmsgp, +- rpl_map, chl_map, ch_count, +- byte_count); +- if (sge_count < 0) { +- err = -EIO; +- goto out; +- } +- +- sgl_offset = 0; +- ch_no = 0; ++ reader = rdma_read_chunk_lcl; + ++ page_no = 0; page_offset = 0; + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; +- ch->rc_discrim != 0; ch++, ch_no++) { +- u64 rs_offset; +-next_sge: +- ctxt = svc_rdma_get_context(xprt); +- ctxt->direction = DMA_FROM_DEVICE; +- ctxt->frmr = hdr_ctxt->frmr; +- ctxt->read_hdr = NULL; +- clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); +- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); ++ ch->rc_discrim != 0; ch++) { + +- /* Prepare READ WR */ +- memset(&read_wr, 0, sizeof read_wr); +- read_wr.wr_id = (unsigned long)ctxt; +- read_wr.opcode = IB_WR_RDMA_READ; +- ctxt->wr_op = read_wr.opcode; +- read_wr.send_flags = IB_SEND_SIGNALED; +- read_wr.wr.rdma.rkey = ntohl(ch->rc_target.rs_handle); + xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, + &rs_offset); +- read_wr.wr.rdma.remote_addr = rs_offset + sgl_offset; +- read_wr.sg_list = ctxt->sge; +- read_wr.num_sge = +- rdma_read_max_sge(xprt, chl_map->ch[ch_no].count); +- err = rdma_set_ctxt_sge(xprt, ctxt, hdr_ctxt->frmr, +- &rpl_map->sge[chl_map->ch[ch_no].start], +- &sgl_offset, +- read_wr.num_sge); +- if (err) { +- svc_rdma_unmap_dma(ctxt); +- svc_rdma_put_context(ctxt, 0); +- goto out; +- } +- if (((ch+1)->rc_discrim == 0) && +- (read_wr.num_sge == chl_map->ch[ch_no].count)) { +- /* +- * Mark the last RDMA_READ with a bit to +- * indicate all RPC data has been fetched from +- * the client and the RPC needs to be enqueued. +- */ +- set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); +- if (hdr_ctxt->frmr) { +- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); +- /* +- * Invalidate the local MR used to map the data +- * sink. 
+- */ +- if (xprt->sc_dev_caps & +- SVCRDMA_DEVCAP_READ_W_INV) { +- read_wr.opcode = +- IB_WR_RDMA_READ_WITH_INV; +- ctxt->wr_op = read_wr.opcode; +- read_wr.ex.invalidate_rkey = +- ctxt->frmr->mr->lkey; +- } else { +- /* Prepare INVALIDATE WR */ +- memset(&inv_wr, 0, sizeof inv_wr); +- inv_wr.opcode = IB_WR_LOCAL_INV; +- inv_wr.send_flags = IB_SEND_SIGNALED; +- inv_wr.ex.invalidate_rkey = +- hdr_ctxt->frmr->mr->lkey; +- read_wr.next = &inv_wr; +- } +- } +- ctxt->read_hdr = hdr_ctxt; +- } +- /* Post the read */ +- err = svc_rdma_send(xprt, &read_wr); +- if (err) { +- printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", +- err); +- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); +- svc_rdma_unmap_dma(ctxt); +- svc_rdma_put_context(ctxt, 0); +- goto out; ++ byte_count = ntohl(ch->rc_target.rs_length); ++ ++ while (byte_count > 0) { ++ ret = reader(xprt, rqstp, head, ++ &page_no, &page_offset, ++ ntohl(ch->rc_target.rs_handle), ++ byte_count, rs_offset, ++ ((ch+1)->rc_discrim == 0) /* last */ ++ ); ++ if (ret < 0) ++ goto err; ++ byte_count -= ret; ++ rs_offset += ret; ++ head->arg.buflen += ret; + } +- atomic_inc(&rdma_stat_read); +- +- if (read_wr.num_sge < chl_map->ch[ch_no].count) { +- chl_map->ch[ch_no].count -= read_wr.num_sge; +- chl_map->ch[ch_no].start += read_wr.num_sge; +- goto next_sge; +- } +- sgl_offset = 0; +- err = 1; + } +- +- out: +- svc_rdma_put_req_map(rpl_map); +- svc_rdma_put_req_map(chl_map); +- ++ ret = 1; ++ err: + /* Detach arg pages. svc_recv will replenish them */ +- for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) +- rqstp->rq_pages[ch_no] = NULL; ++ for (page_no = 0; ++ &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) ++ rqstp->rq_pages[page_no] = NULL; + +- return err; ++ return ret; + } + + static int rdma_read_complete(struct svc_rqst *rqstp, +@@ -595,13 +504,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); +- } +- if (ctxt) { + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + return rdma_read_complete(rqstp, ctxt); +- } +- +- if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ++ } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { + ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); +@@ -621,7 +526,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto close_out; + +- BUG_ON(ret); + goto out; + } + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", +@@ -644,12 +548,11 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) + } + + /* Read read-list data. */ +- ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); ++ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); + if (ret > 0) { + /* read-list posted, defer until data received from client. */ + goto defer; +- } +- if (ret < 0) { ++ } else if (ret < 0) { + /* Post of read-list failed, free context. */ + svc_rdma_put_context(ctxt, 1); + return 0; +diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +index 7e024a5..49fd21a 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two +@@ -49,152 +50,6 @@ + + #define RPCDBG_FACILITY RPCDBG_SVCXPRT + +-/* Encode an XDR as an array of IB SGE +- * +- * Assumptions: +- * - head[0] is physically contiguous. +- * - tail[0] is physically contiguous. +- * - pages[] is not physically or virtually contiguous and consists of +- * PAGE_SIZE elements. +- * +- * Output: +- * SGE[0] reserved for RCPRDMA header +- * SGE[1] data from xdr->head[] +- * SGE[2..sge_count-2] data from xdr->pages[] +- * SGE[sge_count-1] data from xdr->tail. +- * +- * The max SGE we need is the length of the XDR / pagesize + one for +- * head + one for tail + one for RPCRDMA header. Since RPCSVC_MAXPAGES +- * reserves a page for both the request and the reply header, and this +- * array is only concerned with the reply we are assured that we have +- * on extra page for the RPCRMDA header. +- */ +-static int fast_reg_xdr(struct svcxprt_rdma *xprt, +- struct xdr_buf *xdr, +- struct svc_rdma_req_map *vec) +-{ +- int sge_no; +- u32 sge_bytes; +- u32 page_bytes; +- u32 page_off; +- int page_no = 0; +- u8 *frva; +- struct svc_rdma_fastreg_mr *frmr; +- +- frmr = svc_rdma_get_frmr(xprt); +- if (IS_ERR(frmr)) +- return -ENOMEM; +- vec->frmr = frmr; +- +- /* Skip the RPCRDMA header */ +- sge_no = 1; +- +- /* Map the head. */ +- frva = (void *)((unsigned long)(xdr->head[0].iov_base) & PAGE_MASK); +- vec->sge[sge_no].iov_base = xdr->head[0].iov_base; +- vec->sge[sge_no].iov_len = xdr->head[0].iov_len; +- vec->count = 2; +- sge_no++; +- +- /* Map the XDR head */ +- frmr->kva = frva; +- frmr->direction = DMA_TO_DEVICE; +- frmr->access_flags = 0; +- frmr->map_len = PAGE_SIZE; +- frmr->page_list_len = 1; +- page_off = (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; +- frmr->page_list->page_list[page_no] = +- ib_dma_map_page(xprt->sc_cm_id->device, +- virt_to_page(xdr->head[0].iov_base), +- page_off, +- PAGE_SIZE - page_off, +- DMA_TO_DEVICE); +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- frmr->page_list->page_list[page_no])) +- goto fatal_err; +- atomic_inc(&xprt->sc_dma_used); +- +- /* Map the XDR page list */ +- page_off = xdr->page_base; +- page_bytes = xdr->page_len + page_off; +- if (!page_bytes) +- goto encode_tail; +- +- /* Map the pages */ +- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; +- vec->sge[sge_no].iov_len = page_bytes; +- sge_no++; +- while (page_bytes) { +- struct page *page; +- +- page = xdr->pages[page_no++]; +- sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); +- page_bytes -= sge_bytes; +- +- frmr->page_list->page_list[page_no] = +- ib_dma_map_page(xprt->sc_cm_id->device, +- page, page_off, +- sge_bytes, DMA_TO_DEVICE); +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- frmr->page_list->page_list[page_no])) +- goto fatal_err; +- +- atomic_inc(&xprt->sc_dma_used); +- page_off = 0; /* reset for next time through loop */ +- frmr->map_len += PAGE_SIZE; +- frmr->page_list_len++; +- } +- vec->count++; +- +- encode_tail: +- /* Map tail */ +- if (0 == xdr->tail[0].iov_len) +- goto done; +- +- vec->count++; +- vec->sge[sge_no].iov_len = xdr->tail[0].iov_len; +- +- if (((unsigned long)xdr->tail[0].iov_base & PAGE_MASK) == +- ((unsigned long)xdr->head[0].iov_base & PAGE_MASK)) { +- /* +- * If head and tail use the same page, we don't need +- * to map it again. 
+- */ +- vec->sge[sge_no].iov_base = xdr->tail[0].iov_base; +- } else { +- void *va; +- +- /* Map another page for the tail */ +- page_off = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; +- va = (void *)((unsigned long)xdr->tail[0].iov_base & PAGE_MASK); +- vec->sge[sge_no].iov_base = frva + frmr->map_len + page_off; +- +- frmr->page_list->page_list[page_no] = +- ib_dma_map_page(xprt->sc_cm_id->device, virt_to_page(va), +- page_off, +- PAGE_SIZE, +- DMA_TO_DEVICE); +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- frmr->page_list->page_list[page_no])) +- goto fatal_err; +- atomic_inc(&xprt->sc_dma_used); +- frmr->map_len += PAGE_SIZE; +- frmr->page_list_len++; +- } +- +- done: +- if (svc_rdma_fastreg(xprt, frmr)) +- goto fatal_err; +- +- return 0; +- +- fatal_err: +- printk("svcrdma: Error fast registering memory for xprt %p\n", xprt); +- vec->frmr = NULL; +- svc_rdma_put_frmr(xprt, frmr); +- return -EIO; +-} +- + static int map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) +@@ -208,9 +63,6 @@ static int map_xdr(struct svcxprt_rdma *xprt, + BUG_ON(xdr->len != + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); + +- if (xprt->sc_frmr_pg_list_len) +- return fast_reg_xdr(xprt, xdr, vec); +- + /* Skip the first sge, this is for the RPCRDMA header */ + sge_no = 1; + +@@ -282,8 +134,6 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, + } + + /* Assumptions: +- * - We are using FRMR +- * - or - + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE + */ + static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, +@@ -327,23 +177,16 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + sge_bytes = min_t(size_t, + bc, vec->sge[xdr_sge_no].iov_len-sge_off); + sge[sge_no].length = sge_bytes; +- if (!vec->frmr) { +- sge[sge_no].addr = +- dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, +- sge_bytes, DMA_TO_DEVICE); +- xdr_off += sge_bytes; +- if (ib_dma_mapping_error(xprt->sc_cm_id->device, +- sge[sge_no].addr)) +- goto err; +- atomic_inc(&xprt->sc_dma_used); +- sge[sge_no].lkey = xprt->sc_dma_lkey; +- } else { +- sge[sge_no].addr = (unsigned long) +- vec->sge[xdr_sge_no].iov_base + sge_off; +- sge[sge_no].lkey = vec->frmr->mr->lkey; +- } ++ sge[sge_no].addr = ++ dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, ++ sge_bytes, DMA_TO_DEVICE); ++ xdr_off += sge_bytes; ++ if (ib_dma_mapping_error(xprt->sc_cm_id->device, ++ sge[sge_no].addr)) ++ goto err; ++ atomic_inc(&xprt->sc_dma_used); ++ sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->count++; +- ctxt->frmr = vec->frmr; + sge_off = 0; + sge_no++; + xdr_sge_no++; +@@ -369,7 +212,6 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + return 0; + err: + svc_rdma_unmap_dma(ctxt); +- svc_rdma_put_frmr(xprt, vec->frmr); + svc_rdma_put_context(ctxt, 0); + /* Fatal error, close transport */ + return -EIO; +@@ -397,10 +239,7 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + +- if (vec->frmr) +- max_write = vec->frmr->map_len; +- else +- max_write = xprt->sc_max_sge * PAGE_SIZE; ++ max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; +@@ -472,10 +311,7 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + +- if (vec->frmr) +- max_write = 
vec->frmr->map_len; +- else +- max_write = xprt->sc_max_sge * PAGE_SIZE; ++ max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* xdr offset starts at RPC message */ + nchunks = ntohl(arg_ary->wc_nchunks); +@@ -545,7 +381,6 @@ static int send_reply(struct svcxprt_rdma *rdma, + int byte_count) + { + struct ib_send_wr send_wr; +- struct ib_send_wr inv_wr; + int sge_no; + int sge_bytes; + int page_no; +@@ -559,7 +394,6 @@ static int send_reply(struct svcxprt_rdma *rdma, + "svcrdma: could not post a receive buffer, err=%d." + "Closing transport %p.\n", ret, rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); +- svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 0); + return -ENOTCONN; + } +@@ -567,11 +401,6 @@ static int send_reply(struct svcxprt_rdma *rdma, + /* Prepare the context */ + ctxt->pages[0] = page; + ctxt->count = 1; +- ctxt->frmr = vec->frmr; +- if (vec->frmr) +- set_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); +- else +- clear_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags); + + /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].lkey = rdma->sc_dma_lkey; +@@ -590,21 +419,15 @@ static int send_reply(struct svcxprt_rdma *rdma, + int xdr_off = 0; + sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); + byte_count -= sge_bytes; +- if (!vec->frmr) { +- ctxt->sge[sge_no].addr = +- dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, +- sge_bytes, DMA_TO_DEVICE); +- xdr_off += sge_bytes; +- if (ib_dma_mapping_error(rdma->sc_cm_id->device, +- ctxt->sge[sge_no].addr)) +- goto err; +- atomic_inc(&rdma->sc_dma_used); +- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; +- } else { +- ctxt->sge[sge_no].addr = (unsigned long) +- vec->sge[sge_no].iov_base; +- ctxt->sge[sge_no].lkey = vec->frmr->mr->lkey; +- } ++ ctxt->sge[sge_no].addr = ++ dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, ++ sge_bytes, DMA_TO_DEVICE); ++ xdr_off += sge_bytes; ++ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ++ ctxt->sge[sge_no].addr)) ++ goto err; ++ atomic_inc(&rdma->sc_dma_used); ++ ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].length = sge_bytes; + } + BUG_ON(byte_count != 0); +@@ -627,6 +450,7 @@ static int send_reply(struct svcxprt_rdma *rdma, + ctxt->sge[page_no+1].length = 0; + } + rqstp->rq_next_page = rqstp->rq_respages + 1; ++ + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; +@@ -635,15 +459,6 @@ static int send_reply(struct svcxprt_rdma *rdma, + send_wr.num_sge = sge_no; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; +- if (vec->frmr) { +- /* Prepare INVALIDATE WR */ +- memset(&inv_wr, 0, sizeof inv_wr); +- inv_wr.opcode = IB_WR_LOCAL_INV; +- inv_wr.send_flags = IB_SEND_SIGNALED; +- inv_wr.ex.invalidate_rkey = +- vec->frmr->mr->lkey; +- send_wr.next = &inv_wr; +- } + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) +@@ -653,7 +468,6 @@ static int send_reply(struct svcxprt_rdma *rdma, + + err: + svc_rdma_unmap_dma(ctxt); +- svc_rdma_put_frmr(rdma, vec->frmr); + svc_rdma_put_context(ctxt, 1); + return -EIO; + } +diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c +index 02db8d9..e7323fb 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. 
+ * + * This software is available to you under a choice of one of two +@@ -162,7 +163,6 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + } + map->count = 0; +- map->frmr = NULL; + return map; + } + +@@ -338,22 +338,21 @@ static void process_context(struct svcxprt_rdma *xprt, + + switch (ctxt->wr_op) { + case IB_WR_SEND: +- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) +- svc_rdma_put_frmr(xprt, ctxt->frmr); ++ BUG_ON(ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: ++ BUG_ON(ctxt->frmr); + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: ++ svc_rdma_put_frmr(xprt, ctxt->frmr); + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); +- if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) +- svc_rdma_put_frmr(xprt, ctxt->frmr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, +@@ -365,6 +364,7 @@ static void process_context(struct svcxprt_rdma *xprt, + break; + + default: ++ BUG_ON(1); + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); +@@ -380,29 +380,42 @@ static void process_context(struct svcxprt_rdma *xprt, + static void sq_cq_reap(struct svcxprt_rdma *xprt) + { + struct svc_rdma_op_ctxt *ctxt = NULL; +- struct ib_wc wc; ++ struct ib_wc wc_a[6]; ++ struct ib_wc *wc; + struct ib_cq *cq = xprt->sc_sq_cq; + int ret; + ++ memset(wc_a, 0, sizeof(wc_a)); ++ + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); + atomic_inc(&rdma_stat_sq_poll); +- while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { +- if (wc.status != IB_WC_SUCCESS) +- /* Close the transport */ +- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); ++ while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) { ++ int i; + +- /* Decrement used SQ WR count */ +- atomic_dec(&xprt->sc_sq_count); +- wake_up(&xprt->sc_send_wait); ++ for (i = 0; i < ret; i++) { ++ wc = &wc_a[i]; ++ if (wc->status != IB_WC_SUCCESS) { ++ dprintk("svcrdma: sq wc err status %d\n", ++ wc->status); + +- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; +- if (ctxt) +- process_context(xprt, ctxt); ++ /* Close the transport */ ++ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); ++ } + +- svc_xprt_put(&xprt->sc_xprt); ++ /* Decrement used SQ WR count */ ++ atomic_dec(&xprt->sc_sq_count); ++ wake_up(&xprt->sc_send_wait); ++ ++ ctxt = (struct svc_rdma_op_ctxt *) ++ (unsigned long)wc->wr_id; ++ if (ctxt) ++ process_context(xprt, ctxt); ++ ++ svc_xprt_put(&xprt->sc_xprt); ++ } + } + + if (ctxt) +@@ -995,7 +1008,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: +- if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { ++ if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { ++ need_dma_mr = 1; ++ dma_mr_acc = IB_ACCESS_LOCAL_WRITE; ++ } else if (!(devattr.device_cap_flags & ++ IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else +@@ -1192,14 +1209,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* +- * If there are fewer SQ WR available than required to send a +- * simple response, return false. 
+- */ +- if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) +- return 0; +- +- /* +- * ...or there are already waiters on the SQ, ++ * If there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch b/linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch new file mode 100644 index 0000000..2ecff31 --- /dev/null +++ b/linux-next-cherry-picks/0131-svcrdma-Fence-LOCAL_INV-work-requests.patch @@ -0,0 +1,31 @@ +From 83710fc753d2ae158aa3cb7a7966d9c1bd05b792 Mon Sep 17 00:00:00 2001 +From: Steve Wise +Date: Thu, 5 Jun 2014 09:54:31 -0500 +Subject: [PATCH 132/132] svcrdma: Fence LOCAL_INV work requests + +Fencing forces the invalidate to only happen after all prior send +work requests have been completed. + +Signed-off-by: Steve Wise +Reported by : Devesh Sharma +Signed-off-by: J. Bruce Fields +--- + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +index 52d9f2c..8f92a61 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -338,7 +338,7 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + memset(&inv_wr, 0, sizeof(inv_wr)); + inv_wr.wr_id = (unsigned long)ctxt; + inv_wr.opcode = IB_WR_LOCAL_INV; +- inv_wr.send_flags = IB_SEND_SIGNALED; ++ inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; + inv_wr.ex.invalidate_rkey = frmr->mr->lkey; + } + ctxt->wr_op = read_wr.opcode; +-- +1.7.1 + diff --git a/linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch b/linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch new file mode 100644 index 0000000..387f601 --- /dev/null +++ b/linux-next-cherry-picks/0132-svcrdma-send_write-must-not-overflow.patch @@ -0,0 +1,129 @@ +commit 255942907e7ff498ab1545b5edce5690833ff640 +Author: Steve Wise +Date: Wed Jul 9 13:49:15 2014 -0500 + + svcrdma: send_write() must not overflow the device's max sge + + Function send_write() must stop creating sges when it reaches the device + max and return the amount sent in the RDMA Write to the caller. + + Signed-off-by: Steve Wise + Signed-off-by: J. 
Bruce Fields + +diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +index 49fd21a..9f1b506 100644 +--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -192,6 +192,8 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + xdr_sge_no++; + BUG_ON(xdr_sge_no > vec->count); + bc -= sge_bytes; ++ if (sge_no == xprt->sc_max_sge) ++ break; + } + + /* Prepare WRITE WR */ +@@ -209,7 +211,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + atomic_inc(&rdma_stat_write); + if (svc_rdma_send(xprt, &write_wr)) + goto err; +- return 0; ++ return write_len - bc; + err: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 0); +@@ -225,7 +227,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, + { + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + int write_len; +- int max_write; + u32 xdr_off; + int chunk_off; + int chunk_no; +@@ -239,8 +240,6 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + +- max_write = xprt->sc_max_sge * PAGE_SIZE; +- + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; +@@ -260,23 +259,21 @@ static int send_write_chunks(struct svcxprt_rdma *xprt, + write_len); + chunk_off = 0; + while (write_len) { +- int this_write; +- this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + ntohl(arg_ch->rs_handle), + rs_offset + chunk_off, + xdr_off, +- this_write, ++ write_len, + vec); +- if (ret) { ++ if (ret <= 0) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } +- chunk_off += this_write; +- xdr_off += this_write; +- xfer_len -= this_write; +- write_len -= this_write; ++ chunk_off += ret; ++ xdr_off += ret; ++ xfer_len -= ret; ++ write_len -= ret; + } + } + /* Update the req with the number of chunks actually used */ +@@ -293,7 +290,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, + { + u32 xfer_len = rqstp->rq_res.len; + int write_len; +- int max_write; + u32 xdr_off; + int chunk_no; + int chunk_off; +@@ -311,8 +307,6 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + +- max_write = xprt->sc_max_sge * PAGE_SIZE; +- + /* xdr offset starts at RPC message */ + nchunks = ntohl(arg_ary->wc_nchunks); + for (xdr_off = 0, chunk_no = 0; +@@ -330,24 +324,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt, + write_len); + chunk_off = 0; + while (write_len) { +- int this_write; +- +- this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + ntohl(ch->rs_handle), + rs_offset + chunk_off, + xdr_off, +- this_write, ++ write_len, + vec); +- if (ret) { ++ if (ret <= 0) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } +- chunk_off += this_write; +- xdr_off += this_write; +- xfer_len -= this_write; +- write_len -= this_write; ++ chunk_off += ret; ++ xdr_off += ret; ++ xfer_len -= ret; ++ write_len -= ret; + } + } + /* Update the req with the number of chunks actually used */ diff --git a/linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch b/linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch new file mode 100644 index 0000000..d492471 --- /dev/null +++ b/linux-next-cherry-picks/0133-nfsrdma-backport-fixes.patch @@ -0,0 +1,48 @@ +Index: 
compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +=================================================================== +--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c ++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +@@ -92,7 +92,9 @@ static void rdma_build_arg_xdr(struct sv + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0)) + rqstp->rq_next_page = rqstp->rq_respages + 1; ++#endif + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length +@@ -167,7 +169,9 @@ static int rdma_read_chunk_lcl(struct sv + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0)) + rqstp->rq_next_page = rqstp->rq_respages + 1; ++#endif + ctxt->sge[pno].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], pg_off, +@@ -272,7 +276,9 @@ static int rdma_read_chunk_frmr(struct s + if (!pg_off) + head->count++; + rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0)) + rqstp->rq_next_page = rqstp->rq_respages + 1; ++#endif + frmr->page_list->page_list[pno] = + ib_dma_map_page(xprt->sc_cm_id->device, + head->arg.pages[pg_no], 0, +Index: compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c +=================================================================== +--- compat-rdma.orig/net/sunrpc/xprtrdma/svc_rdma_sendto.c ++++ compat-rdma/net/sunrpc/xprtrdma/svc_rdma_sendto.c +@@ -446,7 +446,9 @@ static int send_reply(struct svcxprt_rdm + if (page_no+1 >= sge_no) + ctxt->sge[page_no+1].length = 0; + } ++#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,7,0)) + rqstp->rq_next_page = rqstp->rq_respages + 1; ++#endif + + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); diff --git a/linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch b/linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch deleted file mode 100644 index d3ad959..0000000 --- a/linux-next-pending/0024-SUNRPC-Fix-large_reads-on-NFS-RDMA.patch +++ /dev/null @@ -1,46 +0,0 @@ -commit 2b7bbc963da8d076f263574af4138b5df2e1581f -Author: Chuck Lever -Date: Wed Mar 12 12:51:30 2014 -0400 - - SUNRPC: Fix large reads on NFS/RDMA - - After commit a11a2bf4, "SUNRPC: Optimise away unnecessary data moves - in xdr_align_pages", Thu Aug 2 13:21:43 2012, READs larger than a - few hundred bytes via NFS/RDMA no longer work. This commit exposed - a long-standing bug in rpcrdma_inline_fixup(). - - I reproduce this with an rsize=4096 mount using the cthon04 basic - tests. Test 5 fails with an EIO error. - - For my reproducer, kernel log shows: - - NFS: server cheating in read reply: count 4096 > recvd 0 - - rpcrdma_inline_fixup() is zeroing the xdr_stream::page_len field, - and xdr_align_pages() is now returning that value to the READ XDR - decoder function. - - That field is set up by xdr_inline_pages() by the READ XDR encoder - function. As far as I can tell, it is supposed to be left alone - after that, as it describes the dimensions of the reply xdr_stream, - not the contents of that stream. 
- - Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=68391 - Signed-off-by: Chuck Lever - Signed-off-by: Trond Myklebust - -diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c -index e03725b..96ead52 100644 ---- a/net/sunrpc/xprtrdma/rpc_rdma.c -+++ b/net/sunrpc/xprtrdma/rpc_rdma.c -@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) - break; - page_base = 0; - } -- rqst->rq_rcv_buf.page_len = olen - copy_len; -- } else -- rqst->rq_rcv_buf.page_len = 0; -+ } - - if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { - curlen = copy_len; diff --git a/linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch b/linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch deleted file mode 100644 index abd5a0a..0000000 --- a/linux-next-pending/0025-NFSRDMA-Fix-regression-in-NFSRDMA-server.patch +++ /dev/null @@ -1,66 +0,0 @@ -Fix regression in NFSRDMA server - -From: Tom Tucker - -The server regression was caused by the addition of rq_next_page -(afc59400d6c65bad66d4ad0b2daf879cbff8e23e). There were a few places that -were missed with the update of the rq_respages array. - -NOTE: Patch modified to apply against OFED. - -Signed-off-by: Tom Tucker -Tested-by: Steve Wise - ---- - ---- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2014-03-31 15:31:05.214903226 -0500 -+++ a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 2014-03-31 15:34:40.042047141 -0500 -@@ -90,6 +90,9 @@ static void rdma_build_arg_xdr(struct sv - sge_no++; - } - rqstp->rq_respages = &rqstp->rq_pages[sge_no]; -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)) -+ rqstp->rq_next_page = rqstp->rq_respages + 1; -+#endif - - /* We should never run out of SGE because the limit is defined to - * support the max allowed RPC data length -@@ -169,6 +172,9 @@ static int map_read_chunks(struct svcxpr - */ - head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)) -+ rqstp->rq_next_page = rqstp->rq_respages + 1; -+#endif - - byte_count -= sge_bytes; - ch_bytes -= sge_bytes; -@@ -276,6 +282,9 @@ static int fast_reg_read_chunks(struct s - - /* rq_respages points one past arg pages */ - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)) -+ rqstp->rq_next_page = rqstp->rq_respages + 1; -+#endif - - /* Create the reply and chunk maps */ - offset = 0; -@@ -527,9 +536,6 @@ next_sge: - #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)) - while (rqstp->rq_resused) - rqstp->rq_respages[--rqstp->rq_resused] = NULL; --#else -- while (rqstp->rq_next_page != rqstp->rq_respages) -- *(--rqstp->rq_next_page) = NULL; - #endif - - return err; -@@ -558,7 +564,7 @@ static int rdma_read_complete(struct svc - #if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)) - rqstp->rq_resused = 0; - #else -- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; -+ rqstp->rq_next_page = rqstp->rq_respages + 1; - #endif - - /* Rebuild rq_arg head and tail. 
*/ diff --git a/patches/0023-nfsrdma-Backport-for-rhel6.5.patch b/patches/0023-nfsrdma-Backport-for-rhel6.5.patch index 6d184fb..c9a0bd6 100644 --- a/patches/0023-nfsrdma-Backport-for-rhel6.5.patch +++ b/patches/0023-nfsrdma-Backport-for-rhel6.5.patch @@ -55,28 +55,15 @@ diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_r index xxxxxxx..xxxxxxx xxxxxx --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c -@@ -524,8 +524,13 @@ next_sge: - * Detach res pages. If svc_release sees any it will attempt to - * put them. - */ -+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)) -+ while (rqstp->rq_resused) -+ rqstp->rq_respages[--rqstp->rq_resused] = NULL; -+#else - while (rqstp->rq_next_page != rqstp->rq_respages) - *(--rqstp->rq_next_page) = NULL; -+#endif - - return err; - } -@@ -550,7 +555,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp, +@@ -550,7 +556,11 @@ static int rdma_read_complete(struct svc_rqst *rqstp, /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; +- rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)) + rqstp->rq_resused = 0; +#else - rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; ++ rqstp->rq_next_page = rqstp->rq_respages + 1; +#endif /* Rebuild rq_arg head and tail. */ diff --git a/patches/0026-nfsrdma-Backport-for-sles11sp3.patch b/patches/0026-nfsrdma-Backport-for-sles11sp3.patch index 84bb97e..ec2fc34 100644 --- a/patches/0026-nfsrdma-Backport-for-sles11sp3.patch +++ b/patches/0026-nfsrdma-Backport-for-sles11sp3.patch @@ -20,31 +20,3 @@ index xxxxxxx..xxxxxxx xxxxxx xprt_rdma_slot_table_entries); if (xprt == NULL) { dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", -@@ -450,8 +452,15 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) - } - - static int -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS) - xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -+#else -+xprt_rdma_reserve_xprt(struct rpc_task *task) -+#endif - { -+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) && !defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS) -+ struct rpc_xprt *xprt = task->tk_xprt; -+#endif - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int credits = atomic_read(&r_xprt->rx_buf.rb_credits); - -@@ -463,7 +472,11 @@ xprt_rdma_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) - BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); - } - xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; -+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) || defined (CONFIG_COMPAT_XPRT_RESERVE_XPRT_CONG_2PARAMS) - return xprt_reserve_xprt_cong(xprt, task); -+#else -+ return xprt_reserve_xprt_cong(task); -+#endif - } - - /* -- 2.41.0