From 80b76a3501add7905c3381aa60b1738c572b893f Mon Sep 17 00:00:00 2001
From: Arlin Davis
Date: Mon, 16 Jun 2014 09:36:23 -0700
Subject: [PATCH] mcm: alltoall hangs at scale with MXS, MSS, HST intranode
 configurations

The HST-based MCM provider can drop consumer (MPI) request events if the
consumer uses shared CQs across HST->MSS and HST->MXS connections and the
CQ events are processed in the PI progress thread.

Rename the mcm_rcv_pi_event function to mcm_dto_event and add support to
process both direct RW, RW_imm, and SND requests (HST->MSS or HST) and
proxy-in RW_imm requests (HST->MXS).

Signed-off-by: Arlin Davis
---
 dapl/openib_mcm/cm.c           |   5 +-
 dapl/openib_mcm/dapl_ib_util.h |   2 +-
 dapl/openib_mcm/proxy.c        | 131 +++++++++++++++++++++------------
 3 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c
index d41ca09..3c9d128 100644
--- a/dapl/openib_mcm/cm.c
+++ b/dapl/openib_mcm/cm.c
@@ -1624,6 +1624,7 @@ dapls_ib_connect(IN DAT_EP_HANDLE ep_handle,
 
 	/* remote hca and port: lid, gid, network order */
 	dapl_os_memcpy(&cm->msg.daddr1, r_addr, sizeof(struct dat_mcm_addr));
+	dapl_os_memcpy(&cm->msg.daddr2, r_addr, sizeof(struct dat_mcm_addr));
 
 	/* validate port and ep_map range */
 	if ((mcm_ia->port > 2) || (mcm_ia->ep_map > 3))
@@ -2053,8 +2054,8 @@ void cm_thread(void *arg)
 		while (m_cq) {
 			dapl_fd_set(m_cq->cq->channel->fd, set, DAPL_FD_READ);
-			dapl_log(DAPL_DBG_TYPE_CM, " cm_thread: mcm_rcv_pi_event(%p)\n", m_cq);
-			mcm_rcv_pi_event(m_cq);
+			dapl_log(DAPL_DBG_TYPE_CM, " cm_thread: mcm_pio_event(%p)\n", m_cq);
+			mcm_dto_event(m_cq);
 			m_cq = dapl_llist_next_entry(
 				&hca->ib_trans.cqlist,
 				(DAPL_LLIST_ENTRY *)&m_cq->entry);
 		}
diff --git a/dapl/openib_mcm/dapl_ib_util.h b/dapl/openib_mcm/dapl_ib_util.h
index 8cc7d97..08390dc 100644
--- a/dapl/openib_mcm/dapl_ib_util.h
+++ b/dapl/openib_mcm/dapl_ib_util.h
@@ -173,7 +173,7 @@ DAT_RETURN dapls_modify_qp_rtu(struct ibv_qp *qp, uint32_t qpn, uint16_t lid, ib
 /* HST->MXS (MIC xsocket) remote PI communication, proxy.c */
 int mcm_send_pi(ib_qp_handle_t m_qp, int len, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
 int mcm_post_rcv_wc(struct dcm_ib_qp *m_qp, int cnt);
-void mcm_rcv_pi_event(struct dcm_ib_cq *m_cq);
+void mcm_dto_event(struct dcm_ib_cq *m_cq);
 int mcm_create_wc_q(struct dcm_ib_qp *m_qp, int entries);
 void mcm_destroy_wc_q(struct dcm_ib_qp *m_qp);
 int mcm_create_pi_cq(struct dcm_ib_qp *m_qp, int len);
diff --git a/dapl/openib_mcm/proxy.c b/dapl/openib_mcm/proxy.c
index b2e0e9a..36c9b35 100644
--- a/dapl/openib_mcm/proxy.c
+++ b/dapl/openib_mcm/proxy.c
@@ -53,8 +53,9 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct
 	uint32_t wr_flags, offset=0;
 
 	dapl_log(DAPL_DBG_TYPE_EP,
-		 " mcm_send_pi: len %d ib_wr %p, WR: tl %d hd %d end %d\n",
-		 len, wr, m_qp->wr_tl, m_qp->wr_hd, m_qp->wrc_rem.wr_end);
+		 " mcm_send_pi: ep %p qpn %x ln %d WR: tl %d hd %d end %d wr_id %Lx\n",
+		 m_qp->ep, m_qp->qp2->qp_num, len, m_qp->wr_tl,
+		 m_qp->wr_hd, m_qp->wrc_rem.wr_end, wr->wr_id);
 
 	if (wr->num_sge > DAT_MIX_SGE_MAX) {
 		ret = EINVAL;
@@ -81,7 +82,7 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct
 			wr_flags |= M_SEND_MP_SIG;
 		dapl_os_unlock(&m_qp->lock);
 
-		dapl_log(DAPL_DBG_TYPE_EP,
+		dapl_log(DAPL_DBG_TYPE_EVD,
 			 " mcm_send_pi[%d]: ln %d wr_idx %d, tl %d hd %d\n",
 			 i, wr->sg_list[i].length, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
 
@@ -111,7 +112,7 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct
 		wrc.flags = 0;
 
 		/* setup local WR for wr_rx transfer - RW_imm inline */
-		wr_imm.wr_id = (uint64_t)(uintptr_t)m_qp;
+		wr_imm.wr_id = wr->wr_id; /* MUST be original cookie, CQ processing */
 		wr_imm.next = 0;
 		wr_imm.sg_list = &sge;
 		wr_imm.num_sge = 1;
@@ -129,14 +130,14 @@ int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct
 		sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
 		sge.lkey = 0; /* inline doesn't need registered */
 
-		dapl_log(DAPL_DBG_TYPE_EP,
+		dapl_log(DAPL_DBG_TYPE_EVD,
 			 " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
 			 " imm %x raddr %p rkey %x ln %d\n",
 			 i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode, wr_flags,
 			 ntohl(wr_imm.imm_data), wr_imm.wr.rdma.remote_addr,
 			 wr_imm.wr.rdma.rkey, sizeof(struct mcm_wr_rx));
 
-		dapl_log(DAPL_DBG_TYPE_EP,
+		dapl_log(DAPL_DBG_TYPE_EVD,
 			 " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
 			 " imm %x raddr %p rkey %x ln %d tl %d me %d hd %d\n",
 			 i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
@@ -167,34 +168,80 @@ bail:
 	return ret;
 }
 
-/* Work completion of RW data to remote PI, remote RR completion */
-static inline void mcm_rcv_wc(struct dcm_ib_cq *m_cq, struct dcm_ib_qp *m_qp, struct wrc_idata *wrc)
+/* TX - RW_imm work request data to remote PI or consumer TX data direct to peer */
+static inline void mcm_dto_req(struct dcm_ib_cq *m_cq, struct ibv_wc *wc)
+{
+	DAPL_COOKIE *cookie;
+	struct dcm_ib_qp *m_qp;
+
+	cookie = (DAPL_COOKIE *)(uintptr_t)wc->wr_id;
+	m_qp = cookie->ep->qp_handle;
+
+	if (!m_qp->tp->scif_ep && MXS_EP(m_qp) &&
+	    (wc->opcode == (uint32_t)IBV_WR_RDMA_WRITE_WITH_IMM)) {
+		dapl_log(DAPL_DBG_TYPE_EP,
+			 " mcm_dto_req: RW_imm -> WR, wr_id %Lx\n", wc->wr_id);
+		return; /* post_send -> RW_imm to peer PI */
+	}
+
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 " mcm_dto_req: SIG evd %p ep %p WR tl %d hd %d WC tl %d wr_id %p\n",
+		 m_qp->req_cq ? m_qp->req_cq->evd:0, m_qp->ep, m_qp->wr_tl, m_qp->wr_hd,
+		 m_qp->wc_tl, cookie);
+
+	dapls_evd_cqe_to_event(m_qp->req_cq->evd, wc);
+}
+
+/* RX work completion of RW data to remote PI, remote RR completion */
+static inline void mcm_dto_rcv(struct dcm_ib_cq *m_cq, struct ibv_wc *wc)
 {
 	struct mcm_wc_rx *m_wc;
+	struct dcm_ib_qp *m_qp = (struct dcm_ib_qp *)wc->wr_id;
+	struct wrc_idata wrc;
+
+	wrc.id = WRC_ID_DATA(ntohl(wc->imm_data));
+	wrc.type = WRC_TYPE_DATA(ntohl(wc->imm_data));
+	wrc.flags = WRC_FLAGS_DATA(ntohl(wc->imm_data));
 
-	if (wrc->id > m_qp->wrc.wc_end) {
+	if (wrc.type != M_WC_TYPE) {
 		dapl_log(DAPL_DBG_TYPE_ERR,
-			 " RX imm_data: WC id out of range %x > %x \n",
-			 wrc->id, m_qp->wrc.wc_end);
-		return;
+			 "mcm_dto_rcv: ERR imm WC type ?= 0x%x\n", wrc.type);
+		goto bail;
+	}
+
+	if (wrc.id > m_qp->wrc.wc_end) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " mcm_dto_rcv: ERR WC id out of range %x > %x \n",
+			 wrc.id, m_qp->wrc.wc_end);
+		goto bail;
 	}
-	m_wc = (struct mcm_wc_rx *)(m_qp->wrc.wc_addr + (m_qp->wrc.wc_sz * wrc->id));
+	m_wc = (struct mcm_wc_rx *)(m_qp->wrc.wc_addr + (m_qp->wrc.wc_sz * wrc.id));
 	mcm_ntoh_wc_rx(m_wc); /* convert WC contents, pushed via wire */
+
 	dapl_log(DAPL_DBG_TYPE_EP,
-		 " mcm_rcv_wc: WC id %d m_wc %p wr_id %Lx org_id %Lx flgs 0x%x\n",
-		 wrc->id, m_wc, m_wc->wc.wr_id, m_wc->org_id, m_wc->flags);
+		 " mcm_dto_rcv: MCM evd %p ep %p id %d wc %p wr_id %Lx flgs 0x%x %s\n",
+		 m_qp->req_cq->evd, m_qp->ep, wrc.id, m_wc, m_wc->wc.wr_id,
+		 m_wc->flags, m_wc->flags & M_SEND_CN_SIG ? "SIG":"NO_SIG");
"SIG":"NO_SIG"); + dapl_os_lock(&m_qp->lock); m_qp->wr_tl = m_wc->wr_tl; - m_qp->wc_tl = wrc->id; /* move wc_tl, for wc_tl_rem on peer PI service */ + m_qp->wc_tl = wrc.id; /* move wc_tl, for wc_tl_rem on peer PI service */ dapl_os_unlock(&m_qp->lock); if (m_wc->flags & M_SEND_CN_SIG) { struct ibv_wc ib_wc; + DAPL_COOKIE *cookie = (DAPL_COOKIE *)(uintptr_t) m_wc->wc.wr_id; + + dapl_log(DAPL_DBG_TYPE_EP, + " mcm_dto_rcv: MCM SIG evd %p ep %p WR tl %d hd %d WC tl %d wr_id %p\n", + m_qp->req_cq ? m_qp->req_cq->evd:0, m_qp->ep, m_qp->wr_tl, m_qp->wr_hd, + m_qp->wc_tl, cookie); + mcm_const_ib_wc(&ib_wc, &m_wc->wc, 1); dapls_evd_cqe_to_event(m_qp->req_cq->evd, &ib_wc); } - dapl_log(DAPL_DBG_TYPE_EP, - " mcm_rcv_wc: m_qp %p wr_tl %d wr_hd %d wc_tl %d \n", - m_qp, m_qp->wr_tl, m_qp->wr_tl, m_qp->wc_tl); +bail: + if (mcm_post_rcv_wc(m_qp, 1)) + dapl_log(DAPL_DBG_TYPE_ERR,"mcm_dto_rcv: recv wc repost failed\n"); } int mcm_post_rcv_wc(struct dcm_ib_qp *m_qp, int cnt) @@ -223,11 +270,18 @@ int mcm_post_rcv_wc(struct dcm_ib_qp *m_qp, int cnt) return 0; } -/* Proxy-in service - called from CM-RX thread, CQ2 is PI service +/* Proxy-in service - called from CM-RX thread + * + * This processes both TX and RX events + * rcv_cq is PI only service + * req_cq is PO-PI RW_imm or HST->Direct RW if CQ shared across QP's * * <- Work completion in (RW_imm - WC idata), local initiated RW + * -> RW_imm work requests out PO-PI + * -> RW direct from consumer post HST->Direct (remote is HST or MSS) + * */ -void mcm_rcv_pi_event(struct dcm_ib_cq *m_cq) +void mcm_dto_event(struct dcm_ib_cq *m_cq) { struct ibv_wc wc[5]; struct ibv_cq *ib_cq; @@ -256,46 +310,27 @@ retry: wc_cnt += ret; for (i=0; i WR\n"); - continue; /* post_send -> RW_imm to peer PI */ - } - - if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) { - dapl_log(DAPL_DBG_TYPE_ERR, - " PI event: ERR QPr WC op %d != RECV_RDMA_IMM, m_qp %p\n", - wc[i].opcode, m_qp); - continue; - } - dapl_log(DAPL_DBG_TYPE_THREAD," PI event: RX RW_imm <- WC\n"); - wrc.id = WRC_ID_DATA(ntohl(wc[i].imm_data)); - wrc.type = WRC_TYPE_DATA(ntohl(wc[i].imm_data)); - wrc.flags = WRC_FLAGS_DATA(ntohl(wc[i].imm_data)); - - if (wrc.type == M_WC_TYPE) - mcm_rcv_wc(m_cq, m_qp, &wrc); + /* only one expected receive event, otherwise request */ + if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) + mcm_dto_rcv(m_cq, &wc[i]); else - dapl_log(DAPL_DBG_TYPE_ERR, - "PI event: ERR RX_imm: WC type ?= 0x%x \n", wrc.type); - - err = mcm_post_rcv_wc(m_qp, 1); + mcm_dto_req(m_cq, &wc[i]); } goto retry; } -- 2.41.0