From 4aff960ae9882b206295168a3181b21af6fcc49e Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Fri, 16 May 2014 10:04:21 -0700 Subject: [PATCH] mpxyd: MIC scale-up issue with MPI gather workloads, I_MPI_FABRICS=dapl:dapl Issue with the shared proxy-in buffer pool when RDMA reads complete out of order across QPs. The tail adjustment performed when a read completes fails to walk the entire queue and process the head entry. Signed-off-by: Arlin Davis --- dapl/svc/mpxy_in.c | 77 ++++++++++++++++++++++++++------------------- dapl/svc/mpxy_out.c | 21 ++++++++++--- 2 files changed, 61 insertions(+), 37 deletions(-) diff --git a/dapl/svc/mpxy_in.c b/dapl/svc/mpxy_in.c index 7bbad03..ea00f56 100644 --- a/dapl/svc/mpxy_in.c +++ b/dapl/svc/mpxy_in.c @@ -296,7 +296,7 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r } /* out of order, mark complete, move proxy buffer tail until empty slot */ - while (idx != smd->m_buf_hd_r) { + while (smd->m_buf_tl_r != smd->m_buf_hd_r) { if (smd->m_buf_wc_r[idx].done && !empty_slot) { smd->m_tl_r = m_idx; smd->m_buf_wc_r[idx].m_idx = 0; @@ -308,16 +308,20 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r } else empty_slot = 1; + if (idx == smd->m_buf_hd_r) + break; + idx = (idx + 1) & smd->m_buf_end_r; } - if (log_level < 8) { - m_wr_rx->m_idx = 0; - return; - } + if (log_level < 8) + goto done; + for (idx = s_idx;;) { - mlog(8," SMD %p - OutofOrder: tl %d hd %d buf_wc_tl[%d].m_idx=0x%x != m_idx 0x%x\n", - smd, s_idx, smd->m_buf_hd_r, idx, smd->m_buf_wc_r[idx].m_idx, m_idx); + mlog(8," SMD %p - OutofOrder: tl %d hd %d buf_wc_tl[%d].m_idx=0x%x %s m_idx 0x%x %s\n", + smd, s_idx, smd->m_buf_hd_r, idx, smd->m_buf_wc_r[idx].m_idx, + smd->m_buf_wc_r[idx].m_idx == m_idx ? "==":"!=", + m_idx, smd->m_buf_wc_r[idx].done ? 
"DONE":"BUSY"); if (idx == smd->m_buf_hd_r) break; idx = (idx + 1) & smd->m_buf_end_r; @@ -338,6 +342,7 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r if (idx == m_qp->wr_hd_r) break; } +done: m_wr_rx->m_idx = 0; } @@ -439,11 +444,12 @@ static int m_pi_send_wc(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx, int status struct ibv_qp *ib_qp; int wc_idx, ret; - mlog(0x10," WC_rem: wr_rx[%d] %p wc_hd %d flgs %x WR_r tl %d-%d" + mlog(0x10,"[%d:%d:%d] WC_rem: wr_rx[%d] %p wc_hd %d flgs %x WR_r tl %d-%d" " wt %d hd %d wr_id %Lx org_id %Lx\n", - wr_rx->w_idx, wr_rx, m_qp->wc_hd_rem, wr_rx->flags, - m_qp->wr_tl_r, wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, - wr_rx->wr.wr_id, wr_rx->org_id); + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid, + wr_rx->w_idx, wr_rx, m_qp->wc_hd_rem, wr_rx->flags, m_qp->wr_tl_r, + wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->wr.wr_id, + wr_rx->org_id); /* local WR and remote WR are serialized, should never reach tail of remote WR */ if (((m_qp->wc_hd_rem + 1) & m_qp->wrc.wc_end) == m_qp->wc_tl_rem) { @@ -801,10 +807,11 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) m_qp->stall_cnt_rr++; MCNTR(smd->md, MCM_MX_RR_STALL); - mlog(0, " WARNING: WR_rx[%d] RR stalled (%d)" - " low memory (%p-%p) hd 0x%x tl 0x%x %x,%d\n", - wr_rx->w_idx, m_qp->stall_cnt_rr, - smd->m_buf_r, smd->m_buf_r + smd->m_len_r, + mlog(0, " WARN[%d:%d:%d] WR_rx[%d] org_id %Lx RR stall (%d)" + " low mem (%p-%p) hd 0x%x tl 0x%x %x,%d\n", + smd->md->mc->scif_id, smd->entry.tid, + m_qp->r_entry.tid, wr_rx->w_idx, wr_rx->org_id, + m_qp->stall_cnt_rr, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r, l_len, l_len); mlog(0, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n", wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, @@ -819,8 +826,9 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) if ((m_qp->pi_rr_cnt >= mcm_rr_max) && 
!(wr_rx->flags & M_READ_PAUSED)) { wr_rx->flags |= M_READ_PAUSED; m_qp->stall_cnt_rr++; - mlog(0x10, " WARNING: WR_rx[%d] max RR's, stalling (%d)" + mlog(0x10, "WARN[%d:%d:%d] WR_rx[%d] max RR's, stalling (%d)" " memory (%p-%p) hd 0x%x tl 0x%x %x,%d\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r, l_len, l_len); @@ -835,18 +843,20 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) if (wr_rx->flags & M_READ_PAUSED) { m_qp->stall_cnt_rr--; wr_rx->flags &= ~M_READ_PAUSED; - mlog(0x10, " WR_rx[%d] RR released (%d) got memory (%p-%p)" - " hd 0x%x tl 0x%x %x, need %d\n", + mlog(0x10, "[%d:%d:%d] WR_rx[%d] RR released (%d) got memory (%p-%p)" + " hd 0x%x tl 0x%x ln %x,%d\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r, l_len, l_len); } else if (m_qp->stall_cnt_rr) { wr_rx->flags |= M_READ_PAUSED; m_qp->stall_cnt_rr++; - mlog(0x10, " WARNING: WR_rx[%d] previous RR stall (%d)" + mlog(0x10, "WARN[%d:%d:%d] WR_rx[%d] previous RR stall (%d)" " memory (%p-%p) hd 0x%x tl 0x%x %x,%d\n", - wr_rx->w_idx, m_qp->stall_cnt_rr, - smd->m_buf_r, smd->m_buf_r + smd->m_len_r, + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, + wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r, + smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r, l_len, l_len); mlog(0x10, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n", wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, @@ -913,13 +923,14 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) m_qp->post_cnt_rr++; MCNTR(smd->md, MCM_QP_READ); - mlog(0x10, " wr[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x flgs %x,%x ln %d " - "r_addr,key %Lx %x to l_addr,key %Lx %x tl %d hd %d\n", + mlog(0x10, "[%d:%d:%d] WR[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x flgs %x,%x ln %d " 
+ "r_addr,key %Lx %x to l_addr,key %Lx %x tl %d hd %d, m_idx %x\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, m_qp->pi_rr_cnt, ib_wr.wr_id, ib_qp->qp_num, ib_wr.send_flags, wr_rx->flags, l_len, ib_wr.wr.rdma.remote_addr, ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey, - m_qp->wr_tl_r, m_qp->wr_hd_r); + m_qp->wr_tl_r, m_qp->wr_hd_r, wr_rx->m_idx); #if MCM_PROFILE_DBG if (m_qp->pi_rr_cnt == 1) { @@ -961,10 +972,10 @@ bail: m_pi_buf_tl(smd, wr_rx->m_idx, wr_rx); /* return buffer slot */ mpxy_unlock(&smd->rblock); - mlog(0, " WARNING: (%d,%d): wr[%d] %p RR ibv_post ERR stall (%d,%d,%d,%d):" + mlog(0, " WARN[%d] (%d,%d): wr[%d] %p RR ibv_post ERR stall (%d,%d,%d,%d):" " flgs 0x%x ln %d r_addr,key %Lx %x to l_addr,key %Lx %x" " tl %d w_tl %d hd %d\n", - ret, errno, wr_rx->w_idx, wr_rx, m_qp->pi_rr_cnt, + smd->entry.tid, ret, errno, wr_rx->w_idx, wr_rx, m_qp->pi_rr_cnt, m_qp->pi_rw_cnt, m_qp->post_sig_cnt, m_qp->stall_cnt_rr, ib_wr.send_flags, l_len, ib_wr.wr.rdma.remote_addr, ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey, @@ -1201,9 +1212,11 @@ void m_pi_pending_wr(struct mcm_qp *m_qp, int *data) wr_rx->flags &= ~M_READ_WRITE_TO; MCNTR(smd->md, MCM_SCIF_WRITE_TO_DONE); - mlog(4, " WR_rx[%d] wr %p writeto complete! flgs 0x%x tl %d w_tl %d hd %d org_id %Lx\n", - wr_rx->w_idx, wr_rx, wr_rx->flags, - m_qp->wr_tl_r, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->wr.wr_id); + mlog(0x10, " WR_rx[%d] wr %p scif_wt DONE! 
flgs 0x%x" + " tl %d w_tl %d hd %d org_id %Lx m_idx %x\n", + wr_rx->w_idx, wr_rx, wr_rx->flags, + m_qp->wr_tl_r, m_qp->wr_tl_r_wt, m_qp->wr_hd_r, + wr_rx->wr.wr_id, wr_rx->m_idx); m_qp->post_cnt_wt--; if (wr_rx->m_idx) { @@ -1249,12 +1262,12 @@ void m_pi_pending_wr(struct mcm_qp *m_qp, int *data) } else if (wr_rx->flags & M_READ_PAUSED) { - mlog(0x10, " RR PAUSED: qp %p tl %d hd %d idx %d wr %p wr_id %p," + mlog(0x4, " RR PAUSED: qp %p tl %d hd %d idx %d wr %p wr_id %p," " addr %p sz %d sflg 0x%x mflg 0x%x\n", m_qp, m_qp->wr_tl_r, m_qp->wr_hd_r, wr_idx, wr_rx, wr_rx->org_id, wr_rx->sg[1].addr, wr_rx->sg[1].length, wr_rx->wr.send_flags, wr_rx->flags); - mlog(0x10, " WR_rx[%d] RR stall (pnd %d stl %d cnt %d max %d)" + mlog(0x4, " WR_rx[%d] RR stall (pnd %d stl %d cnt %d max %d)" " memory (%p-%p) hd 0x%x tl 0x%x %x\n", wr_rx->w_idx, m_qp->pi_rr_cnt, m_qp->stall_cnt_rr, wr_cnt, wr_max, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, @@ -1265,7 +1278,7 @@ void m_pi_pending_wr(struct mcm_qp *m_qp, int *data) /* no progress or RR posted needs completion processing */ if ((wr_rx->flags & M_READ_PAUSED) || (m_qp->pi_rr_cnt >= 10)) { - mlog(0x10, " pi_rr_cnt (%d) > 10, DONE!\n", m_qp->pi_rr_cnt); + mlog(0x4, " PAUSED or pi_rr_cnt %d > 10, exit\n", m_qp->pi_rr_cnt); goto done; } } diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c index 038442b..ba2d868 100644 --- a/dapl/svc/mpxy_out.c +++ b/dapl/svc/mpxy_out.c @@ -132,7 +132,7 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx) } /* out of order, mark complete, move proxy buffer tail until empty slot */ - while (idx != smd->m_buf_hd) { + while (smd->m_buf_tl != smd->m_buf_hd) { if (smd->m_buf_wc[idx].done && !empty_slot) { smd->m_tl = m_idx; smd->m_buf_wc[idx].m_idx = 0; @@ -144,6 +144,9 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx) } else empty_slot = 1; + if (idx == smd->m_buf_hd) + break; + idx = (idx + 1) & smd->m_buf_end; } mlog(8,"return, NOT 1st slot\n"); @@ -227,12 +230,18 @@ static int 
m_po_send_pi(struct mcm_qp *m_qp, struct mcm_wr *m_wr, int wr_idx) else ib_qp = m_qp->ib_qp2; - mlog(4, " RW_imm post: wr_id %Lx qn %x op %d flgs %x" + mlog(0x4, " RW_imm: wr_id %Lx qn %x op %d flgs %x" " idata %x wr_rx: raddr %p rkey %x ln %d tl %d me %d hd %d\n", wr.wr_id, ib_qp->qp_num, wr.opcode, wr.send_flags, ntohl(wr.imm_data), wr.wr.rdma.remote_addr, wr.wr.rdma.rkey, m_wr->sg[0].length, m_qp->wr_tl, wr_idx, m_qp->wr_hd); + mlog(0x10, "[%d:%d:%d] RW_wr[%d]: %p org_id %Lx op %d flgs %d imm 0x%x" + " raddr %p rkey %x\n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->t_entry.tid, + wr_idx, m_wr, m_wr->wr.wr_id, m_wr->wr.opcode, m_wr->wr.send_flags, + m_wr->wr.imm_data, m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey); + errno = 0; ret = ibv_post_send(ib_qp, &wr, &bad_wr); if (ret) { @@ -861,8 +870,9 @@ void m_po_wc_event(struct mcm_qp *m_qp, struct mcm_wc_rx *wc_rx, int wc_idx) } } - mlog(0x10," mb_tl %Lx->%x, m_hd %Lx wr_tl %d->%d wr_id %d wr_hd %d wc_tl %d->%d - pst %d,%d cmp %d\n", - m_qp->smd->m_tl, m_wr->m_idx, m_qp->smd->m_hd, m_qp->wr_tl, wc_rx->wr_tl, + mlog(0x10," [%d:%d] mb_tl %Lx->%x, m_hd %Lx wr_tl %d->%d wr_id %d wr_hd %d wc_tl %d->%d - pst %d,%d cmp %d\n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->smd->m_tl, m_wr->m_idx, + m_qp->smd->m_hd, m_qp->wr_tl, wc_rx->wr_tl, m_wr->w_idx, m_qp->wr_hd, m_qp->wc_tl, wc_idx, m_qp->post_cnt, m_qp->post_sig_cnt, m_qp->comp_cnt); @@ -1019,7 +1029,8 @@ retry: m_qp->wr_tl = m_wr->w_idx; /* move QP wr tail */ mpxy_unlock(&m_qp->txlock); } - mlog(0x10," mb_tl %Lx hd %Lx: WR tl %d idx %d hd %d: QP pst %d,%d cmp %d - %s\n", + mlog(0x10," [%d:%d] mb_tl %Lx hd %Lx: WR tl %d idx %d hd %d: QP pst %d,%d cmp %d - %s\n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->smd->m_tl, m_qp->smd->m_hd, m_qp->wr_tl, m_wr->w_idx, m_qp->wr_hd, m_qp->post_cnt, m_qp->post_sig_cnt, m_qp->comp_cnt, mcm_map_str(m_qp->cm->msg.daddr1.ep_map)); -- 2.46.0