From 1b30fa751e0b7f679fa1f48a48860cfb77cf1b2b Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Fri, 21 Nov 2014 14:26:40 -0800 Subject: [PATCH] mpxyd: DTO completion ERR: status 12, op RDMA_WRITE running MPI alltoall test Running MIC scale-up configuration with mcm provider on an MXS node instead of shm causes DTO error due to heavy use of proxy-in buffer pools. Hit a corner case where proxy buffer management hd ptr crossed tl ptr due to 64-byte alignment on start when hd is less than 64 bytes behind tl. Add additional checking on PO and PI buffer management to handle the case of HD passing TL on start locations. Also changed PO processing to hold lock until hd ptr is registered with buf_wc slot management to preserve order of memory usage across threads. Reduced the size of WC queue for PO and PI buffer management. Profiling, via MCM_PROFILE, was added to monitor and trigger buffer management errors. Signed-off-by: Arlin Davis --- dapl/svc/mix.c | 11 ++-- dapl/svc/mpxy_in.c | 88 ++++++++++++++++++----------- dapl/svc/mpxy_out.c | 132 ++++++++++++++++++++++---------------------- dapl/svc/mpxyd.c | 112 +++++++++++++++++++++++++------------ dapl/svc/mpxyd.h | 29 ++++++++-- 5 files changed, 228 insertions(+), 144 deletions(-) diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c index f9058d9..0d03ff2 100644 --- a/dapl/svc/mix.c +++ b/dapl/svc/mix.c @@ -1955,7 +1955,7 @@ retry_mr: l_start = 64; l_end = l_start + len; - if (l_start < smd->m_tl && l_end > smd->m_tl) { + if (pb_full(l_start, l_end, smd->m_hd, smd->m_tl, len, smd->m_len)) { if (!retries) { MCNTR(smd->md, MCM_MX_MR_STALL); write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); @@ -1985,11 +1985,11 @@ retry_mr: mpxy_unlock(&smd->tblock); goto bail; } - mpxy_unlock(&m_qp->txlock); mpxy_unlock(&smd->tblock); + mpxy_unlock(&m_qp->txlock); sleep_usec(10000); - mpxy_lock(&smd->tblock); mpxy_lock(&m_qp->txlock); + mpxy_lock(&smd->tblock); goto retry_mr; } @@ -1999,7 +1999,10 @@ retry_mr: smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + 
smd->m_len, l_start, len, retries); } - +#ifdef MCM_PROFILE + if (l_end < smd->m_hd) + smd->m_hd_ro++; +#endif m_wr->sg->addr = (uint64_t)(smd->m_buf + l_start); m_wr->sg->lkey = smd->m_mr->lkey; m_wr->sg->length = len; diff --git a/dapl/svc/mpxy_in.c b/dapl/svc/mpxy_in.c index f612108..8c5d707 100644 --- a/dapl/svc/mpxy_in.c +++ b/dapl/svc/mpxy_in.c @@ -274,8 +274,21 @@ static int m_pi_buf_hd(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_rx smd->m_buf_wc_r[smd->m_buf_hd_r].m_idx = m_idx; smd->m_buf_wc_r[smd->m_buf_hd_r].done = 0; #ifdef MCM_PROFILE + smd->m_buf_wc_r[smd->m_buf_hd_r].hd = smd->m_hd_r; + smd->m_buf_wc_r[smd->m_buf_hd_r].tl = smd->m_tl_r; + smd->m_buf_wc_r[smd->m_buf_hd_r].ref++; smd->m_buf_wc_r[smd->m_buf_hd_r].ts = mcm_ts_us(); smd->m_buf_wc_r[smd->m_buf_hd_r].wr = (void *) m_wr_rx; + if ((smd->m_hd_ro_r != smd->m_tl_ro_r) && (smd->m_hd_r >= smd->m_tl_r)) { + struct mcm_qp *m_qp = (struct mcm_qp*)m_wr_rx->context; + mlog(0, " ERR: [%d:%d:%d] PI_buf HD(%d) passed TL(%d): w_tl %d w_hd %d:" + " wc_r[%d].m_idx=%x, m_tl %x m_hd %x\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, + smd->m_hd_ro_r, smd->m_tl_ro_r, smd->m_buf_tl_r + 1, + smd->m_buf_hd_r, smd->m_buf_hd_r, m_idx, + smd->m_tl_r, smd->m_hd_r); + mcm_check_io(); + } #endif return 0; } @@ -285,7 +298,10 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r { int s_idx, idx; int busy = 0, match = 0; - +#ifdef MCM_PROFILE + off_t tl_sav = smd->m_tl_r; + uint32_t now = mcm_ts_us(); +#endif idx = (smd->m_buf_tl_r + 1) & smd->m_buf_end_r; /* tl == hd is empty */ s_idx = idx; @@ -294,6 +310,10 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r if (smd->m_buf_wc_r[idx].m_idx == m_idx) { smd->m_buf_wc_r[idx].done = 1; match = 1; +#ifdef MCM_PROFILE + smd->m_buf_wc_r[idx].ref--; + smd->m_buf_wc_r[idx].ts = now - smd->m_buf_wc_r[idx].ts; +#endif } if (smd->m_buf_wc_r[idx].done && !busy) { smd->m_tl_r = 
smd->m_buf_wc_r[idx].m_idx; @@ -308,33 +328,18 @@ static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_r idx = (idx + 1) & smd->m_buf_end_r; } -#ifdef MCM_PROFILE - if ((log_level < 4) || (smd->m_buf_wc_r[s_idx].done)) - return; - - if (smd->m_buf_wc_r[s_idx].done) { - mlog(0x10," [%d:%d] InOrder: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x " - "%s m_idx 0x%x %s wr %p \n", - smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc_r, - s_idx, smd->m_buf_hd_r, s_idx, smd->m_buf_wc_r[s_idx].m_idx, - smd->m_buf_wc_r[s_idx].m_idx == m_idx ? "==":"!=", - m_idx, smd->m_buf_wc_r[s_idx].done ? "DONE":"BUSY", - smd->m_buf_wc_r[s_idx].wr); - return; - } - for (idx = s_idx;;) { - uint32_t now = mcm_ts_us(); +#ifdef MCM_PROFILE + if (tl_sav > smd->m_tl_r) + smd->m_tl_ro_r++; - mlog(4," [%d:%d] OutOfOrder: tl %d hd %d wc[%d].m_idx=0x%x %s m_idx 0x%x %s %d us\n", + if (!match) { + mlog(0, " [%d:%d] ERR: m_tl 0x%x m_hd 0x%x" + "- m_wc: tl %d hd %d - m_idx=0x%x NO MATCH\n", smd->md->mc->scif_id, smd->entry.tid, - s_idx, smd->m_buf_hd_r, idx, smd->m_buf_wc_r[idx].m_idx, - smd->m_buf_wc_r[idx].m_idx == m_idx ? "==":"!=", - m_idx, smd->m_buf_wc_r[idx].done ? 
"DONE":"BUSY", - now - smd->m_buf_wc_r[idx].ts); - if (idx == smd->m_buf_hd_r) - break; - idx = (idx + 1) & smd->m_buf_end_r; + smd->m_tl_r, smd->m_hd_r, smd->m_buf_tl_r, + (smd->m_buf_hd_r + 1) & smd->m_buf_end_r, m_idx); + mcm_check_io(); } #endif } @@ -738,19 +743,19 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) l_start = 64; l_end = l_start + l_len; - if (l_start < smd->m_tl_r && l_end > smd->m_tl_r) { + if (pb_full(l_start, l_end, smd->m_hd_r, smd->m_tl_r, l_len, smd->m_len_r)) { if (!(wr_rx->flags & M_READ_PAUSED)) { wr_rx->flags |= M_READ_PAUSED; m_qp->stall_cnt_rr++; MCNTR(smd->md, MCM_MX_RR_STALL); - mlog(0, " WARN[%d:%d:%d] WR_rx[%d] org_id %Lx RR stall (%d)" + mlog(1, " WARN[%d:%d:%d] WR_rx[%d] org_id %Lx RR stall (%d)" " low mem (%p-%p) hd 0x%x tl 0x%x ln %x,%d\n", smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx, wr_rx->org_id, m_qp->stall_cnt_rr, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, smd->m_hd_r, smd->m_tl_r, l_len, l_len); - mlog(0, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n", + mlog(1, " wr[%d] %p RR(%d,%d,%d): flgs %x tl %d tl_wt %d hd %d\n", wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, m_qp->pi_rr_cnt, wr_rx->flags, m_qp->wr_tl_r, m_qp->wr_tl_r_wt, m_qp->wr_hd_r); @@ -780,12 +785,12 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) if (wr_rx->flags & M_READ_PAUSED) { m_qp->stall_cnt_rr--; wr_rx->flags &= ~M_READ_PAUSED; - mlog(0x1, "[%d:%d:%d] WR_rx[%d] RR released (%d) got memory (%p-%p)" - " hd 0x%x tl 0x%x ln %x,%d\n", + mlog(0x1, "[%d:%d:%d] WR_rx[%d] RR (%d) got memory (%p-%p)" + " hd 0x%x tl 0x%x (0x%x-0x%x) ln %x,%d\n", smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, wr_rx->w_idx, m_qp->stall_cnt_rr, smd->m_buf_r, smd->m_buf_r + smd->m_len_r, - smd->m_hd_r, smd->m_tl_r, l_len, l_len); + smd->m_hd_r, smd->m_tl_r, l_start, l_end, l_len, l_len); } else if (m_qp->stall_cnt_rr) { wr_rx->flags |= M_READ_PAUSED; 
m_qp->stall_cnt_rr++; @@ -829,10 +834,15 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) (!((m_qp->post_cnt_rr+1) % mcm_rr_signal))) { ib_wr.send_flags = IBV_SEND_SIGNALED; wr_rx->m_idx = ((rbuf + (l_len - 1)) - smd->m_buf_r); - if (m_pi_buf_hd(smd, wr_rx->m_idx, wr_rx)) + if (m_pi_buf_hd(smd, wr_rx->m_idx, wr_rx)) { + mpxy_unlock(&smd->rblock); goto buf_err; + } } - +#ifdef MCM_PROFILE + if (l_end < smd->m_hd_r) + smd->m_hd_ro_r++; +#endif /* * update shared proxy-in buffer hd, save end of buffer idx * and save ref m_idx for out of order completions across QP's @@ -874,6 +884,14 @@ static void m_pi_post_read(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_rx) return; bail: mpxy_lock(&smd->rblock); + mlog(0x0, "[%d:%d:%d] ERR: WR[%d] %p RR(%d,%d,%d): wr_id %Lx qn %x f=%x,%x ln %d " + "RA %Lx %x to LA %Lx %x tl %d hd %d, m=%x\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, + wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr, + m_qp->pi_rr_cnt, ib_wr.wr_id, ib_qp->qp_num, ib_wr.send_flags, + wr_rx->flags, l_len, ib_wr.wr.rdma.remote_addr, + ib_wr.wr.rdma.rkey, ib_wr.sg_list->addr, ib_wr.sg_list->lkey, + m_qp->wr_tl_r, m_qp->wr_hd_r, wr_rx->m_idx); m_pi_buf_tl(smd, wr_rx->m_idx, wr_rx); /* return buffer slot */ mpxy_unlock(&smd->rblock); buf_err: @@ -912,6 +930,10 @@ void m_pi_rcv_event(struct mcm_qp *m_qp, wrc_idata_t *wrc) mpxy_lock(&m_qp->rxlock); m_qp->wc_tl_rem = wr_rx->w_idx; /* remote WC tail update in WR */ +#ifdef MCM_PROFILE + if (wrc->id == m_qp->wr_hd_r) + mlog(0," ERR: RX imm_data: WR id %d duplicate!!!\n", wrc->id); +#endif m_qp->wr_hd_r = wrc->id; /* new WR took slot, move hd_r */ wr_rx->w_idx = wrc->id; /* my idx slot, to move tl */ m_pi_post_read(m_qp, wr_rx); diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c index bcd0f5f..a6b3dd4 100644 --- a/dapl/svc/mpxy_out.c +++ b/dapl/svc/mpxy_out.c @@ -41,6 +41,10 @@ extern int mix_eager_completion; extern int mix_inline_threshold; extern uint64_t system_guid; 
+#ifdef MCM_PROFILE +extern void mcm_check_io(); +#endif + /* buffer pool for proxy outbount RDMA work request entries, SCIF registration, scif_ep */ void m_po_destroy_bpool(struct mcm_qp *m_qp) { @@ -106,8 +110,21 @@ int m_po_buf_hd(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr *wr) smd->m_buf_wc[smd->m_buf_hd].m_idx = m_idx; smd->m_buf_wc[smd->m_buf_hd].done = 0; #ifdef MCM_PROFILE + smd->m_buf_wc[smd->m_buf_hd].hd = smd->m_hd; + smd->m_buf_wc[smd->m_buf_hd].tl = smd->m_tl; + smd->m_buf_wc[smd->m_buf_hd].ref++; smd->m_buf_wc[smd->m_buf_hd].ts = mcm_ts_us(); smd->m_buf_wc[smd->m_buf_hd].wr = (void *) wr; + if ((smd->m_hd_ro != smd->m_tl_ro) && (smd->m_hd >= smd->m_tl)) { + struct mcm_qp *m_qp = (struct mcm_qp*)wr->context; + mlog(0, " WARNING: [%d:%d:%d] PO_buf HD(%d) passed TL(%d): w_tl %d w_hd %d:" + " wc_r[%d].m_idx=%x, m_tl %x m_hd %x\n", + smd->md->mc->scif_id, smd->entry.tid, m_qp->r_entry.tid, + smd->m_hd_ro, smd->m_tl_ro, smd->m_buf_tl + 1, + smd->m_buf_hd, smd->m_buf_hd, m_idx, + smd->m_tl, smd->m_hd); + mcm_check_io(); + } #endif return 0; } @@ -117,7 +134,10 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx) { int s_idx, idx; int busy = 0, match = 0, hits = 0; - +#ifdef MCM_PROFILE + off_t tl_sav = smd->m_tl; + uint32_t now = mcm_ts_us(); +#endif idx = (smd->m_buf_tl + 1) & smd->m_buf_end; /* tl == hd is empty */ s_idx = idx; @@ -132,6 +152,10 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx) if (smd->m_buf_wc[idx].m_idx == m_idx) { smd->m_buf_wc[idx].done = 1; match = 1; +#ifdef MCM_PROFILE + smd->m_buf_wc[idx].ref--; + smd->m_buf_wc[idx].ts = now - smd->m_buf_wc[idx].ts; +#endif } if (smd->m_buf_wc[idx].done && !busy) { smd->m_tl = smd->m_buf_wc[idx].m_idx; @@ -149,59 +173,20 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx) } #ifdef MCM_PROFILE -{ - int match = 0; - uint32_t now = mcm_ts_us(); + if (tl_sav > smd->m_tl) + smd->m_tl_ro++; - if (smd->m_buf_wc[s_idx].done) { - mlog(0x10," [%d:%d] InOrder: m_wc %p: tl %d hd %d 
wc[%d].m_idx=0x%x " - "%s m_idx 0x%x %s wr %p hits %d - %d us\n", - smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc, - s_idx, smd->m_buf_hd, s_idx, smd->m_buf_wc[s_idx].m_idx, - smd->m_buf_wc[s_idx].m_idx == m_idx ? "==":"!=", - m_idx, smd->m_buf_wc[s_idx].done ? "DONE":"BUSY", - smd->m_buf_wc[s_idx].wr, hits, now - smd->m_buf_wc[s_idx].ts); - return; - } - - for (idx = s_idx;;) { - if (smd->m_buf_wc[idx].m_idx == m_idx) - match++; - - if ((!smd->m_buf_wc[idx].done) || (smd->m_buf_wc[idx].m_idx == m_idx)) { - mlog(0x10," [%d:%d] OutOfOrder: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x" - " %s m_idx 0x%x %s wr %p hits %d - %d us\n", - smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc, - s_idx, smd->m_buf_hd, idx, smd->m_buf_wc[idx].m_idx, - smd->m_buf_wc[idx].m_idx == m_idx ? "==":"!=", - m_idx, smd->m_buf_wc[idx].done ? "DONE":"BUSY", - smd->m_buf_wc[idx].wr, hits, now - smd->m_buf_wc[idx].ts); - } - if (idx == smd->m_buf_hd) - break; - - idx = (idx + 1) & smd->m_buf_end; - } if (!match) { - mlog(0x1," [%d:%d] WARN: m_tl 0x%Lx m_hd 0x%Lx" - "- m_wc %p: tl %d hd %d - m_idx=0x%x not found\n", - smd->md->mc->scif_id, smd->entry.tid, - smd->m_tl, smd->m_hd, smd->m_buf_wc, smd->m_buf_tl, - (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx); - } - if (match > 1) { - mlog(0x1," [%d:%d] WARN: m_tl 0x%Lx m_hd 0x%Lx" - "- m_wc %p: tl %d hd %d - m_idx=0x%x duplicate\n", - smd->md->mc->scif_id, smd->entry.tid, - smd->m_tl, smd->m_hd, smd->m_buf_wc, smd->m_buf_tl, - (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx); + mlog(0, " [%d:%d] ERR: m_tl 0x%x m_hd 0x%x" + "- m_wc %p: tl %d hd %d - m_idx=0x%x NO MATCH\n", + smd->md->mc->scif_id, smd->entry.tid, + smd->m_tl, smd->m_hd, smd->m_buf_wc, smd->m_buf_tl, + (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx); + mcm_check_io(); } - -} #endif } - /* * Proxy-out to Proxy-in - Endpoints are on same platform */ @@ -514,6 +499,8 @@ done: #if MCM_PROFILE static uint32_t last_rf = 0; #endif +static uint32_t po_ts = 0; + /* initiate proxy data transfer, 
operation channel */ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp) @@ -611,20 +598,21 @@ retry_mr: l_start = 64; l_end = l_start + seg_len; - if (l_start < smd->m_tl && l_end > smd->m_tl) { + if (pb_full(l_start, l_end, smd->m_hd, smd->m_tl, seg_len, smd->m_len)) { if (!retries) { + po_ts = mcm_ts_us(); MCNTR(smd->md, MCM_MX_MR_STALL); write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); } - if (!(++retries % 100)) { - mlog(1, " [%d:%d:%d] WARN: DTO delay, no PO memory," - " %x hd 0x%x tl 0x%x %x," + if (!(++retries % 200)) { + mlog(1, " [%d:%d:%d] WARN: no PO memory," + " 0x%x hd 0x%x tl 0x%x 0x%x 0x%x," " need 0x%x-0x%x ln %d %d<-%d," " retries = %d -> %s\n", m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid, smd->m_buf, smd->m_hd, smd->m_tl, - smd->m_buf + smd->m_len, + smd->m_len, smd->m_buf + smd->m_len, l_start, l_end, seg_len, l_len, pmsg->sge[i].length, retries, mcm_map_str(m_qp->cm->msg.daddr1.ep_map)); @@ -638,27 +626,30 @@ retry_mr: if (retries == 1000) { ret = ENOMEM; wc_err = IBV_WC_RETRY_EXC_ERR; + mpxy_unlock(&smd->tblock); goto bail; } - mpxy_unlock(&m_qp->txlock); mpxy_unlock(&smd->tblock); + mpxy_unlock(&m_qp->txlock); sleep_usec(10000); - mpxy_lock(&smd->tblock); mpxy_lock(&m_qp->txlock); + mpxy_lock(&smd->tblock); goto retry_mr; } if (retries) { mlog(1, " MEM stalled: %x hd 0x%x tl 0x%x %x" - " got 0x%x-0x%x ln %d %d<-%d retried %d\n", + " got 0x%x-0x%x ln %d %d<-%d retried %d %dus\n", smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + smd->m_len, l_start, l_end, seg_len, l_len, - pmsg->sge[i].length, retries); + pmsg->sge[i].length, retries, mcm_ts_us() - po_ts); } - +#ifdef MCM_PROFILE + if (l_end < smd->m_hd) + smd->m_hd_ro++; +#endif l_off = smd->m_offset + l_start; smd->m_hd = l_end; - mpxy_unlock(&smd->tblock); mlog(4, " SCIF_readfrom[%d] l_off %p, r_off %p," " l_start 0x%x l_end 0x%x seg_len %d," @@ -692,6 +683,7 @@ retry_mr: if (ret) { mlog(0, " ERR: scif_readfrom, ret %d\n", ret); + 
mpxy_unlock(&smd->tblock); goto bail; } MCNTR(smd->md, MCM_SCIF_READ_FROM); @@ -739,10 +731,10 @@ retry_mr: m_wr->wr.send_flags |= IBV_SEND_SIGNALED; m_wr->flags |= M_SEND_MP_SIG; m_wr->m_idx = (sbuf + (m_wr->wr.sg_list->length - 1)) - smd->m_buf; - mpxy_lock(&smd->tblock); - if (m_po_buf_hd(smd, m_wr->m_idx, m_wr)) + if (m_po_buf_hd(smd, m_wr->m_idx, m_wr)) { + mpxy_unlock(&smd->tblock); goto bail; - mpxy_unlock(&smd->tblock); + } mlog(0x10, "[%d:%d:%d] %s_RF_post_sig: qp %p wr %p wr_id %p flgs 0x%x," " pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n", m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, @@ -767,18 +759,18 @@ retry_mr: SCIF_SIGNAL_LOCAL); if (ret) { mlog(0," ERR: scif_fence_sig, ret %d \n", ret); + mpxy_unlock(&smd->tblock); goto bail; } m_qp->wr_pp++; MCNTR(smd->md, MCM_SCIF_SIGNAL); MCNTR(smd->md, MCM_MX_WRITE_SEG); - - mpxy_unlock(&m_qp->txlock); write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); - mpxy_lock(&m_qp->txlock); - if (!len) /* done */ + if (!len) { /* done */ + mpxy_unlock(&smd->tblock); break; + } /* get next WR */ retries = 0; @@ -799,11 +791,14 @@ retry_mr: if (retries == 1000) { ret = ENOMEM; wc_err = IBV_WC_RETRY_EXC_ERR; + mpxy_unlock(&smd->tblock); goto bail; } + mpxy_unlock(&smd->tblock); mpxy_unlock(&m_qp->txlock); sleep_usec(10000); mpxy_lock(&m_qp->txlock); + mpxy_lock(&smd->tblock); } if (retries) { mlog(1, " WR stalled: sz %d, hd %d tl %d io %d" @@ -813,8 +808,10 @@ retry_mr: m_qp->post_sig_cnt, m_qp->comp_cnt, m_qp->wr_pp); } - if (m_wr->flags & M_SEND_LS) + if (m_wr->flags & M_SEND_LS) { + mpxy_unlock(&smd->tblock); goto bail; + } /* prepare the next WR */ m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */ @@ -828,6 +825,7 @@ retry_mr: const_ib_rw(&m_wr->wr, &pmsg->wr, m_sge); m_wr->wr.wr.rdma.remote_addr += total_offset; } + mpxy_unlock(&smd->tblock); } } ret = 0; diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c index afabacf..29be436 100644 --- a/dapl/svc/mpxyd.c +++ b/dapl/svc/mpxyd.c @@ -482,8 +482,8 @@ 
static int create_smd_bpool(mcm_scif_dev_t *smd) mlog(8, " IB registered addr=%p,%d, mr_addr=%p handle=0x%x, lkey=0x%x rkey=0x%x \n", smd->m_buf, smd->m_len, smd->m_mr->addr, smd->m_mr->handle, smd->m_mr->lkey, smd->m_mr->rkey); - /* SEND WC queue for buffer management, manage empty slots */ - wcq_size = (((mix_max_msg_mb*1024*1024)/smd->m_seg) * mcm_tx_entries); /* power of 2 */ + /* SEND WC queue for buffer management, manage empty slots, power of 2 */ + wcq_size = (((mix_max_msg_mb*1024*1024)/smd->m_seg) * 8); wcq_entries = 1; while (wcq_entries < wcq_size) wcq_entries <<= 1; @@ -494,11 +494,13 @@ static int create_smd_bpool(mcm_scif_dev_t *smd) mlog(0, "failed to allocate smd m_bu_wc, m_len=%d, ERR: %d\n", wcq_len, ret); return -1; } + memset(smd->m_buf_wc, 0, wcq_len); + mlog(0x10, " m_buf_wc %p, len %d, entries %d \n", + smd->m_buf_wc, wcq_len, wcq_entries); + smd->m_buf_hd = 0; smd->m_buf_tl = 0; - smd->m_buf_end = (wcq_len/sizeof(mcm_buf_wc_t)) - 1; - - mlog(0x10, " m_buf_wc %p, len %d, entries %d \n", smd->m_buf_wc, wcq_len, wcq_entries); + smd->m_buf_end = wcq_entries - 1; /* RECEIVE proxy buffers */ smd->m_len_r = ((mix_buffer_mb + 8) * (1024 * 1024)); @@ -532,8 +534,8 @@ static int create_smd_bpool(mcm_scif_dev_t *smd) smd->m_buf_r, smd->m_len_r, smd->m_mr_r->addr, smd->m_mr_r->handle, smd->m_mr_r->lkey, smd->m_mr_r->rkey); - /* RECV WC queue for buffer management, manage empty slots */ - wcq_size = (((mix_max_msg_mb*1024*1024)/smd->m_seg) * mcm_rx_entries); /* power of 2 */ + /* RECV WC queue for buffer management, manage empty slots, power of 2 */ + wcq_size = (((mix_max_msg_mb*1024*1024)/smd->m_seg) * 8); wcq_entries = 1; while (wcq_entries < wcq_size) wcq_entries <<= 1; @@ -544,12 +546,13 @@ static int create_smd_bpool(mcm_scif_dev_t *smd) mlog(0, "failed to allocate smd m_buf_wc_r, m_len=%d, ERR: %d\n", wcq_len); return -1; } + memset(smd->m_buf_wc_r, 0, wcq_len); mlog(0x10, " m_buf_wc_r %p, len %d, entries %d \n", smd->m_buf_wc_r, wcq_len, 
wcq_entries); smd->m_buf_hd_r = 0; smd->m_buf_tl_r = 0; - smd->m_buf_end_r = (wcq_len/sizeof(mcm_buf_wc_t)) - 1; + smd->m_buf_end_r = wcq_entries - 1; return 0; } @@ -1342,7 +1345,6 @@ int main(int argc, char **argv) } #ifdef MCM_PROFILE - /* Diagnostic helper functions, log client/device/connection states */ void mcm_qp_log(struct mcm_qp *m_qp, int tx) { @@ -1353,10 +1355,12 @@ void mcm_qp_log(struct mcm_qp *m_qp, int tx) mlog(0, "[%d:%d:%d] PO QPt %p - WR tl %d tl_rf %d hd %d -" " RW pst %d sig %d po_cmp %d, wr_rem %d wr %d - IO %d ACT %d\n", m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, - m_qp->t_entry.tid, m_qp, m_qp->wr_tl, m_qp->wr_tl_rf, m_qp->wr_hd, + m_qp->r_entry.tid, m_qp, m_qp->wr_tl, m_qp->wr_tl_rf, m_qp->wr_hd, m_qp->post_cnt, m_qp->post_sig_cnt, m_qp->comp_cnt, m_qp->wr_pp_rem, m_qp->wr_pp, m_qp->post_cnt, io); + if (m_qp->cm) + mcm_pr_addrs(0, &m_qp->cm->msg, m_qp->cm->state, 0); } else { io = m_qp->stall_cnt_rr + m_qp->post_cnt_wt + m_qp->pi_rw_cnt; mlog(0, "[%d:%d:%d] PI QPr %p - WR tl %d tl_wt %d hd %d -" @@ -1366,9 +1370,9 @@ void mcm_qp_log(struct mcm_qp *m_qp, int tx) m_qp->wr_hd_r, m_qp->post_cnt_rr, m_qp->pi_rr_cnt, m_qp->stall_cnt_rr, m_qp->post_cnt_wt, m_qp->pi_rw_cnt, m_qp->post_cnt_rr, io); + if (m_qp->cm) + mcm_pr_addrs(0, &m_qp->cm->msg, m_qp->cm->state, 0); } - if (m_qp->cm) - mcm_pr_addrs(0, &m_qp->cm->msg, m_qp->cm->state, 0); } void mcm_connect_log(struct mcm_scif_dev *smd) @@ -1376,8 +1380,6 @@ void mcm_connect_log(struct mcm_scif_dev *smd) struct mcm_qp *m_qp_t; struct mcm_qp *m_qp_r; - mpxy_lock(&smd->qptlock); - mpxy_lock(&smd->qprlock); m_qp_t = get_head_entry(&smd->qptlist); m_qp_r = get_head_entry(&smd->qprlist); while (m_qp_t || m_qp_r) { @@ -1392,8 +1394,6 @@ void mcm_connect_log(struct mcm_scif_dev *smd) &smd->qprlist); } } - mpxy_unlock(&smd->qprlock); - mpxy_unlock(&smd->qptlock); } void mcm_dat_dev_log(struct mcm_scif_dev *smd) @@ -1401,13 +1401,14 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd) int idx; uint32_t 
now = mcm_ts_us(); - mlog(0, "[%d:%d] SMD %p \n", smd->md->mc->scif_id, smd->entry.tid, smd); - mlog(0, "[%d:%d] PO_BUF %p tl 0x%Lx hd 0x%Lx ln %d - WC %p tl %d hd %d ln %d - SEGs %d ACT %u\n", + mlog(0, "[%d:%d] PO_BUF %p tl 0x%x hd 0x%x ln %d -" + " WC %p tl %d hd %d ln %d - SEGs %d ACT %u\n", smd->md->mc->scif_id, smd->entry.tid, smd->m_buf, smd->m_tl, smd->m_hd, smd->m_len, smd->m_buf_wc, smd->m_buf_tl, smd->m_buf_hd, smd->m_buf_end, smd->m_buf_hd, smd->m_buf_hd - smd->m_buf_tl); - mlog(0, "[%d:%d] PI_BUF %p tl 0x%Lx hd 0x%Lx ln %d - WC %p tl %d hd %d ln %d - SEGs %d ACT %u\n", + mlog(0, "[%d:%d] PI_BUF %p tl 0x%x hd 0x%x ln %d -" + " WC %p tl %d hd %d ln %d - SEGs %d ACT %u\n", smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_r, smd->m_tl_r, smd->m_hd_r, smd->m_len_r, smd->m_buf_wc_r, smd->m_buf_tl_r, smd->m_buf_hd_r, smd->m_buf_end_r, @@ -1415,19 +1416,59 @@ void mcm_dat_dev_log(struct mcm_scif_dev *smd) /* show PO mbuf_wc busy slots */ idx = smd->m_buf_tl; - while (smd->m_buf_tl != smd->m_buf_hd) { - if (smd->m_buf_wc[idx].m_idx) { - mlog(0, "[%d:%d] PO: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x" - " %s wr %p %d us\n", - smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc, + while ((smd->m_buf_tl != smd->m_buf_hd) && + (smd->m_buf_hd - smd->m_buf_tl)) { + if ((smd->m_buf_wc[idx].m_idx && !smd->m_buf_wc[idx].done) || 1) { + struct mcm_wr *m_wr = NULL; + struct mcm_qp *m_qp = NULL; + if (smd->m_buf_wc[idx].wr) { + m_wr = (struct mcm_wr *)smd->m_buf_wc[idx].wr; + m_qp = (struct mcm_qp *)m_wr->context; + } + mlog(0, "[%d:%d:%d] PO: m_wc - tl %d hd %d wc[%d].m_idx=0x%x %s" + " wr[%d] f=%x,m=%x t=%u ref=%d m_tl %x hd %x\n", + smd->md->mc->scif_id, smd->entry.tid, + m_qp ? m_qp->r_entry.tid:0, smd->m_buf_tl, smd->m_buf_hd, idx, smd->m_buf_wc[idx].m_idx, smd->m_buf_wc[idx].done ? "DONE":"BUSY", - smd->m_buf_wc[idx].wr, now - smd->m_buf_wc[idx].ts); + m_wr ? m_wr->w_idx:0, m_wr ? m_wr->flags:0, + m_wr ? m_wr->m_idx:0, smd->m_buf_wc[idx].done ? 
+ smd->m_buf_wc[idx].ts : now - smd->m_buf_wc[idx].ts, + smd->m_buf_wc[idx].ref, + smd->m_buf_wc[idx].tl, smd->m_buf_wc[idx].hd); } idx = (idx + 1) & smd->m_buf_end; + if (idx == (smd->m_buf_hd+2)) + break; + } - if (idx == smd->m_buf_hd) + /* show PI mbuf_wc busy slots, start from tail */ + idx = smd->m_buf_tl_r; + while ((smd->m_buf_tl_r != smd->m_buf_hd_r) && + (smd->m_buf_hd_r - smd->m_buf_tl_r)) { + if (smd->m_buf_wc_r[idx].m_idx || 1) { + struct mcm_wr_rx *m_wr = NULL; + struct mcm_qp *m_qp = NULL; + if (smd->m_buf_wc_r[idx].wr) { + m_wr = (struct mcm_wr_rx *)smd->m_buf_wc_r[idx].wr; + m_qp = (struct mcm_qp *)m_wr->context; + } + mlog(0, "[%d:%d:%d] PI: m_wc_r - tl %d hd %d wc[%d].m_idx=0x%x %s " + "wr[%d] f=%x,m=%x t=%u ref=%d m_tl %x hd %x\n", + smd->md->mc->scif_id, smd->entry.tid, + m_qp ? m_qp->r_entry.tid:0, + smd->m_buf_tl_r, smd->m_buf_hd_r, idx, + smd->m_buf_wc_r[idx].m_idx, + smd->m_buf_wc_r[idx].done ? "DONE":"BUSY", + m_wr ? m_wr->w_idx:0, m_wr ? m_wr->flags:0, + m_wr ? m_wr->m_idx:0, smd->m_buf_wc_r[idx].done ? + smd->m_buf_wc_r[idx].ts : now - smd->m_buf_wc_r[idx].ts, + smd->m_buf_wc_r[idx].ref, + smd->m_buf_wc_r[idx].tl, smd->m_buf_wc_r[idx].hd); + } + idx = (idx + 1) & smd->m_buf_end_r; + if (idx == (smd->m_buf_hd_r+2)) break; } } @@ -1442,6 +1483,7 @@ void mcm_ib_dev_log(struct mcm_ib_dev *md) md->addr.ep_map == MIC_SSOCK_DEV ? 
"MSS":"MXS", md->mc->ver); } +static int check_io_run; void mcm_check_io() { mcm_client_t *mc; @@ -1449,6 +1491,11 @@ void mcm_check_io() struct mcm_scif_dev *smd; int i, ii; + if (check_io_run) + return; + else + check_io_run++; + for (i=0;ioplock); - mpxy_lock(&mc->cmlock); - mpxy_lock(&mc->txlock); - mpxy_lock(&mc->rxlock); smd = get_head_entry(&md->smd_list); while (smd && !smd->destroy) { + mlog(0, "[%d:%d] SMD %p \n", + smd->md->mc->scif_id, + smd->entry.tid, smd); mcm_dat_dev_log(smd); /* dat_ia_open */ mcm_connect_log(smd); /* dat_connect */ smd = get_next_entry(&smd->entry, &md->smd_list); } - mpxy_unlock(&mc->rxlock); - mpxy_unlock(&mc->txlock); - mpxy_unlock(&mc->cmlock); - mpxy_unlock(&mc->oplock); } } + assert(0); } - #endif diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h index 24a33ae..5ca1423 100644 --- a/dapl/svc/mpxyd.h +++ b/dapl/svc/mpxyd.h @@ -64,7 +64,7 @@ #define MIX_MIN 4 /* oldest version supported */ #define MIX_COMP 4 /* compatibility version */ #define MIX_MAX DAT_MIX_VER -/* #define MCM_PROFILE 1 */ +#define MCM_PROFILE 1 /* locking */ @@ -142,6 +142,9 @@ typedef struct mcm_buf_wc { uint32_t m_idx; uint32_t done; #ifdef MCM_PROFILE + uint32_t hd; + uint32_t tl; + uint32_t ref; uint32_t ts; void *wr; #endif @@ -325,8 +328,8 @@ typedef struct mcm_scif_dev { char *m_buf; /* MIC TX proxy buffer, SCIF and IB */ struct ibv_mr *m_mr; /* ib registration */ off_t m_offset; /* SCIF registration */ - off_t m_hd; /* buffer pool head */ - off_t m_tl; /* buffer pool tail */ + int m_hd; /* buffer pool head */ + int m_tl; /* buffer pool tail */ int m_len; /* TX buffer size */ int m_seg; /* segment size, same for TX and RX proxy */ struct mcm_buf_wc *m_buf_wc; /* Proxy Buffer work completion queue */ @@ -338,14 +341,20 @@ typedef struct mcm_scif_dev { char *m_buf_r; /* MIC RX proxy buffer, SCIF and IB */ struct ibv_mr *m_mr_r; /* Rcv proxy buffer, ib registration */ off_t m_offset_r; /* Rcv proxy buffer, SCIF registration */ - off_t m_hd_r; /* Rcv 
buffer pool head */ - off_t m_tl_r; /* Rcv buffer pool tail */ + int m_hd_r; /* Rcv buffer pool head */ + int m_tl_r; /* Rcv buffer pool tail */ int m_len_r; /* Rcv proxy buffer size */ struct mcm_buf_wc *m_buf_wc_r; /* Proxy Buffer work completion queue */ int m_buf_tl_r; /* Proxy Buffer WC queue tl */ int m_buf_hd_r; /* Proxy Buffer WC queue hd */ int m_buf_end_r; /* Proxy Buffer WC queue end */ char *cmd_buf; /* operation command buffer */ +#ifdef MCM_PROFILE + uint16_t m_hd_ro; /* HD,TL tracking */ + uint16_t m_tl_ro; + uint16_t m_hd_ro_r; + uint16_t m_tl_ro_r; +#endif } mcm_scif_dev_t; @@ -695,6 +704,16 @@ static inline uint64_t mcm_time_us(void) } #define mcm_time_ms() (mcm_time_us() / 1000) +static inline int pb_full(int start, int end, int head, int tail, int len, int size) +{ + if (((start < tail) && (end >= tail)) || + (((head - tail) < 0) && (ALIGN_64(head) + len) >= tail) || + (((ALIGN_64(head) + len) > size) && start >= tail)) + return 1; + else + return 0; +} + static inline void mcm_free_port(uint64_t *p_port, uint16_t port) { p_port[port] = 0; -- 2.46.0