]> git.openfabrics.org - ~ardavis/dapl.git/commitdiff
mpxyd: MXS based alltoall benchmark hangs or returns post_send timeout
authorArlin Davis <arlin.r.davis@intel.com>
Fri, 1 Aug 2014 17:54:14 +0000 (10:54 -0700)
committerArlin Davis <arlin.r.davis@intel.com>
Fri, 1 Aug 2014 17:54:14 +0000 (10:54 -0700)
Clean-up shared proxy buffer slot management during IO completions.
Current code adjusts proxy buffer tail, using m_idx, incorrectly
if freeing multiple in order buffer slots. Also, when processing
immediate in-order slot, m_po_buf_tl() failed to continue parsing
list to free other in-order !busy slots.

Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/svc/mpxy_in.c
dapl/svc/mpxy_out.c

index 1b1ef67a070fbf4716ad7fa259f08c64b9bd729f..f612108e095440ddff485a9d8bd1f57112a74cdb 100644 (file)
@@ -258,10 +258,10 @@ static int m_pi_buf_ordered(mcm_scif_dev_t *smd, int next)
 /* called with smd->rblock */
 static int m_pi_buf_hd(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_rx)
 {
-       mlog(8," smd %p - m_buf_wc_r %p: tl %d hd %d buf_wc_hd[%d].m_idx=0x%x\n",
-               smd, smd->m_buf_wc_r, smd->m_buf_tl_r,
+       mlog(4," [%d:%d] m_buf_wc_r %p: tl %d hd %d buf_wc_hd[%d].m_idx=0x%x\n",
+               smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc_r, smd->m_buf_tl_r,
                (smd->m_buf_hd_r + 1) & smd->m_buf_end_r,
-               smd->m_buf_hd_r, m_idx);
+               (smd->m_buf_hd_r + 1) & smd->m_buf_end_r, m_idx);
 
        if (((smd->m_buf_hd_r + 1) & smd->m_buf_end_r) == smd->m_buf_tl_r) {
                mlog(0," ERR: PI Buf WC full (%d) m_buf_wc_r %p:"
@@ -273,78 +273,70 @@ static int m_pi_buf_hd(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_rx
        smd->m_buf_hd_r = (smd->m_buf_hd_r + 1) & smd->m_buf_end_r; /* move hd */
        smd->m_buf_wc_r[smd->m_buf_hd_r].m_idx = m_idx;
        smd->m_buf_wc_r[smd->m_buf_hd_r].done = 0;
+#ifdef MCM_PROFILE
+       smd->m_buf_wc_r[smd->m_buf_hd_r].ts = mcm_ts_us();
+       smd->m_buf_wc_r[smd->m_buf_hd_r].wr = (void *) m_wr_rx;
+#endif
        return 0;
 }
 
 /* called with smd->rblock */
 static void m_pi_buf_tl(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr_rx *m_wr_rx)
 {
-       int s_idx, idx, empty_slot = 0;
-       struct mcm_wr_rx *wr_rx;
-       struct mcm_qp *m_qp = (struct mcm_qp *)m_wr_rx->context;
+       int s_idx, idx;
+       int busy = 0, match = 0;
 
        idx = (smd->m_buf_tl_r + 1) & smd->m_buf_end_r; /* tl == hd is empty */
        s_idx = idx;
 
-       /* 1st one, completed in order, move proxy buffer tl and buf wc tl */
-       if (smd->m_buf_wc_r[idx].m_idx == m_idx) {
-               mlog(8," [%d:%d] m_tl_r 0x%Lx -> m_idx 0x%x buf_wc[%d].m_idx=0x%x\n",
-                       smd->md->mc->scif_id, smd->entry.tid,
-                       smd->m_tl_r, m_idx, idx, smd->m_buf_wc_r[idx].m_idx);
-               smd->m_tl_r = m_idx;
-               smd->m_buf_wc_r[idx].m_idx = 0;
-               smd->m_buf_wc_r[idx].done = 1;
-               smd->m_buf_tl_r = (smd->m_buf_tl_r + 1) & smd->m_buf_end_r; /* move tl */
-               return;
-       }
-
-       /* out of order, mark complete, move proxy buffer tail until empty slot */
-       while (smd->m_buf_tl_r != smd->m_buf_hd_r) {
-               if (smd->m_buf_wc_r[idx].done && !empty_slot) {
-                       smd->m_tl_r = m_idx;
-                       smd->m_buf_wc_r[idx].m_idx = 0;
-                       smd->m_buf_tl_r = (smd->m_buf_tl_r + 1) & smd->m_buf_end_r; /* move tl */
-               } else if (smd->m_buf_wc_r[idx].m_idx == m_idx) {
+       /* mark m_idx complete, move proxy buffer tail until busy slot */
+       while ((!match || !busy) && smd->m_buf_tl_r != smd->m_buf_hd_r) {
+               if (smd->m_buf_wc_r[idx].m_idx == m_idx) {
                        smd->m_buf_wc_r[idx].done = 1;
-                       break;
-               } else
-                       empty_slot = 1;
+                       match = 1;
+               }
+               if (smd->m_buf_wc_r[idx].done && !busy) {
+                       smd->m_tl_r = smd->m_buf_wc_r[idx].m_idx;
+                       smd->m_buf_wc_r[idx].m_idx = 0;
+                       smd->m_buf_tl_r = (smd->m_buf_tl_r + 1) & smd->m_buf_end_r;
+               }
+               if (!smd->m_buf_wc_r[idx].done)
+                       busy = 1;
 
                if (idx == smd->m_buf_hd_r)
                        break;
 
                idx = (idx + 1) & smd->m_buf_end_r;
        }
+#ifdef MCM_PROFILE
+       if ((log_level < 4) || (smd->m_buf_wc_r[s_idx].done))
+               return;
 
-       if (log_level < 8)
+       if (smd->m_buf_wc_r[s_idx].done) {
+               mlog(0x10," [%d:%d] InOrder: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x "
+                         "%s m_idx 0x%x %s wr %p \n",
+                         smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc_r,
+                         s_idx, smd->m_buf_hd_r, s_idx, smd->m_buf_wc_r[s_idx].m_idx,
+                         smd->m_buf_wc_r[s_idx].m_idx == m_idx ? "==":"!=",
+                         m_idx, smd->m_buf_wc_r[s_idx].done ? "DONE":"BUSY",
+                         smd->m_buf_wc_r[s_idx].wr);
                return;
+       }
 
        for (idx = s_idx;;) {
-               mlog(8," [%d:%d] OutofOrder: tl %d hd %d buf_wc_tl[%d].m_idx=0x%x %s m_idx 0x%x %s\n",
+               uint32_t now = mcm_ts_us();
+
+               mlog(4," [%d:%d] OutOfOrder: tl %d hd %d wc[%d].m_idx=0x%x %s m_idx 0x%x %s %d us\n",
                        smd->md->mc->scif_id, smd->entry.tid,
                        s_idx, smd->m_buf_hd_r, idx, smd->m_buf_wc_r[idx].m_idx,
                        smd->m_buf_wc_r[idx].m_idx == m_idx ? "==":"!=",
-                       m_idx, smd->m_buf_wc_r[idx].done ? "DONE":"BUSY");
+                       m_idx, smd->m_buf_wc_r[idx].done ? "DONE":"BUSY",
+                       now - smd->m_buf_wc_r[idx].ts);
                if (idx == smd->m_buf_hd_r)
                        break;
                idx = (idx + 1) & smd->m_buf_end_r;
        }
-       for (idx = m_qp->wr_tl_r;;) {
-               wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * idx));
-               mlog(8, " RX_wr[%d] %p RR(%d,%d,%d): WT(%d,%d) flgs %x op %x"
-                       " tl %d tl_wt %d hd %d m_idx %x-%x ln %d\n",
-                       wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
-                       m_qp->pi_rr_cnt,
-                       (uint32_t)((uint64_t *)m_qp->smd->md->cntrs)[MCM_SCIF_WRITE_TO],
-                       (uint32_t)((uint64_t *)m_qp->smd->md->cntrs)[MCM_SCIF_WRITE_TO_DONE],
-                       wr_rx->flags,   wr_rx->wr.opcode, m_qp->wr_tl_r,
-                       m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->m_idx - wr_rx->sg[0].length,
-                       wr_rx->m_idx, wr_rx->sg[0].length);
-
-               idx = (idx + 1) & m_qp->wrc.wr_end;
-               if (idx == m_qp->wr_hd_r)
-                       break;
-       }
+#endif
 }
 
 static off_t m_pi_mr_trans(mcm_scif_dev_t *smd, uint64_t raddr, uint32_t rkey, int len)
@@ -527,6 +519,7 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
        struct mcm_sr *m_sr = NULL;
        off_t l_off, l_off_wr, r_off;
        int ret, i, l_start, l_end, l_len, sg_len, w_len, num_sge, wr_idx, wr_cnt = 0;
+       int wt_flag;
 
        wr_idx = m_qp->wr_tl_r_wt; /* from WT tail, process RR's posted until reaching wr_last */
 
@@ -598,6 +591,7 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
 
                for (i=0; (i<num_sge && l_len); i++) {
                        w_len = min(sg_len, l_len);
+                       wt_flag = 0;
                        mlog(4, " WR_rx[%d] %p writeto l_off %Lx r_off %Lx rb_off 0x%x-0x%x ln %d org_id %Lx tl %d hd %d\n",
                                wr_rx->w_idx, wr_rx, l_off, r_off, l_start, l_end, w_len, wr_rx->org_id,
                                m_qp->wr_tl_r, m_qp->wr_hd_r);
@@ -606,9 +600,9 @@ static void m_pi_post_writeto(struct mcm_qp *m_qp, struct mcm_wr_rx *wr_sig, str
                        wr_rx->qcnt = m_qp->post_cnt_wt;
 #endif
                        if (w_len < 256)
-                               ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, SCIF_RMA_USECPU);
-                       else
-                               ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, 0);
+                               wt_flag = SCIF_RMA_USECPU;
+
+                       ret = scif_writeto(smd->scif_tx_ep, l_off, w_len, r_off, wt_flag);
 
                        if (ret) {
                                mlog(0, " ERR: scif_sendto, ret %d err: %d %s\n",
index 4af5e80d582b908e76086f9e802310e93add0535..6bdd084d53d145d06b83b99dbd1909b727bc11ee 100644 (file)
@@ -88,11 +88,12 @@ err:
 }
 
 /* called with smd->tblock */
-int m_po_buf_hd(mcm_scif_dev_t *smd, int m_idx)
+int m_po_buf_hd(mcm_scif_dev_t *smd, int m_idx, struct mcm_wr *wr)
 {
-       mlog(4," m_hd 0x%Lx - m_buf_wc %p: hd %d tl %d buf_wc[%d].m_idx=0x%x\n",
-               smd->m_hd, smd->m_buf_wc, (smd->m_buf_hd + 1) & smd->m_buf_end,
-               smd->m_buf_tl, smd->m_buf_hd, m_idx);
+       mlog(0x10," [%d:%d] m_hd 0x%Lx - m_wc %p: tl %d wc_hd[%d].m_idx=0x%x insert\n",
+               smd->md->mc->scif_id, smd->entry.tid,
+               smd->m_hd, smd->m_buf_wc, smd->m_buf_tl,
+               (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx);
 
        if (((smd->m_buf_hd + 1) & smd->m_buf_end) == smd->m_buf_tl) {
                mlog(0, " ERR: PO Buf WC full (%d) m_buf_wc %p:"
@@ -104,40 +105,42 @@ int m_po_buf_hd(mcm_scif_dev_t *smd, int m_idx)
        smd->m_buf_hd = (smd->m_buf_hd + 1) & smd->m_buf_end; /* move hd */
        smd->m_buf_wc[smd->m_buf_hd].m_idx = m_idx;
        smd->m_buf_wc[smd->m_buf_hd].done = 0;
+#ifdef MCM_PROFILE
+       smd->m_buf_wc[smd->m_buf_hd].ts = mcm_ts_us();
+       smd->m_buf_wc[smd->m_buf_hd].wr = (void *) wr;
+#endif
        return 0;
 }
 
 /* called with smd->tblock */
 static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx)
 {
-       int s_idx, idx, empty_slot = 0;
+       int s_idx, idx;
+       int busy = 0, match = 0, hits = 0;
 
        idx = (smd->m_buf_tl + 1) & smd->m_buf_end; /* tl == hd is empty */
        s_idx = idx;
 
-       /* 1st one, completed in order, move proxy buffer tl and buf wc tl */
-       if (smd->m_buf_wc[idx].m_idx == m_idx) {
-               mlog(8," [%d:%d] m_tl 0x%Lx -> m_idx 0x%x buf_wc[%d].m_idx=0x%x\n",
-                       smd->md->mc->scif_id, smd->entry.tid,
-                       smd->m_tl, m_idx, idx, smd->m_buf_wc[idx].m_idx);
-               smd->m_tl = m_idx;
-               smd->m_buf_wc[idx].m_idx = 0;
-               smd->m_buf_wc[idx].done = 1;
-               smd->m_buf_tl = (smd->m_buf_tl + 1) & smd->m_buf_end; /* move tl */
-               return;
-       }
+       mlog(0x10," [%d:%d] m_tl 0x%Lx m_hd 0x%Lx - "
+                 "m_wc %p: tl %d hd %d - m_idx=0x%x free\n",
+                 smd->md->mc->scif_id, smd->entry.tid,
+                 smd->m_tl, smd->m_hd, smd->m_buf_wc, idx,
+                 (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx);
 
-       /* out of order, mark complete, move proxy buffer tail until empty slot */
-       while (smd->m_buf_tl != smd->m_buf_hd) {
-               if (smd->m_buf_wc[idx].done && !empty_slot) {
-                       smd->m_tl = m_idx;
-                       smd->m_buf_wc[idx].m_idx = 0;
-                       smd->m_buf_tl = (smd->m_buf_tl + 1) & smd->m_buf_end; /* move tl */
-               } else if (smd->m_buf_wc[idx].m_idx == m_idx) {
+       /* mark m_idx complete, move proxy buffer tail until busy slot */
+       while ((!match || !busy) && smd->m_buf_tl != smd->m_buf_hd) {
+               if (smd->m_buf_wc[idx].m_idx == m_idx) {
                        smd->m_buf_wc[idx].done = 1;
-                       break;
-               } else
-                       empty_slot = 1;
+                       match = 1;
+               }
+               if (smd->m_buf_wc[idx].done && !busy) {
+                       smd->m_tl = smd->m_buf_wc[idx].m_idx;
+                       smd->m_buf_wc[idx].m_idx = 0;
+                       smd->m_buf_tl = (smd->m_buf_tl + 1) & smd->m_buf_end;
+                       hits++;
+               }
+               if (!smd->m_buf_wc[idx].done)
+                       busy = 1;
 
                if (idx == smd->m_buf_hd)
                        break;
@@ -145,19 +148,57 @@ static void m_po_buf_tl(mcm_scif_dev_t *smd, int m_idx)
                idx = (idx + 1) & smd->m_buf_end;
        }
 
-       if (log_level < 8)
+#ifdef MCM_PROFILE
+{
+       int match = 0;
+       uint32_t now = mcm_ts_us();
+
+       if (smd->m_buf_wc[s_idx].done) {
+               mlog(0x10," [%d:%d] InOrder: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x "
+                         "%s m_idx 0x%x %s wr %p hits %d - %d us\n",
+                         smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc,
+                         s_idx, smd->m_buf_hd, s_idx, smd->m_buf_wc[s_idx].m_idx,
+                         smd->m_buf_wc[s_idx].m_idx == m_idx ? "==":"!=",
+                         m_idx, smd->m_buf_wc[s_idx].done ? "DONE":"BUSY",
+                         smd->m_buf_wc[s_idx].wr, hits, now - smd->m_buf_wc[s_idx].ts);
                return;
+       }
 
        for (idx = s_idx;;) {
-               mlog(8," [%d:%d] OutofOrder: tl %d hd %d buf_wc_tl[%d].m_idx=0x%x %s m_idx 0x%x %s\n",
-                       smd->md->mc->scif_id, smd->entry.tid,
-                       s_idx, smd->m_buf_hd, idx, smd->m_buf_wc[idx].m_idx,
-                       smd->m_buf_wc[idx].m_idx == m_idx ? "==":"!=",
-                       m_idx, smd->m_buf_wc[idx].done ? "DONE":"BUSY");
+               if (smd->m_buf_wc[idx].m_idx == m_idx)
+                       match++;
+
+               if ((!smd->m_buf_wc[idx].done) || (smd->m_buf_wc[idx].m_idx == m_idx)) {
+                       mlog(0x10," [%d:%d] OutOfOrder: m_wc %p: tl %d hd %d wc[%d].m_idx=0x%x"
+                                 " %s m_idx 0x%x %s wr %p hits %d - %d us\n",
+                                 smd->md->mc->scif_id, smd->entry.tid, smd->m_buf_wc,
+                                 s_idx, smd->m_buf_hd, idx, smd->m_buf_wc[idx].m_idx,
+                                 smd->m_buf_wc[idx].m_idx == m_idx ? "==":"!=",
+                                 m_idx, smd->m_buf_wc[idx].done ? "DONE":"BUSY",
+                                 smd->m_buf_wc[idx].wr, hits, now - smd->m_buf_wc[idx].ts);
+               }
                if (idx == smd->m_buf_hd)
                        break;
+
                idx = (idx + 1) & smd->m_buf_end;
        }
+       if (!match) {
+               mlog(0x1," [%d:%d] WARN: m_tl 0x%Lx m_hd 0x%Lx"
+                        "- m_wc %p: tl %d hd %d - m_idx=0x%x not found\n",
+                        smd->md->mc->scif_id, smd->entry.tid,
+                        smd->m_tl, smd->m_hd, smd->m_buf_wc, smd->m_buf_tl,
+                        (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx);
+       }
+       if (match > 1) {
+               mlog(0x1," [%d:%d] WARN: m_tl 0x%Lx m_hd 0x%Lx"
+                        "- m_wc %p: tl %d hd %d - m_idx=0x%x duplicate\n",
+                        smd->md->mc->scif_id, smd->entry.tid,
+                        smd->m_tl, smd->m_hd, smd->m_buf_wc, smd->m_buf_tl,
+                        (smd->m_buf_hd + 1) & smd->m_buf_end, m_idx);
+       }
+
+}
+#endif
 }
 
 
@@ -529,18 +570,16 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp
        }
 
        m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
-
        m_wr = (struct mcm_wr *)(m_qp->wr_buf + (m_qp->wr_sz * m_qp->wr_hd));
        m_sge = m_wr->sg;
-
-       mlog(4, " m_wr %p m_sge %p num_sge %d\n", m_wr, m_sge, pmsg->wr.num_sge);
-
        m_wr->org_id = pmsg->wr.wr_id;
+       m_wr->m_idx = 0;
        m_wr->w_idx = m_qp->wr_hd;
        m_wr->flags = M_SEND_FS;
        m_wr->context = (uint64_t)m_qp;
        const_ib_rw(&m_wr->wr, &pmsg->wr, m_sge);
 
+       mlog(4, " m_wr %p m_sge %p num_sge %d\n", m_wr, m_sge, pmsg->wr.num_sge);
        mlog(4, " m_wr: raddr %Lx rkey 0x%x, ib_wr: raddr %Lx rkey 0x%x\n",
                pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
                m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey);
@@ -578,13 +617,21 @@ retry_mr:
                                        write(smd->md->mc->tx_pipe[1], "w", sizeof("w"));
                                }
                                if (!(++retries % 100)) {
-                                       mlog(1, " WARN: DTO delay, no PO memory,"
+                                       mlog(1, " [%d:%d:%d] WARN: DTO delay, no PO memory,"
                                                " %x hd 0x%x tl 0x%x %x,"
-                                               " need 0x%x-0x%x ln %d,"
+                                               " need 0x%x-0x%x ln %d %d<-%d,"
                                                " retries = %d -> %s\n",
+                                               m_qp->smd->md->mc->scif_id,
+                                               m_qp->smd->entry.tid, m_qp->r_entry.tid,
                                                smd->m_buf, smd->m_hd, smd->m_tl,
                                                smd->m_buf + smd->m_len,
-                                               l_start, l_end, seg_len, retries,
+                                               l_start, l_end, seg_len, l_len,
+                                               pmsg->sge[i].length, retries,
+                                               mcm_map_str(m_qp->cm->msg.daddr1.ep_map));
+                                       mlog(1," [%d:%d:%d] WR tl %d idx %d hd %d QP pst %d,%d cmp %d - %s\n",
+                                               m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid,
+                                               m_qp->wr_tl, m_wr->w_idx, m_qp->wr_hd,
+                                               m_qp->post_cnt, m_qp->post_sig_cnt, m_qp->comp_cnt,
                                                mcm_map_str(m_qp->cm->msg.daddr1.ep_map));
                                        mcm_pr_addrs(1, &m_qp->cm->msg, m_qp->cm->state, 0);
                                }
@@ -596,16 +643,17 @@ retry_mr:
                                mpxy_unlock(&m_qp->txlock);
                                mpxy_unlock(&smd->tblock);
                                sleep_usec(10000);
-                               mpxy_lock(&m_qp->txlock);
                                mpxy_lock(&smd->tblock);
+                               mpxy_lock(&m_qp->txlock);
                                goto retry_mr;
                        }
                        if (retries) {
                                mlog(1, " MEM stalled: %x hd 0x%x tl 0x%x %x"
-                                       " got 0x%x-0x%x %d, retried %d\n",
+                                       " got 0x%x-0x%x ln %d %d<-%d retried %d\n",
                                        smd->m_buf, smd->m_hd, smd->m_tl,
                                        smd->m_buf + smd->m_len,
-                                       l_start, l_end, seg_len, retries);
+                                       l_start, l_end, seg_len, l_len,
+                                       pmsg->sge[i].length, retries);
                        }
 
                        l_off = smd->m_offset + l_start;
@@ -624,7 +672,7 @@ retry_mr:
                        m_wr->wr.wr.atomic.rkey = m_qp->wr_pp;
                        if (last_rf) {
                                uint32_t now = mcm_ts_us();
-                               if ((now - last_rf) > 500) {
+                               if ((now - last_rf) > 100000) {
                                        mlog(0x4, " WARN: delayed post (%d us):"
                                                   " WR[%d] hd %d tl %d io %d"
                                                   " pst %d,%d cmp %d wr_pp %d\n",
@@ -692,7 +740,7 @@ retry_mr:
                                        m_wr->flags |= M_SEND_MP_SIG;
                                        m_wr->m_idx = (sbuf + (m_wr->wr.sg_list->length - 1)) - smd->m_buf;
                                        mpxy_lock(&smd->tblock);
-                                       if (m_po_buf_hd(smd, m_wr->m_idx))
+                                       if (m_po_buf_hd(smd, m_wr->m_idx, m_wr))
                                                goto bail;
                                        mpxy_unlock(&smd->tblock);
                                        mlog(0x10, "[%d:%d:%d] %s_RF_post_sig: qp %p wr %p wr_id %p flgs 0x%x,"
@@ -700,7 +748,7 @@ retry_mr:
                                                m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
                                                m_qp->r_entry.tid,
                                                (MXS_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
-                                               m_qp, m_wr, m_wr->wr.wr_id, m_wr->wr.send_flags,
+                                               m_qp, m_wr, pmsg->wr.wr_id, m_wr->wr.send_flags,
                                                m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
                                                m_wr->wr.sg_list->length, m_wr->m_idx);
                                }
@@ -774,6 +822,7 @@ retry_mr:
                                m_sge = m_wr->sg;
                                m_wr->org_id = pmsg->wr.wr_id;
                                m_wr->w_idx = m_qp->wr_hd;
+                               m_wr->m_idx = 0;
                                m_wr->flags = 0;
                                m_wr->context = (uint64_t)m_qp;
                                const_ib_rw(&m_wr->wr, &pmsg->wr, m_sge);