From 2b294cd7dcdbccdc235c056791f36bd2821c2b9b Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Thu, 10 Dec 2015 14:36:22 -0800 Subject: [PATCH] mpxyd: proxy out WR resources exhausted with MFO mode endpoints WC status of IBV_WC_RETRY_EXC_ERR reported back to MIC client. Operation processing thread doesn't yield properly to enable tx thread to process completions and replenish WR resources. Retries occur too quickly. Add some new error logs for resource issues. Signed-off-by: Arlin Davis --- dapl/svc/mpxy_out.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/dapl/svc/mpxy_out.c b/dapl/svc/mpxy_out.c index d015dc3..ce2d4d5 100644 --- a/dapl/svc/mpxy_out.c +++ b/dapl/svc/mpxy_out.c @@ -487,6 +487,23 @@ void m_po_pending_wr(struct mcm_qp *m_qp, int *data) else wc.wc_flags = 0; wc.vendor_err = ret; + if (ret) { + mlog(0, "[%d:%d:%d] ERR %s_RW_post: WR[%d] wr_id %p flgs 0x%x," + " pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, + m_qp->r_entry.tid, + (MXF_EP(&m_qp->cm->msg.daddr1)) ? 
"po_pi":"po_direct", + m_wr->w_idx, m_wr->wr.wr_id, m_wr->wr.send_flags, + m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl, + m_wr->wr.sg_list->length, m_wr->m_idx); + mlog(0, "[%d:%d:%d] ERR wr_id %Lx next %p sglist %p sge %d op %d flgs" + " %d idata 0x%x raddr %p rkey %x \n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, + m_qp->r_entry.tid, m_wr->wr.wr_id, m_wr->wr.next, + m_wr->wr.sg_list, m_wr->wr.num_sge, m_wr->wr.opcode, + m_wr->wr.send_flags, m_wr->wr.imm_data, + m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey); + } mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1); } @@ -565,7 +582,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp off_t l_off, r_off; uint64_t total_offset; int l_start, l_end, l_len, cacheln_off, seg_len; - struct mcm_wr *m_wr; + struct mcm_wr *m_wr = NULL; struct ibv_sge *m_sge; mlog(4, " q_id %d, q_ctx %p, len %d, wr_id %p, sge %d, op %x flgs %x wr_idx %d\n", @@ -609,7 +626,7 @@ int m_po_proxy_data(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg, struct mcm_qp *m_qp } write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); mpxy_unlock(&m_qp->txlock); - sched_yield(); + sleep_usec(1000); mpxy_lock(&m_qp->txlock); } if (retries) { @@ -919,6 +936,18 @@ bail: else wc.wc_flags = 0; wc.vendor_err = ret; + + mlog(0, "[%d:%d:%d] ERR %s_RF_post: WR[%d] qp %p wr_id %p, " + " post %d hd %d tl %d sz %d \n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid, + (MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct", + m_wr ? m_wr->w_idx:0, m_qp, m_wr, pmsg->wr.wr_id, + m_qp->post_cnt, m_qp->wr_hd, m_qp->wr_tl, wc.byte_len); + mlog(0, "[%d:%d:%d] ERR m_wr: raddr %Lx rkey 0x%x, ib_wr: raddr %Lx rkey 0x%x\n", + m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid, m_qp->r_entry.tid, + pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey, + m_wr ? m_wr->wr.wr.rdma.remote_addr:0, m_wr ? m_wr->wr.wr.rdma.rkey:0); + mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1); } -- 2.41.0