msg.qp_t.max_recv_sge = attr->cap.max_recv_sge;
msg.qp_t.scq_id = req_cq->cq_id;
+ dapl_log(DAPL_DBG_TYPE_EXTENSION, " MIX_QP_CREATE: QP_r - qpn 0x%x, ctx %p, rq %d,%d sq %d,%d rcq_id %d\n",
+ msg.qp_r.qp_num, msg.qp_r.ctx, msg.qp_r.max_recv_wr,
+ msg.qp_r.max_recv_sge, msg.qp_r.max_send_wr,
+ msg.qp_r.max_send_sge, msg.qp_r.rcq_id);
+
+ dapl_log(DAPL_DBG_TYPE_EXTENSION, " MIX_QP_CREATE: QP_t - wr %d sge %d inline %d\n",
+ msg.qp_t.max_send_wr, msg.qp_t.max_send_sge,
+ msg.qp_t.max_inline_data);
+
len = sizeof(dat_mix_qp_t);
ret = scif_send(mix_ep, &msg, len, SCIF_SEND_BLOCK);
if (ret != len) {
{
dat_mix_dto_comp_t msg;
scif_epd_t mix_ep = m_cq->tp->scif_ep;
+ DAPL_COOKIE *cookie;
int ret, len;
/* request */
msg.hdr.ver, msg.hdr.op, msg.hdr.flags, msg.hdr.status);
return -1;
}
- if (msg.wc_cnt == 1)
+ if (msg.wc_cnt == 1) {
memcpy(wc, msg.wc, sizeof(*wc));
+ /* possible segmentation on mpxyd side, update length if success */
+ if (wc->status == 0) {
+ cookie = (DAPL_COOKIE *) (uintptr_t) wc->wr_id;
+ wc->byte_len = cookie->val.dto.size;
+ }
+ }
dapl_log(DAPL_DBG_TYPE_EXTENSION," received reply on SCIF EP, result = %d\n", msg.wc_cnt);
return msg.wc_cnt;
memcpy(&msg.wr, wr, sizeof(*wr));
if (wr->opcode == IBV_WR_SEND) {
- msg.hdr.flags |= MIX_OP_INLINE; /* copy message for now */
+ msg.hdr.flags |= MIX_OP_INLINE; /* copy message for now; TODO: move data the same way as writes to preserve ordering */
msg.hdr.op = MIX_SEND;
- } else if (wr->opcode == IBV_WR_RDMA_WRITE) {
+ } else if (wr->opcode == IBV_WR_RDMA_WRITE ||
+ wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) {
#if DEBUG_DATA
char *sbuf = (char*)wr->sg_list[0].addr; /* DEBUG mpi */
dapl_log(DAPL_DBG_TYPE_EXTENSION,
" mix_write: WR=%p, sge[0] 1st byte 0x%x last byte 0x%x \n",
wr->wr_id, sbuf[0], sbuf[wr->sg_list[0].length-1]);
#endif
- /* TODO: send over info and do a readfrom instead ?? */
if (mix_proxy_write(m_qp, &msg, wr, txlen, mix_ep))
return -1;
{
int len, ret, i;
struct dcm_ib_cq *m_cq;
+ DAPL_COOKIE *cookie;
/* hdr already read, get operation data */
len = sizeof(dat_mix_dto_comp_t) - sizeof(dat_mix_hdr_t);
/* Get cq and post DTO event with this WC entry */
m_cq = (void*)pmsg->cq_ctx;
- for (i=0; i<pmsg->wc_cnt; i++)
+ for (i=0; i<pmsg->wc_cnt; i++) {
+ /* possible segmentation on mpxyd side, update length if success */
+ if (pmsg->wc[i].status == 0) {
+ cookie = (DAPL_COOKIE *) (uintptr_t) pmsg->wc[i].wr_id;
+ pmsg->wc[i].byte_len = cookie->val.dto.size;
+ }
dapls_evd_cqe_to_event(m_cq->evd, &pmsg->wc[i]);
+ }
return 0;
}
/* scif-rdma cmd and data channel parameters */
static int mix_align = 64;
static int mix_buffer_mb = 128;
-static int mix_buffer_sg = 128 * 1024;
+static int mix_buffer_sg = 1048576;
static int mix_cmd_depth = 50;
static int mix_cmd_size = 256;
static int mix_shared_buffer = 1;
memset((void *)&qp_create, 0, sizeof(qp_create));
qp_create.cap.max_recv_wr = pmsg->qp_t.max_recv_wr;
qp_create.cap.max_recv_sge = pmsg->qp_t.max_recv_sge;
- qp_create.cap.max_send_wr = pmsg->qp_t.max_send_wr * (mix_max_msg_mb*1024*1024/mix_buffer_sg);
+ qp_create.cap.max_send_wr = pmsg->qp_t.max_send_wr * 8; /* max of 8 segments per wr */
qp_create.cap.max_send_sge = pmsg->qp_t.max_send_sge;
qp_create.cap.max_inline_data = 0; /* better bandwidth without inline */
qp_create.qp_type = IBV_QPT_RC;
- mlog(1, " QP_t - max_wr %d adjusted for segmentation\n",
- pmsg->qp_t.max_send_wr, pmsg->qp_t.max_send_sge,
- pmsg->qp_t.max_inline_data);
+ mlog(1, " QP_t - max_wr %d adjusted for segmentation, inline == 0\n",
+ qp_create.cap.max_send_wr);
pmsg->hdr.status = m_qp_create(smd, &qp_create, pmsg->qp_t.scq_id, &new_mqp);
if (pmsg->hdr.status)
m_wr = (struct ibv_send_wr *)(m_qp->wr_buf + (DAT_MCM_WR * wr_idx));
while ((m_wr->wr_id != m_wr->wr.atomic.swap) && (--poll_cnt));
- mlog(1, " wr_id %p poll_cnt %d\n", m_wr->wr.atomic.swap, poll_cnt);
+ mlog(1, " wr_id %#016Lx poll_cnt %d\n", m_wr->wr.atomic.swap, poll_cnt);
- if (poll_cnt == 0) poll_cnt = 100;
+ if (poll_cnt == 0) poll_cnt = 50;
if (m_wr->wr_id == m_wr->wr.atomic.swap) {
char *sbuf = (char*)m_wr->sg_list->addr;
- mlog(1, " m_wr %p data ready for IB write, 1st byte 0x%x last byte 0x%x \n",
- m_wr->wr_id, sbuf[0], sbuf[m_wr->sg_list->length-1]);
+ mlog(1, " m_wr %p wr_id %#016Lx data ready for IB write, 1st byte 0x%x last byte 0x%x, ln=%d\n",
+ m_wr, m_wr->wr_id, sbuf[0], sbuf[m_wr->sg_list->length-1], m_wr->sg_list->length);
- mlog(1, " wr_id %p next %p sglist %p sge %d op %d flgs %d idata 0x%x raddr %p rkey %x\n",
+ mlog(1, " wr_id %#016Lx next %p sglist %p sge %d op %d flgs %d idata 0x%x raddr %p rkey %x\n",
m_wr->wr_id, m_wr->next, m_wr->sg_list, m_wr->num_sge, m_wr->opcode,
m_wr->send_flags, m_wr->imm_data, m_wr->wr.rdma.remote_addr, m_wr->wr.rdma.rkey);
wc.vendor_err = ret;
mix_dto_event(m_qp->ib_qp->send_cq->cq_context, &wc, 1);
}
- mlog(1, " - wr_id %p posted\n", m_wr->wr_id);
+ mlog(1, " - wr %p wr_id %#016Lx posted tl=%d hd=%d\n",
+ m_wr, m_wr->wr_id, m_qp->wr_tl, m_qp->wr_hd);
+
m_wr->wr_id = 0;
posted++;
if (++m_qp->wr_tl == m_qp->wr_end)
*data = *data - 1;
}
if (!posted) {
- mlog(1, " - wr_id %p still not ready\n", m_wr->wr.atomic.swap);
+ mlog(1, " - wr_id %#016Lx still not ready\n", m_wr->wr.atomic.swap);
break;
}
- if (++wr_idx == m_qp->wr_end) /* posted WR, move tail */
+ if (++wr_idx == m_qp->wr_end) /* posted WR, move to next */
wr_idx = 0;
}
m_qp = get_next_entry(&m_qp->entry, &smd->qplist);
l_end = l_start + seg_len;
if (l_start < smd->m_tl && l_end > smd->m_tl) {
- mlog(0, " mix_post_write stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d\n",
+ mlog(0, " ERR: mix_post_write stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d\n",
smd->m_hd, smd->m_tl, seg_len);
return -1; /* todo queue up, don't fail */
}
l_end = l_start + seg_len;
if (l_start < m_qp->m_tl && l_end > m_qp->m_tl) {
- mlog(0, " mix_post_write stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d\n",
+ mlog(0, " ERR: mix_post_write stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d\n",
m_qp->m_hd, m_qp->m_tl, seg_len);
return -1; /* todo queue up, don't fail */
}
}
/* remove special flags unless it's the last segment */
+ /* NON-COMPLIANT: if a WR with IMM data is segmented, the RDMA length reported at the receiver will be wrong */
if (l_len || i != pmsg->wr.num_sge -1) {
if (m_wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
m_wr->opcode = IBV_WR_RDMA_WRITE;
- m_wr->send_flags &= IBV_SEND_INLINE;
+ m_wr->send_flags &= IBV_SEND_INLINE;
}
*data = *data + 1;
char *sbuf = (char*)m_wr->sg_list->addr;
mlog(1, " m_wr %p data ready for IB write, 1st byte 0x%x last byte 0x%x \n",
m_wr->wr_id, sbuf[0], sbuf[m_wr->sg_list->length-1]);
-
mlog(1, " wr_id %p next %p sglist %p sge %d op %d flgs %d idata 0x%x raddr %p rkey %x\n",
m_wr->wr_id, m_wr->next, m_wr->sg_list, m_wr->num_sge, m_wr->opcode,
m_wr->send_flags, m_wr->imm_data, m_wr->wr.rdma.remote_addr, m_wr->wr.rdma.rkey);
}
}
#endif
-
mlog(1, " exit: q_id %d, q_ctx %p, len %d, wr_hd = %d\n",
pmsg->qp_id, (void*)pmsg->qp_ctx, pmsg->len, m_qp->wr_hd);
return 0;
} else
notify = 0;
+ /* NOTE: if the WR was segmented the completion length must be updated; no cookie context is available here, so leave the fixup to the mcm client */
mix_dto_event(m_cq, wc, ret);
goto retry;
}