cm->msg.ver = htons(DAT_MCM_VER);
cm->msg.s_id = htonl(dapl_os_getpid()); /* process id for src id */
cm->msg.sys_guid = hca->ib_trans.sys_guid;
+ cm->msg.seg_sz = DAT_MCM_SEG_PO2;
/* ACTIVE: init source address QP info from local EP */
if (ep) {
}
/* CM_REP: save remote address information to EP and CM */
+ if (msg->seg_sz) /* set po2 seg_sz, if provided */
+ cm->msg.seg_sz = msg->seg_sz;
+
cm->msg.d_id = msg->s_id;
dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+ ep->qp_handle->seg_sz = (1 << cm->msg.seg_sz);
/* post 0-byte rcv for inbound WC's via RW_imm */
if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
goto bail;
- dapl_log(DAPL_DBG_TYPE_CM, "CONN_RTU: WR_rem %p sz %d, WC %p sz %d\n",
+ dapl_log(DAPL_DBG_TYPE_CM, "CONN_RTU: WR_rem %p sz %d, WC %p sz %d, sg %d\n",
ep->qp_handle->wrc_rem.wr_addr,
ep->qp_handle->wrc_rem.wr_end+1,
ep->qp_handle->wrc.wc_addr,
- ep->qp_handle->wrc.wc_end+1);
+ ep->qp_handle->wrc.wc_end+1,
+ ep->qp_handle->seg_sz);
}
}
dapl_os_unlock(&cm->ep->header.lock);
acm->msg.p_size = msg->p_size;
acm->msg.d_id = msg->s_id;
acm->msg.rd_in = msg->rd_in;
+ if (msg->seg_sz) /* set po2 seg_sz, if provided */
+ acm->msg.seg_sz = msg->seg_sz;
/* CR saddr1 is CM daddr1 info, need EP for local saddr1 */
dapl_os_memcpy(&acm->msg.daddr1, &msg->saddr1, sizeof(dat_mcm_addr_t));
mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+ ep->qp_handle->seg_sz = (1 << cm->msg.seg_sz);
/* post 0-byte rcv for inbound WC's via RW_imm */
if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
goto bail;
dapl_log(DAPL_DBG_TYPE_CM,
- "ACCEPT_USR: WR_rem %p rkey %x sz %d, WC %p rkey %x sz %d\n",
+ "ACCEPT_USR: WR_rem %p rkey %x sz %d, WC %p rkey %x sz %d sg %d\n",
ep->qp_handle->wrc_rem.wr_addr,
ep->qp_handle->wrc_rem.wr_rkey,
ep->qp_handle->wrc_rem.wr_end+1,
ep->qp_handle->wrc.wc_addr,
ep->qp_handle->wrc.wc_rkey,
- ep->qp_handle->wrc.wc_end+1);
+ ep->qp_handle->wrc.wc_end+1,
+ ep->qp_handle->seg_sz);
}
}
dapl_os_unlock(&ep->header.lock);
* non-MIC host to MIC cross socket EP needs to send WR to remote PI service
* instead of direct IB send or write. Inbound traffic from remote MXS will still be
- * be direct so there is no need for PI service on this MCM providers host side.
+ * direct so there is no need for PI service on this MCM provider's host side.
- *
- * NOTE: Initial design with no segmentation, set frequent PI MP signal rate
- * This will avoid creation and management of a local PO WR queue for segments
*/
#define MCM_MP_SIG_RATE 5
-int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
+int mcm_send_pi(struct dcm_ib_qp *m_qp,
+ int len,
+ struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad_wr)
{
struct ibv_send_wr wr_imm;
struct ibv_sge sge;
struct mcm_wr_rx m_wr_rx;
- int i, ret = 0, wr_idx;
+ int i, l_len, seg_len, ret = 0, wr_idx;
struct wrc_idata wrc;
- uint32_t wr_flags, offset=0;
+ uint32_t wr_flags, l_off, r_off = 0;
+ uint64_t l_addr;
dapl_log(DAPL_DBG_TYPE_EP,
- " mcm_send_pi: ep %p qpn %x ln %d WR: tl %d hd %d end %d wr_id %Lx\n",
- m_qp->ep, m_qp->qp2->qp_num, len, m_qp->wr_tl,
- m_qp->wr_hd, m_qp->wrc_rem.wr_end, wr->wr_id);
+ " mcm_send_pi: ep %p qpn %x ln %d sge %d sg %d"
+ " WR: tl %d hd %d end %d wr_id %Lx\n",
+ m_qp->ep, m_qp->qp2->qp_num, len, wr->num_sge,
+ m_qp->seg_sz, m_qp->wr_tl, m_qp->wr_hd,
+ m_qp->wrc_rem.wr_end, wr->wr_id);
if (wr->num_sge > DAT_MIX_SGE_MAX) {
ret = EINVAL;
goto bail;
}
+
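- /* one WR per IB sge, no additional segmentation */
+ /* per IB sge: IBV_WR_SEND posts a single WR; other opcodes are split into WRs of at most seg_sz bytes */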
for (i=0;i<wr->num_sge;i++) {
wr_flags = M_SEND_DIRECT | M_SEND_PI;
if (i==0) wr_flags |= M_SEND_FS;
+
+ l_len = wr->sg_list[i].length;
+ l_addr = wr->sg_list[i].addr;
+ l_off = 0;
+
if (i==(wr->num_sge-1)) {
wr_flags |= M_SEND_LS;
if (wr->send_flags & IBV_SEND_SIGNALED)
wr_flags |= M_SEND_CN_SIG;
}
- dapl_os_lock(&m_qp->lock);
- if (((m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end) == m_qp->wr_tl) { /* full */
- ret = ENOMEM;
+
+ while (l_len) {
+ if (wr->opcode == IBV_WR_SEND)
+ seg_len = l_len;
+ else
+ seg_len = (l_len > m_qp->seg_sz) ? m_qp->seg_sz : l_len;
+
+ dapl_os_lock(&m_qp->lock);
+ if (((m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end) == m_qp->wr_tl) { /* full */
+ ret = ENOMEM;
+ dapl_os_unlock(&m_qp->lock);
+ goto bail;
+ }
+ m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end; /* move hd */
+ wr_idx = m_qp->wr_hd;
+ if (!(wr_idx % MCM_MP_SIG_RATE) || (wr_flags & M_SEND_CN_SIG))
+ wr_flags |= M_SEND_MP_SIG;
dapl_os_unlock(&m_qp->lock);
- goto bail;
- }
- m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end; /* move hd */
- wr_idx = m_qp->wr_hd;
- if (!(wr_idx % MCM_MP_SIG_RATE) || (wr_flags & M_SEND_CN_SIG))
- wr_flags |= M_SEND_MP_SIG;
- dapl_os_unlock(&m_qp->lock);
-
- dapl_log(DAPL_DBG_TYPE_EVD,
- " mcm_send_pi[%d]: ln %d wr_idx %d, tl %d hd %d\n",
- i, wr->sg_list[i].length, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
-
- /* build local m_wr_rx for remote PI */
- memset((void*)&m_wr_rx, 0, sizeof(struct mcm_wr_rx));
- m_wr_rx.org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
- m_wr_rx.flags = htonl(wr_flags);
- m_wr_rx.w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
- m_wr_rx.wr.num_sge = htonl(wr->num_sge);
- m_wr_rx.wr.opcode = htonl(wr->opcode);
- m_wr_rx.wr.send_flags = htonl(wr->send_flags);
- m_wr_rx.wr.imm_data = htonl(wr->imm_data);
- m_wr_rx.sg[0].addr = htonll(wr->sg_list[i].addr);
- m_wr_rx.sg[0].lkey = htonl(wr->sg_list[i].lkey);
- m_wr_rx.sg[0].length = htonl(wr->sg_list[i].length);
-
- if ((wr->opcode == IBV_WR_RDMA_WRITE) ||
- (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)) {
- m_wr_rx.wr.wr.rdma.remote_addr = htonll(wr->wr.rdma.remote_addr + offset);
- m_wr_rx.wr.wr.rdma.rkey = htonl(wr->wr.rdma.rkey);
- offset += wr->sg_list[i].length;
- }
- /* setup imm_data for PI rcv engine */
- wrc.id = (uint16_t)wr_idx;
- wrc.type = M_WR_TYPE;
- wrc.flags = 0;
-
- /* setup local WR for wr_rx transfer - RW_imm inline */
- wr_imm.wr_id = wr->wr_id; /* MUST be original cookie, CQ processing */
- wr_imm.next = 0;
- wr_imm.sg_list = &sge;
- wr_imm.num_sge = 1;
- wr_imm.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
- wr_imm.send_flags = IBV_SEND_INLINE; /* m_wr_rx, 148 bytes */
- if (wr_flags & M_SEND_MP_SIG)
- wr_imm.send_flags |= IBV_SEND_SIGNALED;
- wr_imm.imm_data = htonl(*(uint32_t *)&wrc);
- wr_imm.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
- wr_imm.wr.rdma.remote_addr =
- (uint64_t)(uintptr_t)
- ((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
-
- sge.addr = (uint64_t)(uintptr_t) &m_wr_rx;
- sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
- sge.lkey = 0; /* inline doesn't need registered */
-
- dapl_log(DAPL_DBG_TYPE_EVD,
- " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
- " imm %x raddr %p rkey %x ln %d\n",
- i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
- wr_flags, ntohl(wr_imm.imm_data),
- wr_imm.wr.rdma.remote_addr, wr_imm.wr.rdma.rkey,
- sizeof(struct mcm_wr_rx));
- dapl_log(DAPL_DBG_TYPE_EVD,
- " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
- " imm %x raddr %p rkey %x ln %d tl %d me %d hd %d\n",
- i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
- wr->send_flags, wr->imm_data, wr->wr.rdma.remote_addr,
- wr->wr.rdma.rkey, wr->sg_list[i].length,
- m_qp->wr_tl, wr_idx, m_qp->wr_hd);
-
- ret = ibv_post_send(m_qp->qp2, &wr_imm, bad_wr); /* QP2: QPtx - QPrx PI */
- if (ret) {
- dapl_log(DAPL_DBG_TYPE_ERR,
- " mcm_send_pi ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
- " tl %d hd %d\n",
- m_wr_rx, wr_idx, wr->sg_list[0].addr,
- wr->sg_list[0].length, wr->sg_list[0].lkey,
- m_wr_rx.flags, m_qp->wr_tl, m_qp->wr_hd);
- dapl_log(DAPL_DBG_TYPE_ERR,
- " mcm_send_pi ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
- " idata 0x%x raddr %p rkey %x \n",
- m_wr_rx.wr.wr_id, wr->sg_list,
- m_wr_rx.wr.num_sge, m_wr_rx.wr.opcode,
- m_wr_rx.wr.send_flags, m_wr_rx.wr.imm_data,
- m_wr_rx.wr.wr.rdma.remote_addr,
- m_wr_rx.wr.wr.rdma.rkey);
- goto bail;
- }
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " mcm_send_pi[%d]: seg_ln %d wr_idx %d, tl %d hd %d\n",
+ i, seg_len, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
+
+ /* build local m_wr_rx for remote PI */
+ memset((void*)&m_wr_rx, 0, sizeof(struct mcm_wr_rx));
+ m_wr_rx.org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
+ m_wr_rx.flags = htonl(wr_flags);
+ m_wr_rx.w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
+ m_wr_rx.wr.num_sge = htonl(wr->num_sge);
+ m_wr_rx.wr.opcode = htonl(wr->opcode);
+ m_wr_rx.wr.send_flags = htonl(wr->send_flags);
+ m_wr_rx.wr.imm_data = htonl(wr->imm_data);
+ m_wr_rx.sg[0].addr = htonll(l_addr + l_off);
+ m_wr_rx.sg[0].lkey = htonl(wr->sg_list[i].lkey);
+ m_wr_rx.sg[0].length = htonl(seg_len);
+
+ if ((wr->opcode == IBV_WR_RDMA_WRITE) ||
+ (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)) {
+ m_wr_rx.wr.wr.rdma.remote_addr = htonll(wr->wr.rdma.remote_addr + r_off);
+ m_wr_rx.wr.wr.rdma.rkey = htonl(wr->wr.rdma.rkey);
+ }
+
+ /* setup imm_data for PI rcv engine */
+ wrc.id = (uint16_t)wr_idx;
+ wrc.type = M_WR_TYPE;
+ wrc.flags = 0;
+
+ /* setup local WR for wr_rx transfer - RW_imm inline */
+ wr_imm.wr_id = wr->wr_id; /* MUST be original cookie, CQ processing */
+ wr_imm.next = 0;
+ wr_imm.sg_list = &sge;
+ wr_imm.num_sge = 1;
+ wr_imm.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+ wr_imm.send_flags = IBV_SEND_INLINE; /* m_wr_rx, 148 bytes */
+ if (wr_flags & M_SEND_MP_SIG)
+ wr_imm.send_flags |= IBV_SEND_SIGNALED;
+ wr_imm.imm_data = htonl(*(uint32_t *)&wrc);
+ wr_imm.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
+ wr_imm.wr.rdma.remote_addr =
+ (uint64_t)(uintptr_t)
+ ((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
+
+ sge.addr = (uint64_t)(uintptr_t) &m_wr_rx;
+ sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
+ sge.lkey = 0; /* inline doesn't need registered */
+
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
+ " imm %x laddr %p raddr %p rkey %x wr_ln %d ln %d\n",
+ i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
+ wr_flags, ntohl(wr_imm.imm_data),
+ l_addr + l_off, wr_imm.wr.rdma.remote_addr,
+ wr_imm.wr.rdma.rkey, sizeof(struct mcm_wr_rx), l_len);
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
+ " imm %x raddr %p rkey %x sg_ln %d tl %d me %d hd %d\n",
+ i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
+ wr->send_flags, wr->imm_data,
+ wr->wr.rdma.remote_addr + r_off,
+ wr->wr.rdma.rkey, seg_len, m_qp->wr_tl, wr_idx, m_qp->wr_hd);
+
+ ret = ibv_post_send(m_qp->qp2, &wr_imm, bad_wr); /* QP2: QPtx - QPrx PI */
+ if (ret) {
+ dapl_log(DAPL_DBG_TYPE_ERR,
+ " mcm_send_pi ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
+ " tl %d hd %d\n",
+ m_wr_rx, wr_idx, wr->sg_list[0].addr,
+ wr->sg_list[0].length, wr->sg_list[0].lkey,
+ m_wr_rx.flags, m_qp->wr_tl, m_qp->wr_hd);
+ dapl_log(DAPL_DBG_TYPE_ERR,
+ " mcm_send_pi ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
+ " idata 0x%x raddr %p rkey %x \n",
+ m_wr_rx.wr.wr_id, wr->sg_list,
+ m_wr_rx.wr.num_sge, m_wr_rx.wr.opcode,
+ m_wr_rx.wr.send_flags, m_wr_rx.wr.imm_data,
+ m_wr_rx.wr.wr.rdma.remote_addr,
+ m_wr_rx.wr.wr.rdma.rkey);
+ goto bail;
+ }
+ l_len -= seg_len;
+ l_off += seg_len;
+ r_off += seg_len;
+
+ } /* wire segmentation of each IOV segment */
}
bail:
return ret;