From: Arlin Davis Date: Fri, 24 Jul 2015 23:01:29 +0000 (-0700) Subject: mcm: add intra-node support via ibscif device and mcm provider X-Git-Tag: dapl-2.1.6-1~16 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=03f3b77c6061380b1130c5df95c1808d679dc455;p=~ardavis%2Fdapl.git mcm: add intra-node support via ibscif device and mcm provider - New device entry ofa-v2-scif0-m - Support for different CM and EP locality (MIC vs proxy LID) - MSS mode for all scif device opens via proxy - logging changes for multi-lid options Signed-off-by: Arlin Davis --- diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c index cc67b77..17dadd6 100644 --- a/dapl/openib_mcm/cm.c +++ b/dapl/openib_mcm/cm.c @@ -478,7 +478,7 @@ retry_listenq: if (!listenq && cm->msg.sport == msg->dport && cm->msg.sqpn == msg->dqpn && cm->msg.dport == msg->sport && cm->msg.dqpn == msg->sqpn && - cm->msg.daddr1.lid == msg->saddr1.lid) { + cm->msg.daddr2.lid == msg->saddr2.lid) { if (ntohs(msg->op) != MCM_REQ) { found = cm; break; @@ -492,13 +492,13 @@ retry_listenq: cm, dapl_cm_op_str(ntohs(msg->op)), dapl_cm_op_str(ntohs(cm->msg.op)), dapl_cm_state_str(cm->state), - ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport), + ntohs(cm->msg.daddr2.lid), ntohs(cm->msg.dport), ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr1.qpn), - ntohs(msg->saddr1.lid), ntohs(msg->sport), + ntohs(msg->saddr2.lid), ntohs(msg->sport), ntohl(msg->sqpn), ntohl(msg->saddr1.qpn), - ntohs(msg->daddr1.lid), ntohs(msg->dport), + ntohs(msg->daddr2.lid), ntohs(msg->dport), ntohl(msg->dqpn), ntohl(msg->daddr1.qpn), - ntohs(cm->msg.saddr1.lid), ntohs(cm->msg.sport), + ntohs(cm->msg.saddr2.lid), ntohs(cm->msg.sport), ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr1.qpn)); DAPL_CNTR(((DAPL_IA *)dapl_llist_peek_head(&cm->hca->ia_list_head)), @@ -524,9 +524,9 @@ retry_listenq: " mcm_recv: NO LISTENER for %s %x %x i%x c%x" " < %x %x %x, sending reject\n", dapl_cm_op_str(ntohs(msg->op)), - ntohs(msg->daddr1.lid), ntohs(msg->dport), + ntohs(msg->daddr2.lid), ntohs(msg->dport), ntohl(msg->daddr1.qpn), ntohl(msg->sqpn), - ntohs(msg->saddr1.lid), ntohs(msg->sport), + ntohs(msg->saddr2.lid), ntohs(msg->sport), ntohl(msg->saddr1.qpn)); mcm_reject(tp, msg); @@ -537,9 +537,9 @@ retry_listenq: " NO MATCH: op %s [lid, port, cqp, iqp, pid]:" " %x %x %x %x %x <- %x %x %x %x l_pid %x r_pid %x\n", dapl_cm_op_str(ntohs(msg->op)), - ntohs(msg->daddr1.lid), ntohs(msg->dport), + ntohs(msg->daddr2.lid), ntohs(msg->dport), ntohl(msg->dqpn), ntohl(msg->daddr1.qpn), - ntohl(msg->d_id), ntohs(msg->saddr1.lid), + ntohl(msg->d_id), ntohs(msg->saddr2.lid), ntohs(msg->sport), ntohl(msg->sqpn), ntohl(msg->saddr1.qpn), ntohl(msg->s_id), ntohl(msg->d_id)); @@ -617,7 +617,7 @@ static int mcm_send(ib_hca_transport_t *tp, dat_mcm_msg_t *msg, DAT_PVOID p_data struct ibv_send_wr wr, *bad_wr; struct ibv_sge sge; int len, ret = -1; - uint16_t dlid = ntohs(msg->daddr1.lid); + uint16_t dlid = ntohs(msg->daddr2.lid); /* Get message from send queue, copy data, and send */ dapl_os_lock(&tp->slock); @@ -987,16 +987,28 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm) DAT_RETURN dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm) { + /* ibscif: intra-node, MIC lid != HST lid, + * HST->HST/MIC (CM always HST lid) + * CM locality via addr2 + * EP locality via addr1 + */ + if (!strncmp(cm->hca->name, "scif", 4) && + HST_EP(&cm->hca->ib_trans.addr)) { + cm->msg.daddr2.lid = cm->tp->addr.lid; + memcpy(cm->msg.daddr2.gid, cm->tp->addr.gid, 16); + } + dapl_log(DAPL_DBG_TYPE_CM, " MCM connect: lid %x QPr %x QPt %x lport %x p_sz=%d -> " - " lid %x c_qpn %x rport %x ep_map %d %s -> %d %s, retries=%d\n", + " lid %x clid %x cqpn %x rport %x, %s -> %s" + " retries=%d\n", htons(cm->tp->addr.lid), htonl(cm->msg.saddr1.qpn), htonl(cm->msg.saddr2.qpn), htons(cm->msg.sport), htons(cm->msg.p_size), - htons(cm->msg.daddr1.lid), htonl(cm->msg.dqpn), - htons(cm->msg.dport), - cm->tp->addr.ep_map, mcm_map_str(cm->tp->addr.ep_map), - cm->msg.daddr1.ep_map, mcm_map_str(cm->msg.daddr1.ep_map), + htons(cm->msg.daddr1.lid), htons(cm->msg.daddr2.lid), + htonl(cm->msg.dqpn), htons(cm->msg.dport), + mcm_map_str(cm->tp->addr.ep_map), + mcm_map_str(cm->msg.daddr1.ep_map), cm->tp->retries); dapl_os_lock(&cm->lock); @@ -1639,6 +1651,8 @@ dapls_ib_connect(IN DAT_EP_HANDLE ep_handle, dapl_os_memcpy(&cm->msg.daddr1, r_addr, sizeof(struct dat_mcm_addr)); dapl_os_memcpy(&cm->msg.daddr2, r_addr, sizeof(struct dat_mcm_addr)); + /* HST dev scif0, remote LID is host proxy, not MIC */ + /* validate port and ep_map range */ if ((mcm_ia->port > 2) || (mcm_ia->ep_map > 3)) cm->msg.daddr1.ep_map = 0; diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c index 1f0de05..c67b16f 100644 --- a/dapl/svc/mcm.c +++ b/dapl/svc/mcm.c @@ -163,6 +163,12 @@ int mcm_init_cm_service(mcm_ib_dev_t *md) md->cqe = mcm_depth; md->signal = mcm_signal; + /* ibscif: MIC and HST different, save MIC addresses */ + if (!strncmp(md->ibdev->name, "scif", 4)) { + md->m_lid = md->addr.lid; + memcpy(md->m_gid, md->addr.gid, 16); + } + /* Save addr information */ /* get lid for this hca-port, convert to network order */ if (ibv_query_port(md->ibctx, md->port, &port_attr)) { @@ -554,14 +560,23 @@ mcm_cm_t *m_cm_create(mcm_scif_dev_t *smd, mcm_qp_t *m_qp, dat_mcm_addr_t *r_add /* MPXYD SRC IB info, QP2t = saddr2 all cases */ cm->msg.saddr2.qpn = htonl(m_qp->ib_qp2->qp_num); cm->msg.saddr2.qp_type = m_qp->qp_attr2.qp_type; - cm->msg.saddr2.lid = smd->md->addr.lid; - cm->msg.saddr2.ep_map = smd->md->addr.ep_map; - memcpy(&cm->msg.saddr2.gid[0], &smd->md->addr.gid, 16); + cm->msg.saddr2.ep_map = smd->md->addr.ep_map; /* MPXYD RCV IB info */ - cm->msg.saddr1.lid = smd->md->addr.lid; cm->msg.saddr1.ep_map = smd->md->addr.ep_map; - memcpy(&cm->msg.saddr1.gid[0], &smd->md->addr.gid, 16); + + /* intra-node: QPt addr 2 - ibscif HST lid != MICx lid's */ + if (smd->md->m_lid) { + cm->msg.saddr1.lid = smd->md->m_lid; + cm->msg.saddr2.lid = smd->md->addr.lid; + memcpy(&cm->msg.saddr1.gid[0], &smd->md->m_gid, 16); + memcpy(&cm->msg.saddr2.gid[0], &smd->md->addr.gid, 16); + } else { + cm->msg.saddr1.lid = smd->md->lid; + cm->msg.saddr2.lid = smd->md->lid; + memcpy(&cm->msg.saddr1.gid[0], &smd->md->addr.gid, 16); + memcpy(&cm->msg.saddr2.gid[0], &smd->md->addr.gid, 16); + } /* MSS, QPr is on MIC, QP1r == saddr1 */ if (MSS_EP(l_addr)) { @@ -777,6 +792,10 @@ static int mcm_send(mcm_ib_dev_t *md, dat_mcm_msg_t *msg, DAT_PVOID p_data, DAT_ int len, ret = -1; uint16_t dlid = ntohs(msg->daddr1.lid); + /* intra-node, CM on host side, HST lid != MIC lid */ + if (md->m_lid) + dlid = ntohs(md->addr.lid); + /* Get message from send queue, copy data, and send */ mpxy_lock(&md->txlock); if ((smsg = mcm_get_smsg(md)) == NULL) { @@ -813,7 +832,7 @@ static int mcm_send(mcm_ib_dev_t *md, dat_mcm_msg_t *msg, DAT_PVOID p_data, DAT_ } mlog(8," cm_send: op %s ln %d lid %x c_qpn %x rport %x, p_size %d\n", - mcm_op_str(ntohs(smsg->op)), sge.length, ntohs(smsg->daddr1.lid), + mcm_op_str(ntohs(smsg->op)), sge.length, dlid, ntohl(smsg->dqpn), ntohs(smsg->dport), p_size); /* empty slot, then create AH */ diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c index bfeecbb..5c8cb21 100644 --- a/dapl/svc/mix.c +++ b/dapl/svc/mix.c @@ -1165,10 +1165,16 @@ static int mix_cm_req_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc mcm_hton_wrc((mcm_wrc_info_t *)m_cm->msg.p_proxy, &m_qp->wrc); /* PI WR/WC raddr,rkey info */ m_cm->msg.seg_sz = mix_buffer_sg_po2; + /* intra-node; set QPt to HST lid */ + if (smd->md->m_lid) { + m_cm->msg.saddr2.lid = smd->md->addr.lid; + memcpy(&m_cm->msg.saddr2.gid[0], &smd->md->addr.gid, 16); + } + mlog(2," QP2 0x%x QP1 0x%x:" " CM sPORT 0x%x sQPN 0x%x sLID 0x%x - dPORT 0x%x dQPN 0x%x dLID 0x%x, psz %d %s\n", m_cm->msg.saddr2.qpn, m_cm->msg.saddr1.qpn, - ntohs(m_cm->msg.sport), ntohl(m_cm->msg.sqpn), ntohs(m_cm->msg.saddr1.lid), + ntohs(m_cm->msg.sport), ntohl(m_cm->msg.sqpn), ntohs(m_cm->msg.saddr2.lid), ntohs(m_cm->msg.dport), ntohl(m_cm->msg.dqpn), ntohs(m_cm->msg.daddr1.lid), ntohs(m_cm->msg.p_size), mcm_map_str(m_cm->msg.daddr1.ep_map)); @@ -1782,6 +1788,10 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc /* KL to KL, QP1->QP2 and QP1<-QP2 */ /* update the QPt src information in CM msg, QPr updated on MIC */ m_cm->msg.saddr1.ep_map = MIC_SSOCK_DEV; + if (smd->md->m_lid) { + m_cm->msg.saddr1.lid = smd->md->m_lid; + memcpy(&m_cm->msg.saddr1.gid[0], &m_cm->smd->md->m_gid, 16); + } m_cm->msg.saddr2.ep_map = MIC_SSOCK_DEV; m_cm->msg.saddr2.qpn = htonl(m_cm->m_qp->ib_qp2->qp_num); m_cm->msg.saddr2.qp_type = m_cm->m_qp->qp_attr2.qp_type; diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c index 80f597c..cc168f9 100644 --- a/dapl/svc/mpxyd.c +++ b/dapl/svc/mpxyd.c @@ -226,6 +226,8 @@ static struct ibv_context *open_ib_device(struct mcm_ib_dev *md, char *name, int if (!rd_dev_file(md->ibdev->ibdev_path, "device/numa_node", val, sizeof val)) md->numa_node = atoi(val); + else if (!strncmp(name, "scif", 4)) + md->numa_node = md->mc->numa_node; /* intra-node, MSS */ else mlog(0," ERR ibdev %s numa_node at " "%s/device/numa_node unreadable\n", @@ -657,8 +659,8 @@ mcm_scif_dev_t *mix_open_device(dat_mix_open_t *msg, scif_epd_t op_ep, scif_epd_ mcm_scif_dev_t *smd = NULL; int i, ret; - mlog(8, " Open IB device - %s, IB port %d, scif_node %d EPs %d %d %d op_msg %p\n", - msg->name, msg->port, node, op_ep, tx_ep, ev_ep, msg); + mlog(8, " IB device - %s, IB port %d, scif_node %d EPs %d %d %d op_msg %p lid %x\n", + msg->name, msg->port, node, op_ep, tx_ep, ev_ep, msg, ntohs(msg->dev_addr.lid)); mc = &mcm_client_list[node]; @@ -748,6 +750,7 @@ mcm_scif_dev_t *mix_open_device(dat_mix_open_t *msg, scif_epd_t op_ep, scif_epd_ memset(md->cntrs, 0, sizeof(uint64_t) * MCM_ALL_COUNTERS); md->mc = mc; md->port = msg->port; + memcpy(&md->addr, &msg->dev_addr, sizeof(dat_mcm_addr_t)); md->ibctx = open_ib_device(md, msg->name, msg->port); if ((!md->ibctx) || mcm_init_cm_service(md)) { @@ -770,7 +773,13 @@ found: msg->hdr.req_id = smd->entry.tid; msg->hdr.status = MIX_SUCCESS; memcpy(&md->dev_attr, &msg->dev_attr, sizeof(dat_mix_dev_attr_t)); - memcpy(&msg->dev_addr, &md->addr, sizeof(dat_mcm_addr_t)); + memcpy(&msg->dev_addr, &md->addr, sizeof(dat_mcm_addr_t)); /* proxy CM lid */ + + /* intra-node: restore MIC lid, gid */ + if (md->m_lid) { + msg->dev_addr.lid = md->m_lid; + memcpy(msg->dev_addr.gid, md->m_gid, 16); + } err: if (!smd) { mlog(0, " ERR: mix_open_device failed for %s - %d\n", msg->name, msg->port); @@ -789,9 +798,10 @@ err: goto bail; } - mlog(1, " MIC client: open mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d - %s\n", + mlog(1, " MIC client: mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d lid %x %s\n", md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name, - md->numa_node, msg->port, md->addr.ep_map == MIC_SSOCK_DEV ? "MSS":"MXS"); + md->numa_node, msg->port, ntohs(msg->dev_addr.lid), + md->addr.ep_map == MIC_SSOCK_DEV ? "MSS":"MXS"); bail: mpxy_unlock(&mc->oplock); mpxy_unlock(&mc->cmlock); @@ -1218,7 +1228,7 @@ static void mpxy_server(void) set.revents = 0; mlog(0x8, "Server sleep\n"); poll(&set, 1, -1); /* sleep */ - mlog(0x8, "Server wake, cpu_id=%d\n"); + mlog(0x8, "Server wake, cpu_id=%d\n", cpu_id); /* process listens */ if (mcm_poll(scif_listen_ep, POLLIN) == POLLIN) mix_scif_accept(scif_listen_ep); diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h index 58312e4..e17f4b0 100644 --- a/dapl/svc/mpxyd.h +++ b/dapl/svc/mpxyd.h @@ -117,6 +117,8 @@ typedef struct mcm_ib_dev { uint64_t *ports; /* SCIF device open clients, cm_id*/ struct dat_mcm_addr addr; uint16_t lid; + uint16_t m_lid; /* intra-node, ibscif HST lid != MICx lids */ + uint8_t m_gid[16]; struct dat_mix_dev_attr dev_attr; /* provided with mix_open */ int s_hd; int s_tl; diff --git a/doc/dat.conf b/doc/dat.conf index a0ab015..5602757 100644 --- a/doc/dat.conf +++ b/doc/dat.conf @@ -38,6 +38,7 @@ ofa-v2-mcm-1 u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "mlx4_0 1" "" ofa-v2-mcm-2 u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "mlx4_0 2" "" ofa-v2-scif0 u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "scif0 1" "" ofa-v2-scif0-u u2.0 nonthreadsafe default libdaploucm.so.2 dapl.2.0 "scif0 1" "" +ofa-v2-scif0-m u2.0 nonthreadsafe default libdaplomcm.so.2 dapl.2.0 "scif0 1" "" ofa-v2-mic0 u2.0 nonthreadsafe default libdaplofa.so.2 dapl.2.0 "mic0:ib 1" "" ofa-v2-mlx4_0-1s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "mlx4_0 1" "" ofa-v2-mlx4_0-2s u2.0 nonthreadsafe default libdaploscm.so.2 dapl.2.0 "mlx4_0 2" ""