result of commit:
ab67173b8024e14009c266d76ab9ec0bdd0c5d1f
The new MCM provider on the MIC side needs to open in compat mode
with the MTU set to 2048. It needs to allow the proxy, if new, to
adjust to the active MTU. If an old proxy is on the host side, 2048
is returned as normal and the new MCM provider remains in
compat mode with the MTU at 2048.
The new proxy on the host side needs to support an old version of
the MCM provider and adjust the MTU only if the MIC side changes the
dev_attr.mtu setting. It will bump up to active_MTU
only if the MCM provider is new and sets the MIX_OP_SET
bit on the MIC-to-host proxy device open call.
The proxy open device call MUST set new dev attributes in the client SMD
device object and not in the shared MD device object, since
there can be multiple clients with different attribute
settings from the MIC side.
The MCM provider MUST query and set up the MTU in open instead of in query,
so subsequent queries don't override the negotiated setting.
Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
dapl_os_get_env_val("DAPL_WR_MAX", dev_attr.max_qp_wr));
- /* MTU to active by default, reset if env set and <= active_mtu */
- if (getenv("DAPL_IB_MTU"))
- tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu,
- dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)));
- else
- tp->ib_cm.mtu = port_attr.active_mtu;
-
#ifdef _OPENIB_MCM_
/* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */
dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
dapl_os_get_env_val("DAPL_MCM_WR_MAX", DAT_MIX_WR_MAX));
port_attr.max_msg_sz = DAPL_MIN(port_attr.max_msg_sz,
dapl_os_get_env_val("DAPL_MCM_MSG_MAX", DAT_MIX_RDMA_MAX));
+#else
+ /* MTU to active by default, reset if env set and <= active_mtu */
+ if (getenv("DAPL_IB_MTU"))
+ tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu,
+ dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)));
+ else
+ tp->ib_cm.mtu = port_attr.active_mtu;
#endif
if (ia_attr != NULL) {
goto err2;
}
+ /* Set MTU here, don't set in query; MCM, needs to sync with proxy on MIC */
+ if (getenv("DAPL_IB_MTU")) {
+ hca_ptr->ib_trans.ib_cm.mtu =
+ DAPL_MIN(port_attr.active_mtu,
+ dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU",
+ DCM_IB_MTU)));
+ } else {
+ hca_ptr->ib_trans.ib_cm.mtu = port_attr.active_mtu;
+ }
+
if (dapli_mix_open(&hca_ptr->ib_trans, hca_name,
hca_ptr->port_num, flags & DAPL_OPEN_QUERY)) {
dapl_log(DAPL_DBG_TYPE_ERR,
msg.port = port;
strcpy((char*)&msg.name, name);
- if (getenv("DAPL_IB_MTU"))
+ if (getenv("DAPL_IB_MTU")) {
msg.hdr.flags |= MIX_OP_MTU;
+ msg.dev_attr.mtu = tp->ib_cm.mtu; /* set to env value */
+ } else {
+ msg.hdr.flags |= MIX_OP_SET; /* ok for proxy to set MTU */
+ msg.dev_attr.mtu = IBV_MTU_2048; /* compat mode */
+ }
/* send any overridden attributes to proxy */
msg.dev_attr.ack_timer = tp->ib_cm.ack_timer;
msg.dev_attr.hop_limit = tp->ib_cm.hop_limit;
msg.dev_attr.tclass = tp->ib_cm.tclass;
msg.dev_attr.sl = tp->ib_cm.sl;
- msg.dev_attr.mtu = tp->ib_cm.mtu;
msg.dev_attr.rd_atom_in = tp->ib_cm.rd_atom_in;
msg.dev_attr.rd_atom_out = tp->ib_cm.rd_atom_out;
msg.dev_attr.pkey_idx = tp->ib_cm.pkey_idx;
tp->ib_cm.pkey_idx = msg.dev_attr.pkey_idx;
tp->ib_cm.pkey = msg.dev_attr.pkey;
tp->ib_cm.max_inline = msg.dev_attr.max_inline;
+ tp->ib_cm.mtu = msg.dev_attr.mtu; /* proxy sets active_MTU mode */
tp->dev_id = msg.hdr.req_id;
if (MFO_EP(&tp->addr))
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
qp_attr.path_mtu = m_qp->mtu ?
- min(m_qp->mtu, m_qp->smd->md->dev_attr.mtu):
- m_qp->smd->md->dev_attr.mtu;
+ min(m_qp->mtu, m_qp->smd->dev_attr.mtu):
+ m_qp->smd->dev_attr.mtu;
qp_attr.max_dest_rd_atomic = 16;
qp_attr.min_rnr_timer = m_qp->smd->md->dev_attr.rnr_timer;
qp_attr.ah_attr.dlid = ntohs(lid);
m_cm->state = MCM_REP_PENDING;
m_cm->msg.op = htons(MCM_REQ);
m_cm->timer = mcm_time_us(); /* reset reply timer */
- m_cm->msg.mtu = m_cm->smd->md->dev_attr.mtu; /* local MTU to peer */
+ m_cm->msg.mtu = m_cm->smd->dev_attr.mtu; /* local MTU to peer */
if (mcm_send(m_cm->md, &m_cm->msg, &m_cm->msg.p_data, ntohs(m_cm->msg.p_size)))
return -1;
static void mix_get_prov_attr(mcm_scif_dev_t *smd, dat_mix_prov_attr_t *pr_attr)
{
memset(pr_attr, 0, sizeof(dat_mix_prov_attr_t));
- memcpy(&pr_attr->dev_attr, &smd->md->dev_attr, sizeof(dat_mix_dev_attr_t));
+ memcpy(&pr_attr->dev_attr, &smd->dev_attr, sizeof(dat_mix_dev_attr_t));
mcm_get_attr(pr_attr); /* CM attributes */
pr_attr->max_msg_sz = mix_max_msg_mb * 1024 * 1024;
/* Set QP MTU, if negotiated. 2K for compatibility */
m_cm->m_qp->mtu = pkt->mtu ?
- min(pkt->mtu, m_cm->md->dev_attr.mtu):
- m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+ min(pkt->mtu, m_cm->smd->dev_attr.mtu):
+ m_cm->smd->mtu_env ? m_cm->smd->mtu_env : IBV_MTU_2048;
m_cm->msg.mtu = m_cm->m_qp->mtu; /* forward negotiated MTU */
mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
mcm_map_str(acm->msg.daddr2.ep_map),
acm->md->addr.lid == acm->msg.daddr1.lid ? "platform":"fabric",
ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t),
- cm->md->dev_attr.mtu, pkt->mtu);
+ cm->smd->dev_attr.mtu, pkt->mtu);
if (pkt->p_size)
memcpy(acm->msg.p_data, pkt->p_data, ntohs(pkt->p_size));
/* Set QP MTU, if negotiated. 2K for compatibility */
m_cm->m_qp->mtu = m_cm->msg.mtu ?
- min(m_cm->msg.mtu, m_cm->md->dev_attr.mtu):
- m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+ min(m_cm->msg.mtu, smd->dev_attr.mtu):
+ smd->mtu_env ? smd->mtu_env : IBV_MTU_2048;
if (qp) {
if (mcm_modify_qp(qp, IBV_QPS_RTR, dqpn, dlid, dgid))
ibctx = NULL;
goto bail;
}
-
+ md->dev_attr.mtu = port_attr.active_mtu;
md->dev_attr.rd_atom_in = device_attr.max_qp_rd_atom;
md->dev_attr.rd_atom_out = device_attr.max_qp_init_rd_atom;
md->ibdev = iblist[i];
msg->hdr.status = MIX_SUCCESS;
msg->dev_attr.rd_atom_in = md->dev_attr.rd_atom_in;
msg->dev_attr.rd_atom_out = md->dev_attr.rd_atom_out;
+
+ /* MIC side changed MTU via DAPL_IB_MTU, cover new and old clients */
+ if ((msg->hdr.flags & MIX_OP_MTU) ||
+ (msg->dev_attr.mtu != IBV_MTU_2048)) {
+ smd->mtu_env = msg->dev_attr.mtu;
+ smd->dev_attr.mtu = msg->dev_attr.mtu; /* set new MTU per MIC */
+ } else if (msg->hdr.flags & MIX_OP_SET) {
+ smd->dev_attr.mtu = md->dev_attr.mtu; /* MIC set to active_MTU */
+ } else {
+ smd->dev_attr.mtu = IBV_MTU_2048; /* run compat_mode */
+ }
+ msg->dev_attr.mtu = smd->dev_attr.mtu; /* return MTU settings */
+
if (!(mcm_ib_inline_data(md->ibctx)) || !mcm_ib_inline)
msg->dev_attr.max_inline = 0;
- memcpy(&md->dev_attr, &msg->dev_attr, sizeof(dat_mix_dev_attr_t));
+ memcpy(&smd->dev_attr, &msg->dev_attr, sizeof(dat_mix_dev_attr_t)); /* save to smd */
memcpy(&msg->dev_addr, &md->addr, sizeof(dat_mcm_addr_t)); /* proxy CM lid */
/* intra-node: restore MIC lid, gid */
msg->dev_addr.lid = md->m_lid;
memcpy(msg->dev_addr.gid, md->m_gid, 16);
}
-
- /* MTU changed via DAPL_IB_MTU */
- if (msg->hdr.flags & MIX_OP_MTU)
- md->mtu_env = md->dev_attr.mtu;
err:
if (!smd) {
mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port);
mlog(1, " MIC client: mdev[%d] %p->%p mic%d[%d] -> %s[%d] port %d lid %x %s mtu %d (%d)\n",
md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name,
md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map),
- md->dev_attr.mtu, md->mtu_env);
+ smd ? smd->dev_attr.mtu:md->dev_attr.mtu,
+ smd ? smd->mtu_env:0);
bail:
mpxy_unlock(&mc->oplock);
mpxy_unlock(&mc->cmlock);
int numa_node;
int indata;
void *cntrs;
- uint8_t mtu_env;
} mcm_ib_dev_t;
int m_buf_hd_r; /* Proxy Buffer WC queue hd */
int m_buf_end_r; /* Proxy Buffer WC queue end */
char *cmd_buf; /* operation command buffer */
+ struct dat_mix_dev_attr dev_attr; /* Manage attributes per MIC client open */
+ uint8_t mtu_env; /* mtu override with DAPL_IB_MTU */
#ifdef MCM_PROFILE
uint16_t m_hd_ro; /* HD,TL tracking */
uint16_t m_tl_ro;