#endif
done:
- /* set default IB MTU */
- hca_ptr->ib_trans.ib_cm.mtu = dapl_ib_mtu(2048);
-
return DAT_SUCCESS;
}
char *wr_buf_rx; /* mcm_wr_rx_t entries, devices without inline data */
struct ibv_mr *wr_buf_rx_mr;
#endif
+ uint8_t mtu; /* RC QP MTU, cm exchange, min(local,peer) */
};
#define DCM_CQ_TX 0x1
uint8_t sportx; /* extend to 24 bits */
uint8_t dportx; /* extend to 24 bits */
uint8_t rtns; /* retransmissions */
- uint8_t resv[2];
+ uint8_t mtu; /* MTU */
+ uint8_t resv[1];
union dcm_addr saddr;
union dcm_addr daddr;
union dcm_addr saddr_alt;
#define DCM_ACK_RETRY 7 /* 3 bits, 7 * 4.2 == 30 seconds */
#define DCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
#define DCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */
-#define DCM_IB_MTU 2048
+#define DCM_IB_MTU 4096 /* new default MTU size */
/* Global routing defaults */
#define DCM_GLOBAL 0 /* global routing is disabled */
uint32_t s_id; /* src pid */
uint32_t d_id; /* dst pid */
uint8_t rd_in; /* atomic_rd_in */
- uint8_t rsvd[4];
+ uint8_t mtu; /* mtu */
+ uint8_t rsvd[3];
uint8_t seg_sz; /* data segment size in power of 2 */
dat_mcm_addr_t saddr1; /* QPt local, MPXY or MCM on non-MIC node */
dat_mcm_addr_t saddr2; /* QPr local, MIC or MCM on non-MIC node or MPXY */
MIX_OP_ASYNC = 0x08,
MIX_OP_INLINE = 0x10,
MIX_OP_SET = 0x20,
+ MIX_OP_MTU = 0x40,
} dat_mix_op_flags_t;
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
- qp_attr.path_mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
qp_attr.min_rnr_timer = ia_ptr->hca_ptr->ib_trans.ib_cm.rnr_timer;
+ qp_attr.path_mtu = ep_ptr->qp_handle->mtu ?
+ ep_ptr->qp_handle->mtu :
+ ia_ptr->hca_ptr->ib_trans.ib_cm.mtu;
#ifdef _OPENIB_MCM_
qp_attr.max_dest_rd_atomic = ia_ptr->hca_ptr->ib_trans.ib_cm.rd_atom_in;
#else
qp_attr.max_dest_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_in;
#endif
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
- " port %d ep %p qp_state %d rd_atomic %d\n",
- qp_handle->qp_type, qp_handle->qp_num,
- ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
- ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic);
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
+ " port %d ep %p qp_state %d rd_atomic %d mtu %d lmtu %d\n",
+ qp_handle->qp_type, qp_handle->qp_num,
+ ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
+ ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic,
+ qp_attr.path_mtu, ia_ptr->hca_ptr->ib_trans.ib_cm.mtu);
/* address handle. RC and UD */
qp_attr.ah_attr.dlid = ntohs(lid);
case 4096:
return IBV_MTU_4096;
default:
- return IBV_MTU_1024;
+ return IBV_MTU_4096;
}
}
case IBV_MTU_4096:
return "4096";
default:
- return "1024";
+ return "4096";
}
}
dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
dapl_os_get_env_val("DAPL_WR_MAX", dev_attr.max_qp_wr));
+ /* MTU defaults to port active_mtu; DAPL_IB_MTU, if set, overrides it but is capped at active_mtu */
+ if (getenv("DAPL_IB_MTU"))
+ tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu,
+ dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)));
+ else
+ tp->ib_cm.mtu = port_attr.active_mtu;
+
#ifdef _OPENIB_MCM_
/* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */
dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
/* save key device attributes for CM exchange */
tp->ib_cm.rd_atom_in = dev_attr.max_qp_rd_atom;
tp->ib_cm.rd_atom_out = dev_attr.max_qp_init_rd_atom;
- tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu, tp->ib_cm.mtu);
tp->ib_cm.ack_timer = DAPL_MAX(dev_attr.local_ca_ack_delay, tp->ib_cm.ack_timer);
/* set provider/transport specific named attributes */
if (msg->seg_sz) /* set po2 seg_sz, if provided */
cm->msg.seg_sz = msg->seg_sz;
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep->qp_handle->mtu = msg->mtu ?
+ DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
cm->msg.d_id = msg->s_id;
dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
}
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d, port=%x psize=%d\n",
+ " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d,"
+ " port=%x psize=%d mtu=%d,%d\n",
ntohs(cm->msg.daddr1.lid), ntohl(cm->msg.daddr1.qpn),
ntohl(cm->msg.daddr2.qpn), cm->msg.daddr1.qp_type,
- ntohs(msg->sport), ntohs(msg->p_size));
+ ntohs(msg->sport), ntohs(msg->p_size),
+ cm->tp->ib_cm.mtu, cm->msg.mtu);
if (ntohs(msg->op) == MCM_REP)
event = IB_CME_CONNECTED;
/* Send RTU, no private data */
cm->msg.op = htons(MCM_RTU);
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
dapl_os_lock(&cm->lock);
cm->state = MCM_CONNECTED;
cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s\n",
+ " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+ cm->ep->qp_handle->mtu);
mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 0);
acm->msg.p_size = msg->p_size;
acm->msg.d_id = msg->s_id;
acm->msg.rd_in = msg->rd_in;
+ acm->msg.mtu = msg->mtu; /* save peer MTU */
if (msg->seg_sz) /* set po2 seg_sz, if provided */
acm->msg.seg_sz = msg->seg_sz;
dapls_cr_callback(cm, IB_CME_CONNECTED, NULL, 0, cm->sp);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s\n",
+ " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map),
+ cm->ep->qp_handle->mtu);
mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 1);
return;
ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in);
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep->qp_handle->mtu = cm->msg.mtu ?
+ DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QPr to RTR and then to RTS, QPr (qp) to remote QPt (daddr2), !xsocket */
dapl_os_lock(&ep->header.lock);
if (!MXF_EP(&cm->hca->ib_trans.addr)) {
/* setup local QPr info (if !KR) and type from EP, copy pdata, for reply */
cm->msg.op = htons(MCM_REP);
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
if (!MXF_EP(&cm->hca->ib_trans.addr)) {
cm->msg.saddr1.qpn = htonl(ep->qp_handle->qp->qp_num);
/* set max rdma inbound requests */
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = cm->tp->ib_cm.mtu; /* local MTU to peer */
if (p_size) {
cm->msg.p_size = htons(p_size);
msg.port = port;
strcpy((char*)&msg.name, name);
+ if (getenv("DAPL_IB_MTU"))
+ msg.hdr.flags |= MIX_OP_MTU;
+
/* send any overridden attributes to proxy */
msg.dev_attr.ack_timer = tp->ib_cm.ack_timer;
msg.dev_attr.ack_retry = tp->ib_cm.ack_retry;
/* REQ: QP info in msg.saddr, IA address in msg.daddr, and pdata */
cm_ptr->hca = ia_ptr->hca_ptr;
cm_ptr->msg.op = ntohs(DCM_REQ);
+ cm_ptr->msg.mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; /* local MTU to peer */
cm_ptr->msg.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
cm_ptr->msg.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
cm_ptr->msg.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out,
cm_ptr->msg.rd_in);
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+ DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QP to RTR and then to RTS with remote info */
dapl_os_lock(&ep_ptr->header.lock);
if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
dapl_os_unlock(&cm_ptr->lock);
cm_ptr->msg.op = ntohs(DCM_RTU);
+ cm_ptr->msg.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
if (send(cm_ptr->socket, (char *)&cm_ptr->msg, 4, 0) == -1) {
int err = dapl_socket_errno();
dapl_log(DAPL_DBG_TYPE_ERR,
DCM_MAX_PDATA_SIZE, ep_ptr);
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " SCM ACTIVE CONN: %x -> %s %x\n",
+ " SCM ACTIVE CONN: %x -> %s %x mtu %d\n",
ntohs(((struct sockaddr_in *) &cm_ptr->addr)->sin_port),
inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
- ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000);
+ ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000,
+ ep_ptr->qp_handle->mtu);
return;
bail:
if (ntohs(cm_ptr->msg.ver) < DCM_VER_XPS)
exp += SCM_BC_DIFF;
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ?
+ DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
+
/* modify QP to RTR and then to RTS with remote info already read */
dapl_os_lock(&ep_ptr->header.lock);
if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
local.ver = htons(DCM_VER);
local.op = htons(DCM_REP);
local.rd_in = ep_ptr->param.ep_attr.max_rdma_read_in;
+ local.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */
local.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num);
local.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type;
local.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid;
dapls_cr_callback(cm_ptr, event, NULL, 0, cm_ptr->sp);
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " SCM PASSIVE CONN: %x <- %s %x\n",
+ " SCM PASSIVE CONN: %x <- %s %x mtu %d\n",
cm_ptr->sp->conn_qual,
inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr),
- ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port));
+ ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port),
+ cm_ptr->ep->qp_handle->mtu);
return;
bail:
#define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
#define SCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */
#define SCM_CR_RETRY 5 /* retries for busy server, connect refused */
-#define SCM_IB_MTU 2048
/* Global routing defaults */
#define SCM_GLOBAL 0 /* global routing is disabled */
dapl_os_get_env_val("DAPL_HOP_LIMIT", SCM_HOP_LIMIT);
hca_ptr->ib_trans.ib_cm.tclass =
dapl_os_get_env_val("DAPL_TCLASS", SCM_TCLASS);
- hca_ptr->ib_trans.ib_cm.mtu =
- dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", SCM_IB_MTU));
if (flags & DAPL_OPEN_QUERY)
goto done;
lock = &tp->lock;
dapl_log(DAPL_DBG_TYPE_CM,
- " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x\n",
+ " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x (%d,%d)\n",
dapl_cm_op_str(msg_op),
ntohl(msg->d_id), ntohs(msg->daddr.ib.lid),
UCM_PORT_NTOH(msg->dportx, msg->dport),
ntohl(msg->daddr.ib.qpn), ntohl(msg->dqpn),
ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
UCM_PORT_NTOH(msg->sportx, msg->sport),
- ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn));
+ ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn),
+ tp->ib_cm.mtu, msg->mtu);
retry_listenq:
dapl_os_lock(lock);
{
dapl_log(DAPL_DBG_TYPE_EP,
" connect: lid %x i_qpn %x lport %x p_sz=%d -> "
- " lid %x c_qpn %x rport %x\n",
+ " lid %x c_qpn %x rport %x l_mtu %d\n",
htons(cm->msg.saddr.ib.lid), htonl(cm->msg.saddr.ib.qpn),
UCM_PORT_NTOH(cm->msg.sportx,cm->msg.sport),
htons(cm->msg.p_size),
htons(cm->msg.daddr.ib.lid), htonl(cm->msg.dqpn),
- UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport));
+ UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport),
+ cm->hca->ib_trans.ib_cm.mtu);
dapl_os_lock(&cm->lock);
if (cm->state != DCM_INIT && cm->state != DCM_REP_PENDING) {
cm->state = DCM_REP_PENDING;
cm->msg.op = htons(DCM_REQ);
+ cm->msg.mtu = cm->hca->ib_trans.ib_cm.mtu; /* local MTU to peer */
+
if (ucm_send(&cm->hca->ib_trans, &cm->msg,
&cm->msg.p_data, ntohs(cm->msg.p_size))) {
dapl_os_unlock(&cm->lock);
cm->ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(cm->ep->param.ep_attr.max_rdma_read_out,
cm->msg.rd_in);
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep->qp_handle->mtu = msg->mtu ?
+ DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
/* modify QP to RTR and then to RTS with remote info */
dapl_os_lock(&cm->ep->header.lock);
/* Send RTU, no private data */
cm->msg.op = htons(DCM_RTU);
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
dapl_os_lock(&cm->lock);
cm->state = DCM_CONNECTED;
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x xevent=%d\n",
+ " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr.ib.qpn),
ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn), sizeof(DAT_IB_EXTENSION_EVENT_DATA));
+ ntohl(cm->msg.dqpn), ep->qp_handle->mtu);
return;
bail:
if (ntohs(msg->op) != DCM_REJ_USER) {
acm->msg.p_size = msg->p_size;
acm->msg.d_id = msg->s_id;
acm->msg.rd_in = msg->rd_in;
+ acm->msg.mtu = msg->mtu; /* save peer MTU */
/* CR saddr is CM daddr info, need EP for local saddr */
dapl_os_memcpy(&acm->msg.daddr, &msg->saddr, sizeof(union dcm_addr));
dapl_log(DAPL_DBG_TYPE_CM,
" accepting: op %s [id lid, port, cqp, iqp]:"
- " %d %x %x %x %x <- %d %x %x %x %x\n",
+ " %d %x %x %x %x <- %d %x %x %x %x mtu %d\n",
dapl_cm_op_str(ntohs(msg->op)),
ntohl(acm->msg.s_id), ntohs(msg->daddr.ib.lid),
UCM_PORT_NTOH(msg->dportx, msg->dport),
ntohl(msg->dqpn), ntohl(msg->daddr.ib.qpn),
ntohl(msg->s_id), ntohs(msg->saddr.ib.lid),
UCM_PORT_NTOH(msg->sportx, msg->sport),
- ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn));
+ ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn),
+ acm->msg.mtu);
#ifdef DAT_EXTENSIONS
if (acm->msg.daddr.ib.qp_type == IBV_QPT_UD) {
}
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x\n",
+ " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x mtu %d\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid),
UCM_PORT_NTOH(cm->msg.sportx, cm->msg.sport),
ntohl(cm->msg.saddr.ib.qpn),
ntohs(cm->msg.daddr.ib.lid),
UCM_PORT_NTOH(cm->msg.dportx, cm->msg.dport),
- ntohl(cm->msg.dqpn));
+ ntohl(cm->msg.dqpn), cm->ep->qp_handle->mtu);
return;
bail:
dapl_log(DAPL_DBG_TYPE_CM_WARN,
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" ACCEPT_USR: s_id %d r_id %d lid=%x"
- " iqp=%x qp_type %d, psize=%d\n",
+ " iqp=%x qp_type %d, psize=%d r_mtu %d l_mtu %d\n",
ntohl(cm->msg.s_id), ntohl(cm->msg.d_id),
ntohs(cm->msg.daddr.ib.lid),
ntohl(cm->msg.daddr.ib.qpn), cm->msg.daddr.ib.qp_type,
- p_size);
+ p_size, cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu);
#ifdef DAT_EXTENSIONS
if (cm->msg.daddr.ib.qp_type == IBV_QPT_UD &&
ep->param.ep_attr.max_rdma_read_out =
DAPL_MIN(ep->param.ep_attr.max_rdma_read_out,
cm->msg.rd_in);
+ /* QP MTU: min(msg mtu, local mtu) if msg mtu is nonzero; else local mtu if DAPL_IB_MTU set, else 2K for compatibility */
+ ep->qp_handle->mtu = cm->msg.mtu ?
+ DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu):
+ getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048;
/* modify QP to RTR and then to RTS with remote info already read */
dapl_os_lock(&ep->header.lock);
/* setup local QP info and type from EP, copy pdata, for reply */
cm->msg.op = htons(DCM_REP);
cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in;
+ cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */
cm->msg.saddr.ib.qpn = htonl(ep->qp_handle->qp->qp_num);
cm->msg.saddr.ib.qp_type = ep->qp_handle->qp->qp_type;
cm->msg.saddr.ib.lid = cm->hca->ib_trans.addr.ib.lid;
dapl_os_get_env_val("DAPL_HOP_LIMIT", DCM_HOP_LIMIT);
hca_ptr->ib_trans.ib_cm.tclass =
dapl_os_get_env_val("DAPL_TCLASS", DCM_TCLASS);
- hca_ptr->ib_trans.ib_cm.mtu =
- dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU));
if (flags & DAPL_OPEN_QUERY)
goto done;
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
- qp_attr.path_mtu = m_qp->smd->md->dev_attr.mtu;
+ qp_attr.path_mtu = m_qp->mtu ?
+ min(m_qp->mtu, m_qp->smd->md->dev_attr.mtu):
+ m_qp->smd->md->dev_attr.mtu;
qp_attr.max_dest_rd_atomic = 16;
qp_attr.min_rnr_timer = m_qp->smd->md->dev_attr.rnr_timer;
qp_attr.ah_attr.dlid = ntohs(lid);
m_cm->state = MCM_REP_PENDING;
m_cm->msg.op = htons(MCM_REQ);
m_cm->timer = mcm_time_us(); /* reset reply timer */
+ m_cm->msg.mtu = m_cm->smd->md->dev_attr.mtu; /* local MTU to peer */
if (mcm_send(m_cm->md, &m_cm->msg, &m_cm->msg.p_data, ntohs(m_cm->msg.p_size)))
return -1;
MCNTR(m_cm->md, MCM_CM_RTU_OUT);
- mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s\n",
+ mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s mtu %d\n",
m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_OUT]:0,
m_cm, htons(m_cm->msg.saddr2.lid), htonl(m_cm->msg.saddr2.qpn),
htons(m_cm->msg.daddr1.lid),
MXF_EP(&m_cm->msg.saddr1) && MXF_EP(&m_cm->msg.daddr1) ?
htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
- htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map));
+ htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map),
+ m_cm->m_qp->mtu);
mpxy_lock(&m_cm->lock);
if (m_cm->state != MCM_REP_RCV) {
mlog(8, " SCIF client: device open client_pid 0x%x - mlen %d - ep %d\n",
ntohl(msg.hdr.req_id), len, op_ep);
- msg.hdr.flags = MIX_OP_RSP;
-
if (msg.hdr.ver < MIX_MIN || msg.hdr.ver > MIX_MAX || msg.hdr.op != MIX_IA_OPEN) {
mlog(0, " ERR: MIC client incompatible with MPXYD (exp %d,rcvd %d) or OP (exp %d,rcvd %d)\n",
DAT_MIX_VER, msg.hdr.ver, msg.hdr.op, MIX_IA_OPEN);
ntohs(m_cm->msg.daddr1.lid), ntohll(m_cm->msg.sys_guid));
/* send RTU on wire */
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
mcm_cm_rtu_out(m_cm);
return 0;
else
m_cm->m_qp->p2p_data = 0;
+ /* QP MTU: min(pkt mtu, device mtu) if pkt mtu is nonzero; else mtu_env if set, else 2K for compatibility */
+ m_cm->m_qp->mtu = pkt->mtu ?
+ min(pkt->mtu, m_cm->md->dev_attr.mtu):
+ m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* forward negotiated MTU */
+
mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
" WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
m_cm->m_qp, m_cm->m_qp->wrc.wr_addr, m_cm->m_qp->wrc.wr_rkey,
acm->msg.p_size = pkt->p_size;
acm->msg.d_id = pkt->s_id;
acm->msg.rd_in = pkt->rd_in;
+ acm->msg.mtu = pkt->mtu;
#ifdef MPXYD_LOCAL_SUPPORT
acm->msg.sys_guid = pkt->sys_guid; /* remote system guid */;
#else
memcpy(&acm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t));
memcpy(&acm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t));
- mlog(2, " [%d:%d] cm %p ep %d sPORT %x %s <- dPORT %x lid=%x psz=%d %s %s %Lx (msg %p %d)\n",
+ mlog(2, " [%d:%d] cm %p ep %d: %x %s <- %x lid=%x psz=%d %s %s %Lx (%p %d) lmtu %d rmtu %d\n",
cm->md->mc->scif_id, cm->smd->entry.tid, acm, acm->smd->scif_ev_ep,
ntohs(acm->msg.sport), mcm_map_str(acm->md->addr.ep_map),
ntohs(acm->msg.dport), ntohs(acm->msg.daddr1.lid), htons(acm->msg.p_size),
mcm_map_str(acm->msg.daddr2.ep_map),
acm->md->addr.lid == acm->msg.daddr1.lid ? "platform":"fabric",
- ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t));
+ ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t),
+ cm->md->dev_attr.mtu, pkt->mtu);
if (pkt->p_size)
memcpy(acm->msg.p_data, pkt->p_data, ntohs(pkt->p_size));
dat_mix_cm_t msg;
int len;
- mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s\n",
+ mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s mtu %d\n",
m_cm->md->mc->scif_id, m_cm->smd->entry.tid,
m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_IN]:0,
m_cm, htons(pkt->daddr1.lid),
htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn),
htons(pkt->dport), system_guid, mcm_map_str(pkt->daddr1.ep_map),
htons(pkt->saddr2.lid), htonl(pkt->saddr2.qpn),
- htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map));
+ htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map),
+ m_cm->m_qp->mtu);
/* MXF_EP <- HST_EP, host sends WC on RTU, save WRC info */
if (MXF_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2)) {
m_cm->msg.sys_guid = rand();
#endif
+ /* QP MTU: min(msg mtu, device mtu) if msg mtu is nonzero; else mtu_env if set, else 2K for compatibility */
+ m_cm->m_qp->mtu = m_cm->msg.mtu ?
+ min(m_cm->msg.mtu, m_cm->md->dev_attr.mtu):
+ m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048;
+
if (qp) {
if (mcm_modify_qp(qp, IBV_QPS_RTR, dqpn, dlid, dgid))
goto err;
goto err;
}
- /* send RTU on wire, monitor for retries */
+ /* send REP on wire, monitor for retries */
m_cm->state = MCM_RTU_PENDING;
+ m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */
mpxy_unlock(&m_cm->lock);
mcm_cm_rep_out(m_cm);
return 0;
msg->dev_addr.lid = md->m_lid;
memcpy(msg->dev_addr.gid, md->m_gid, 16);
}
+
+ /* client set MIX_OP_MTU (DAPL_IB_MTU override): record device MTU as mtu_env */
+ if (msg->hdr.flags & MIX_OP_MTU)
+ md->mtu_env = md->dev_attr.mtu;
err:
if (!smd) {
mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port);
}
/* send back response */
+ msg->hdr.flags = MIX_OP_RSP;
ret = scif_send_msg(op_ep, (void*)msg, sizeof(dat_mix_open_t));
if (ret) {
mlog(0, " ERR: scif_send dev_id %d op_ep %d, closing device %p\n",
goto bail;
}
- mlog(1, " MIC client: mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d lid %x %s\n",
+ mlog(1, " MIC client: mdev[%d] %p->%p mic%d[%d] -> %s[%d] port %d lid %x %s mtu %d (%d)\n",
md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name,
- md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map));
+ md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map),
+ md->dev_attr.mtu, md->mtu_env);
bail:
mpxy_unlock(&mc->oplock);
mpxy_unlock(&mc->cmlock);
int numa_node;
int indata;
void *cntrs;
+ uint8_t mtu_env;
} mcm_ib_dev_t;
int sr_len; /* SR WR buffer pool len */
int sr_sz; /* SR WR entry size */
int post_sr;
+ uint8_t mtu; /* negotiated QP MTU */
#ifdef MCM_PROFILE
mcm_qp_prof_t ts;
uint32_t last_wr_sig;