From: Arlin Davis Date: Wed, 10 Feb 2016 22:45:12 +0000 (-0800) Subject: openib_common: set providers mtu to active_mtu instead of 2048 X-Git-Tag: dapl-2.1.9-1~18 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=ab67173b8024e14009c266d76ab9ec0bdd0c5d1f;p=~ardavis%2Fdapl.git openib_common: set providers mtu to active_mtu instead of 2048 Better out of the box performance when setting mtu to active_mtu instead of default settings of 2K. The new mtu settings are applied on a per QP basis and negotiated via CM mtu 8-bit field. One of the reserved 8 bit CM message fields is used to ensure compatibility with older versions. If older endpoints are mixed with newer versions it will fall back to the pre-existing 2K MTU settings, unless overridden by DAPL_IB_MTU. The change has been made across all providers including ucm, scm, mcm, and cma (rdma_cm). The mcm provider on a MIC will notify the CCL Proxy service of a DAPL_IB_MTU override via a new MIX_OP_FLAGS bit MIX_OP_MTU during the open call. 
Signed-off-by: Arlin Davis --- diff --git a/dapl/openib_cma/device.c b/dapl/openib_cma/device.c index 9e87355..ff6c174 100644 --- a/dapl/openib_cma/device.c +++ b/dapl/openib_cma/device.c @@ -394,9 +394,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name, #endif done: - /* set default IB MTU */ - hca_ptr->ib_trans.ib_cm.mtu = dapl_ib_mtu(2048); - return DAT_SUCCESS; } diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h index 69ec31b..4006fd7 100644 --- a/dapl/openib_common/dapl_ib_common.h +++ b/dapl/openib_common/dapl_ib_common.h @@ -71,6 +71,7 @@ struct dcm_ib_qp { char *wr_buf_rx; /* mcm_wr_rx_t entries, devices without inline data */ struct ibv_mr *wr_buf_rx_mr; #endif + uint8_t mtu; /* RC QP MTU, cm exchange, min(local,peer) */ }; #define DCM_CQ_TX 0x1 @@ -150,7 +151,8 @@ typedef struct _ib_cm_msg uint8_t sportx; /* extend to 24 bits */ uint8_t dportx; /* extend to 24 bits */ uint8_t rtns; /* retransmissions */ - uint8_t resv[2]; + uint8_t mtu; /* MTU */ + uint8_t resv[1]; union dcm_addr saddr; union dcm_addr daddr; union dcm_addr saddr_alt; @@ -243,7 +245,7 @@ typedef uint16_t ib_hca_port_t; #define DCM_ACK_RETRY 7 /* 3 bits, 7 * 4.2 == 30 seconds */ #define DCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */ #define DCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */ -#define DCM_IB_MTU 2048 +#define DCM_IB_MTU 4096 /* new default MTU size */ /* Global routing defaults */ #define DCM_GLOBAL 0 /* global routing is disabled */ diff --git a/dapl/openib_common/dapl_mic_common.h b/dapl/openib_common/dapl_mic_common.h index 86a815e..0231013 100755 --- a/dapl/openib_common/dapl_mic_common.h +++ b/dapl/openib_common/dapl_mic_common.h @@ -234,7 +234,8 @@ typedef struct dat_mcm_msg uint32_t s_id; /* src pid */ uint32_t d_id; /* dst pid */ uint8_t rd_in; /* atomic_rd_in */ - uint8_t rsvd[4]; + uint8_t mtu; /* mtu */ + uint8_t rsvd[3]; uint8_t seg_sz; /* data segment size in power of 2 */ dat_mcm_addr_t saddr1; /* QPt local, 
MPXY or MCM on non-MIC node */ dat_mcm_addr_t saddr2; /* QPr local, MIC or MCM on non-MIC node or MPXY */ @@ -369,6 +370,7 @@ typedef enum dat_mix_op_flags MIX_OP_ASYNC = 0x08, MIX_OP_INLINE = 0x10, MIX_OP_SET = 0x20, + MIX_OP_MTU = 0x40, } dat_mix_op_flags_t; diff --git a/dapl/openib_common/qp.c b/dapl/openib_common/qp.c index 01f91ca..3d622ab 100644 --- a/dapl/openib_common/qp.c +++ b/dapl/openib_common/qp.c @@ -648,19 +648,22 @@ dapls_modify_qp_state(IN struct ibv_qp *qp_handle, qp_attr.dest_qp_num = ntohl(qpn); qp_attr.rq_psn = 1; - qp_attr.path_mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; qp_attr.min_rnr_timer = ia_ptr->hca_ptr->ib_trans.ib_cm.rnr_timer; + qp_attr.path_mtu = ep_ptr->qp_handle->mtu ? + ep_ptr->qp_handle->mtu : + ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; #ifdef _OPENIB_MCM_ qp_attr.max_dest_rd_atomic = ia_ptr->hca_ptr->ib_trans.ib_cm.rd_atom_in; #else qp_attr.max_dest_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_in; #endif - dapl_dbg_log(DAPL_DBG_TYPE_EP, - " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x" - " port %d ep %p qp_state %d rd_atomic %d\n", - qp_handle->qp_type, qp_handle->qp_num, - ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num, - ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic); + dapl_log(DAPL_DBG_TYPE_EP, + " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x" + " port %d ep %p qp_state %d rd_atomic %d mtu %d lmtu %d\n", + qp_handle->qp_type, qp_handle->qp_num, + ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num, + ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic, + qp_attr.path_mtu, ia_ptr->hca_ptr->ib_trans.ib_cm.mtu); /* address handle. 
RC and UD */ qp_attr.ah_attr.dlid = ntohs(lid); diff --git a/dapl/openib_common/util.c b/dapl/openib_common/util.c index 55bda3b..d54d0a8 100644 --- a/dapl/openib_common/util.c +++ b/dapl/openib_common/util.c @@ -285,7 +285,7 @@ enum ibv_mtu dapl_ib_mtu(int mtu) case 4096: return IBV_MTU_4096; default: - return IBV_MTU_1024; + return IBV_MTU_4096; } } @@ -303,7 +303,7 @@ const char *dapl_ib_mtu_str(enum ibv_mtu mtu) case IBV_MTU_4096: return "4096"; default: - return "1024"; + return "4096"; } } @@ -424,6 +424,13 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr, dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr, dapl_os_get_env_val("DAPL_WR_MAX", dev_attr.max_qp_wr)); + /* MTU to active by default, reset if env set and <= active_mtu */ + if (getenv("DAPL_IB_MTU")) + tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu, + dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU))); + else + tp->ib_cm.mtu = port_attr.active_mtu; + #ifdef _OPENIB_MCM_ /* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */ dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX); @@ -497,7 +504,6 @@ DAT_RETURN dapls_ib_query_hca(IN DAPL_HCA * hca_ptr, /* save key device attributes for CM exchange */ tp->ib_cm.rd_atom_in = dev_attr.max_qp_rd_atom; tp->ib_cm.rd_atom_out = dev_attr.max_qp_init_rd_atom; - tp->ib_cm.mtu = DAPL_MIN(port_attr.active_mtu, tp->ib_cm.mtu); tp->ib_cm.ack_timer = DAPL_MAX(dev_attr.local_ca_ack_delay, tp->ib_cm.ack_timer); /* set provider/transport specific named attributes */ diff --git a/dapl/openib_mcm/cm.c b/dapl/openib_mcm/cm.c index f2a4b8d..48ff0b3 100644 --- a/dapl/openib_mcm/cm.c +++ b/dapl/openib_mcm/cm.c @@ -1104,6 +1104,11 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg) if (msg->seg_sz) /* set po2 seg_sz, if provided */ cm->msg.seg_sz = msg->seg_sz; + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep->qp_handle->mtu = msg->mtu ? 
+ DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; + cm->msg.d_id = msg->s_id; dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t)); dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t)); @@ -1129,10 +1134,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg) } dapl_dbg_log(DAPL_DBG_TYPE_CM, - " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d, port=%x psize=%d\n", + " CONN_RTU: DST lid=%x, QPr=%x, QPt=%x qp_type=%d," + " port=%x psize=%d mtu=%d,%d\n", ntohs(cm->msg.daddr1.lid), ntohl(cm->msg.daddr1.qpn), ntohl(cm->msg.daddr2.qpn), cm->msg.daddr1.qp_type, - ntohs(msg->sport), ntohs(msg->p_size)); + ntohs(msg->sport), ntohs(msg->p_size), + cm->tp->ib_cm.mtu, cm->msg.mtu); if (ntohs(msg->op) == MCM_REP) event = IB_CME_CONNECTED; @@ -1227,6 +1234,7 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg) /* Send RTU, no private data */ cm->msg.op = htons(MCM_RTU); + cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */ dapl_os_lock(&cm->lock); cm->state = MCM_CONNECTED; @@ -1249,11 +1257,12 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg) cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep); dapl_log(DAPL_DBG_TYPE_CM_EST, - " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s\n", + " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s mtu %d\n", cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid), ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn), ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport), - ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map)); + ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map), + cm->ep->qp_handle->mtu); mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 0); @@ -1291,6 +1300,7 @@ static void mcm_accept(ib_cm_srvc_handle_t cm, dat_mcm_msg_t *msg) acm->msg.p_size = msg->p_size; acm->msg.d_id = msg->s_id; acm->msg.rd_in = msg->rd_in; + acm->msg.mtu = msg->mtu; /* save 
peer MTU */ if (msg->seg_sz) /* set po2 seg_sz, if provided */ acm->msg.seg_sz = msg->seg_sz; @@ -1359,11 +1369,12 @@ static void mcm_accept_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg) dapls_cr_callback(cm, IB_CME_CONNECTED, NULL, 0, cm->sp); dapl_log(DAPL_DBG_TYPE_CM_EST, - " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s\n", + " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s mtu %d\n", cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid), ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn), ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport), - ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map)); + ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map), + cm->ep->qp_handle->mtu); mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 1); return; @@ -1489,6 +1500,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data) ep->param.ep_attr.max_rdma_read_out = DAPL_MIN(ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in); + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep->qp_handle->mtu = cm->msg.mtu ? + DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? 
cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; + /* modify QPr to RTR and then to RTS, QPr (qp) to remote QPt (daddr2), !xsocket */ dapl_os_lock(&ep->header.lock); if (!MXF_EP(&cm->hca->ib_trans.addr)) { @@ -1567,6 +1583,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data) /* setup local QPr info (if !KR) and type from EP, copy pdata, for reply */ cm->msg.op = htons(MCM_REP); cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in; + cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */ if (!MXF_EP(&cm->hca->ib_trans.addr)) { cm->msg.saddr1.qpn = htonl(ep->qp_handle->qp->qp_num); @@ -1680,6 +1697,7 @@ dapls_ib_connect(IN DAT_EP_HANDLE ep_handle, /* set max rdma inbound requests */ cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in; + cm->msg.mtu = cm->tp->ib_cm.mtu; /* local MTU to peer */ if (p_size) { cm->msg.p_size = htons(p_size); diff --git a/dapl/openib_mcm/mix.c b/dapl/openib_mcm/mix.c index 59ef7e5..5d96eb5 100644 --- a/dapl/openib_mcm/mix.c +++ b/dapl/openib_mcm/mix.c @@ -171,6 +171,9 @@ int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query_only) msg.port = port; strcpy((char*)&msg.name, name); + if (getenv("DAPL_IB_MTU")) + msg.hdr.flags |= MIX_OP_MTU; + /* send any overridden attributes to proxy */ msg.dev_attr.ack_timer = tp->ib_cm.ack_timer; msg.dev_attr.ack_retry = tp->ib_cm.ack_retry; diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c index 6ee99f1..35164ef 100644 --- a/dapl/openib_scm/cm.c +++ b/dapl/openib_scm/cm.c @@ -671,6 +671,7 @@ dapli_socket_connect(DAPL_EP * ep_ptr, /* REQ: QP info in msg.saddr, IA address in msg.daddr, and pdata */ cm_ptr->hca = ia_ptr->hca_ptr; cm_ptr->msg.op = ntohs(DCM_REQ); + cm_ptr->msg.mtu = ia_ptr->hca_ptr->ib_trans.ib_cm.mtu; /* local MTU to peer */ cm_ptr->msg.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num); cm_ptr->msg.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type; cm_ptr->msg.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid; @@ -848,6 +849,11 @@ static 
void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr) DAPL_MIN(ep_ptr->param.ep_attr.max_rdma_read_out, cm_ptr->msg.rd_in); + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ? + DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; + /* modify QP to RTR and then to RTS with remote info */ dapl_os_lock(&ep_ptr->header.lock); if (dapls_modify_qp_state(ep_ptr->qp_handle->qp, @@ -895,6 +901,7 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr) dapl_os_unlock(&cm_ptr->lock); cm_ptr->msg.op = ntohs(DCM_RTU); + cm_ptr->msg.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */ if (send(cm_ptr->socket, (char *)&cm_ptr->msg, 4, 0) == -1) { int err = dapl_socket_errno(); dapl_log(DAPL_DBG_TYPE_ERR, @@ -968,10 +975,11 @@ ud_bail: DCM_MAX_PDATA_SIZE, ep_ptr); } dapl_log(DAPL_DBG_TYPE_CM_EST, - " SCM ACTIVE CONN: %x -> %s %x\n", + " SCM ACTIVE CONN: %x -> %s %x mtu %d\n", ntohs(((struct sockaddr_in *) &cm_ptr->addr)->sin_port), inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr), - ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000); + ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)-1000, + ep_ptr->qp_handle->mtu); return; bail: @@ -1274,6 +1282,11 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr, if (ntohs(cm_ptr->msg.ver) < DCM_VER_XPS) exp += SCM_BC_DIFF; + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep_ptr->qp_handle->mtu = cm_ptr->msg.mtu ? + DAPL_MIN(cm_ptr->msg.mtu, cm_ptr->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? 
cm_ptr->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; + /* modify QP to RTR and then to RTS with remote info already read */ dapl_os_lock(&ep_ptr->header.lock); if (dapls_modify_qp_state(ep_ptr->qp_handle->qp, @@ -1313,6 +1326,7 @@ dapli_socket_accept_usr(DAPL_EP * ep_ptr, local.ver = htons(DCM_VER); local.op = htons(DCM_REP); local.rd_in = ep_ptr->param.ep_attr.max_rdma_read_in; + local.mtu = ep_ptr->qp_handle->mtu; /* send negotiated MTU */ local.saddr.ib.qpn = htonl(ep_ptr->qp_handle->qp->qp_num); local.saddr.ib.qp_type = ep_ptr->qp_handle->qp->qp_type; local.saddr.ib.lid = ia_ptr->hca_ptr->ib_trans.lid; @@ -1458,10 +1472,11 @@ ud_bail: dapls_cr_callback(cm_ptr, event, NULL, 0, cm_ptr->sp); } dapl_log(DAPL_DBG_TYPE_CM_EST, - " SCM PASSIVE CONN: %x <- %s %x\n", + " SCM PASSIVE CONN: %x <- %s %x mtu %d\n", cm_ptr->sp->conn_qual, inet_ntoa(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_addr), - ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port)); + ntohs(((struct sockaddr_in *) &cm_ptr->msg.daddr.so)->sin_port), + cm_ptr->ep->qp_handle->mtu); return; bail: diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h index b03018b..ad5bc60 100644 --- a/dapl/openib_scm/dapl_ib_util.h +++ b/dapl/openib_scm/dapl_ib_util.h @@ -65,7 +65,6 @@ typedef dp_ib_cm_handle_t ib_cm_srvc_handle_t; #define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */ #define SCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */ #define SCM_CR_RETRY 5 /* retries for busy server, connect refused */ -#define SCM_IB_MTU 2048 /* Global routing defaults */ #define SCM_GLOBAL 0 /* global routing is disabled */ diff --git a/dapl/openib_scm/device.c b/dapl/openib_scm/device.c index 43f9eaf..b210a15 100644 --- a/dapl/openib_scm/device.c +++ b/dapl/openib_scm/device.c @@ -371,8 +371,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name, dapl_os_get_env_val("DAPL_HOP_LIMIT", SCM_HOP_LIMIT); hca_ptr->ib_trans.ib_cm.tclass = dapl_os_get_env_val("DAPL_TCLASS", SCM_TCLASS); 
- hca_ptr->ib_trans.ib_cm.mtu = - dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", SCM_IB_MTU)); if (flags & DAPL_OPEN_QUERY) goto done; diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c index 3d06c82..88dd890 100644 --- a/dapl/openib_ucm/cm.c +++ b/dapl/openib_ucm/cm.c @@ -622,14 +622,15 @@ dp_ib_cm_handle_t ucm_cm_find(ib_hca_transport_t *tp, ib_cm_msg_t *msg) lock = &tp->lock; dapl_log(DAPL_DBG_TYPE_CM, - " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x\n", + " ucm_recv: %s %d %x %x i %x c %x < %d %x %x i %x c %x (%d,%d)\n", dapl_cm_op_str(msg_op), ntohl(msg->d_id), ntohs(msg->daddr.ib.lid), UCM_PORT_NTOH(msg->dportx, msg->dport), ntohl(msg->daddr.ib.qpn), ntohl(msg->dqpn), ntohl(msg->s_id), ntohs(msg->saddr.ib.lid), UCM_PORT_NTOH(msg->sportx, msg->sport), - ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn)); + ntohl(msg->saddr.ib.qpn), ntohl(msg->sqpn), + tp->ib_cm.mtu, msg->mtu); retry_listenq: dapl_os_lock(lock); @@ -1467,12 +1468,13 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm) { dapl_log(DAPL_DBG_TYPE_EP, " connect: lid %x i_qpn %x lport %x p_sz=%d -> " - " lid %x c_qpn %x rport %x\n", + " lid %x c_qpn %x rport %x l_mtu %d\n", htons(cm->msg.saddr.ib.lid), htonl(cm->msg.saddr.ib.qpn), UCM_PORT_NTOH(cm->msg.sportx,cm->msg.sport), htons(cm->msg.p_size), htons(cm->msg.daddr.ib.lid), htonl(cm->msg.dqpn), - UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport)); + UCM_PORT_NTOH(cm->msg.dportx,cm->msg.dport), + cm->hca->ib_trans.ib_cm.mtu); dapl_os_lock(&cm->lock); if (cm->state != DCM_INIT && cm->state != DCM_REP_PENDING) { @@ -1513,6 +1515,8 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm) cm->state = DCM_REP_PENDING; cm->msg.op = htons(DCM_REQ); + cm->msg.mtu = cm->hca->ib_trans.ib_cm.mtu; /* local MTU to peer */ + if (ucm_send(&cm->hca->ib_trans, &cm->msg, &cm->msg.p_data, ntohs(cm->msg.p_size))) { dapl_os_unlock(&cm->lock); @@ -1638,6 +1642,10 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg) 
cm->ep->param.ep_attr.max_rdma_read_out = DAPL_MIN(cm->ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in); + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep->qp_handle->mtu = msg->mtu ? + DAPL_MIN(msg->mtu, cm->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; /* modify QP to RTR and then to RTS with remote info */ dapl_os_lock(&cm->ep->header.lock); @@ -1671,6 +1679,7 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg) /* Send RTU, no private data */ cm->msg.op = htons(DCM_RTU); + cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */ dapl_os_lock(&cm->lock); cm->state = DCM_CONNECTED; @@ -1760,11 +1769,11 @@ ud_bail: } dapl_log(DAPL_DBG_TYPE_CM_EST, - " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x xevent=%d\n", + " UCM_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x mtu %d\n", cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport), ntohl(cm->msg.saddr.ib.qpn), ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport), - ntohl(cm->msg.dqpn), sizeof(DAT_IB_EXTENSION_EVENT_DATA)); + ntohl(cm->msg.dqpn), ep->qp_handle->mtu); return; bail: if (ntohs(msg->op) != DCM_REJ_USER) { @@ -1812,6 +1821,7 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg) acm->msg.p_size = msg->p_size; acm->msg.d_id = msg->s_id; acm->msg.rd_in = msg->rd_in; + acm->msg.mtu = msg->mtu; /* save peer MTU */ /* CR saddr is CM daddr info, need EP for local saddr */ dapl_os_memcpy(&acm->msg.daddr, &msg->saddr, sizeof(union dcm_addr)); @@ -1832,14 +1842,15 @@ static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg) dapl_log(DAPL_DBG_TYPE_CM, " accepting: op %s [id lid, port, cqp, iqp]:" - " %d %x %x %x %x <- %d %x %x %x %x\n", + " %d %x %x %x %x <- %d %x %x %x %x mtu %d\n", dapl_cm_op_str(ntohs(msg->op)), ntohl(acm->msg.s_id), ntohs(msg->daddr.ib.lid), UCM_PORT_NTOH(msg->dportx, msg->dport), ntohl(msg->dqpn), ntohl(msg->daddr.ib.qpn), ntohl(msg->s_id), 
ntohs(msg->saddr.ib.lid), UCM_PORT_NTOH(msg->sportx, msg->sport), - ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn)); + ntohl(msg->sqpn), ntohl(msg->saddr.ib.qpn), + acm->msg.mtu); #ifdef DAT_EXTENSIONS if (acm->msg.daddr.ib.qp_type == IBV_QPT_UD) { @@ -1958,13 +1969,13 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg) } dapl_log(DAPL_DBG_TYPE_CM_EST, - " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x\n", + " UCM_PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x mtu %d\n", cm->hca, cm->retries, ntohs(cm->msg.saddr.ib.lid), UCM_PORT_NTOH(cm->msg.sportx, cm->msg.sport), ntohl(cm->msg.saddr.ib.qpn), ntohs(cm->msg.daddr.ib.lid), UCM_PORT_NTOH(cm->msg.dportx, cm->msg.dport), - ntohl(cm->msg.dqpn)); + ntohl(cm->msg.dqpn), cm->ep->qp_handle->mtu); return; bail: dapl_log(DAPL_DBG_TYPE_CM_WARN, @@ -2090,11 +2101,11 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data) dapl_dbg_log(DAPL_DBG_TYPE_CM, " ACCEPT_USR: s_id %d r_id %d lid=%x" - " iqp=%x qp_type %d, psize=%d\n", + " iqp=%x qp_type %d, psize=%d r_mtu %d l_mtu %d\n", ntohl(cm->msg.s_id), ntohl(cm->msg.d_id), ntohs(cm->msg.daddr.ib.lid), ntohl(cm->msg.daddr.ib.qpn), cm->msg.daddr.ib.qp_type, - p_size); + p_size, cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu); #ifdef DAT_EXTENSIONS if (cm->msg.daddr.ib.qp_type == IBV_QPT_UD && @@ -2110,6 +2121,10 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data) ep->param.ep_attr.max_rdma_read_out = DAPL_MIN(ep->param.ep_attr.max_rdma_read_out, cm->msg.rd_in); + /* Set QP MTU, if negotiated. 2K for compatibility */ + ep->qp_handle->mtu = cm->msg.mtu ? + DAPL_MIN(cm->msg.mtu, cm->hca->ib_trans.ib_cm.mtu): + getenv("DAPL_IB_MTU") ? 
cm->hca->ib_trans.ib_cm.mtu : IBV_MTU_2048; /* modify QP to RTR and then to RTS with remote info already read */ dapl_os_lock(&ep->header.lock); @@ -2146,6 +2161,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data) /* setup local QP info and type from EP, copy pdata, for reply */ cm->msg.op = htons(DCM_REP); cm->msg.rd_in = ep->param.ep_attr.max_rdma_read_in; + cm->msg.mtu = ep->qp_handle->mtu; /* send negotiated MTU */ cm->msg.saddr.ib.qpn = htonl(ep->qp_handle->qp->qp_num); cm->msg.saddr.ib.qp_type = ep->qp_handle->qp->qp_type; cm->msg.saddr.ib.lid = cm->hca->ib_trans.addr.ib.lid; diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c index f23c77b..71fee5f 100644 --- a/dapl/openib_ucm/device.c +++ b/dapl/openib_ucm/device.c @@ -292,8 +292,6 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name, dapl_os_get_env_val("DAPL_HOP_LIMIT", DCM_HOP_LIMIT); hca_ptr->ib_trans.ib_cm.tclass = dapl_os_get_env_val("DAPL_TCLASS", DCM_TCLASS); - hca_ptr->ib_trans.ib_cm.mtu = - dapl_ib_mtu(dapl_os_get_env_val("DAPL_IB_MTU", DCM_IB_MTU)); if (flags & DAPL_OPEN_QUERY) goto done; diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c index 4b91090..7be40b8 100644 --- a/dapl/svc/mcm.c +++ b/dapl/svc/mcm.c @@ -346,7 +346,9 @@ int mcm_modify_qp(struct ibv_qp *qp_handle, qp_attr.dest_qp_num = ntohl(qpn); qp_attr.rq_psn = 1; - qp_attr.path_mtu = m_qp->smd->md->dev_attr.mtu; + qp_attr.path_mtu = m_qp->mtu ? 
+ min(m_qp->mtu, m_qp->smd->md->dev_attr.mtu): + m_qp->smd->md->dev_attr.mtu; qp_attr.max_dest_rd_atomic = 16; qp_attr.min_rnr_timer = m_qp->smd->md->dev_attr.rnr_timer; qp_attr.ah_attr.dlid = ntohs(lid); @@ -1491,6 +1493,7 @@ int mcm_cm_req_out(mcm_cm_t *m_cm) m_cm->state = MCM_REP_PENDING; m_cm->msg.op = htons(MCM_REQ); m_cm->timer = mcm_time_us(); /* reset reply timer */ + m_cm->msg.mtu = m_cm->smd->md->dev_attr.mtu; /* local MTU to peer */ if (mcm_send(m_cm->md, &m_cm->msg, &m_cm->msg.p_data, ntohs(m_cm->msg.p_size))) return -1; @@ -1508,7 +1511,7 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm) MCNTR(m_cm->md, MCM_CM_RTU_OUT); - mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s\n", + mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s -> 0x%x %x 0x%x %Lx %s mtu %d\n", m_cm->md->mc->scif_id, m_cm->smd->entry.tid, m_cm->md->cntrs ? (uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_OUT]:0, m_cm, htons(m_cm->msg.saddr2.lid), htonl(m_cm->msg.saddr2.qpn), @@ -1516,7 +1519,8 @@ int mcm_cm_rtu_out(mcm_cm_t *m_cm) htons(m_cm->msg.daddr1.lid), MXF_EP(&m_cm->msg.saddr1) && MXF_EP(&m_cm->msg.daddr1) ? 
htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn), - htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map)); + htons(m_cm->msg.dport), ntohll(r_guid), mcm_map_str(m_cm->msg.daddr1.ep_map), + m_cm->m_qp->mtu); mpxy_lock(&m_cm->lock); if (m_cm->state != MCM_REP_RCV) { diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c index 741ca7c..8e4e622 100644 --- a/dapl/svc/mix.c +++ b/dapl/svc/mix.c @@ -186,8 +186,6 @@ void mix_scif_accept(scif_epd_t listen_ep) mlog(8, " SCIF client: device open client_pid 0x%x - mlen %d - ep %d\n", ntohl(msg.hdr.req_id), len, op_ep); - msg.hdr.flags = MIX_OP_RSP; - if (msg.hdr.ver < MIX_MIN || msg.hdr.ver > MIX_MAX || msg.hdr.op != MIX_IA_OPEN) { mlog(0, " ERR: MIC client incompatible with MPXYD (exp %d,rcvd %d) or OP (exp %d,rcvd %d)\n", DAT_MIX_VER, msg.hdr.ver, msg.hdr.op, MIX_IA_OPEN); @@ -1537,6 +1535,7 @@ static int mix_cm_rtu_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc ntohs(m_cm->msg.daddr1.lid), ntohll(m_cm->msg.sys_guid)); /* send RTU on wire */ + m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */ mcm_cm_rtu_out(m_cm); return 0; @@ -1641,6 +1640,12 @@ int mix_cm_rep_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len) else m_cm->m_qp->p2p_data = 0; + /* Set QP MTU, if negotiated. 2K for compatibility */ + m_cm->m_qp->mtu = pkt->mtu ? + min(pkt->mtu, m_cm->md->dev_attr.mtu): + m_cm->md->mtu_env ? 
m_cm->md->mtu_env : IBV_MTU_2048; + m_cm->msg.mtu = m_cm->m_qp->mtu; /* forward negotiated MTU */ + mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d" " WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n", m_cm->m_qp, m_cm->m_qp->wrc.wr_addr, m_cm->m_qp->wrc.wr_rkey, @@ -1797,6 +1802,7 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len) acm->msg.p_size = pkt->p_size; acm->msg.d_id = pkt->s_id; acm->msg.rd_in = pkt->rd_in; + acm->msg.mtu = pkt->mtu; #ifdef MPXYD_LOCAL_SUPPORT acm->msg.sys_guid = pkt->sys_guid; /* remote system guid */; #else @@ -1808,13 +1814,14 @@ int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len) memcpy(&acm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t)); memcpy(&acm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t)); - mlog(2, " [%d:%d] cm %p ep %d sPORT %x %s <- dPORT %x lid=%x psz=%d %s %s %Lx (msg %p %d)\n", + mlog(2, " [%d:%d] cm %p ep %d: %x %s <- %x lid=%x psz=%d %s %s %Lx (%p %d) lmtu %d rmtu %d\n", cm->md->mc->scif_id, cm->smd->entry.tid, acm, acm->smd->scif_ev_ep, ntohs(acm->msg.sport), mcm_map_str(acm->md->addr.ep_map), ntohs(acm->msg.dport), ntohs(acm->msg.daddr1.lid), htons(acm->msg.p_size), mcm_map_str(acm->msg.daddr2.ep_map), acm->md->addr.lid == acm->msg.daddr1.lid ? "platform":"fabric", - ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t)); + ntohll(acm->msg.sys_guid), &msg, sizeof(dat_mcm_msg_t), + cm->md->dev_attr.mtu, pkt->mtu); if (pkt->p_size) memcpy(acm->msg.p_data, pkt->p_data, ntohs(pkt->p_size)); @@ -1849,7 +1856,7 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len) dat_mix_cm_t msg; int len; - mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s\n", + mlog(1, "[%d:%d] CONN_EST[%d]: %p 0x%x %x 0x%x %Lx %s <- 0x%x %x 0x%x %Lx %s mtu %d\n", m_cm->md->mc->scif_id, m_cm->smd->entry.tid, m_cm->md->cntrs ? 
(uint32_t)((uint64_t *)m_cm->md->cntrs)[MCM_CM_RTU_IN]:0, m_cm, htons(pkt->daddr1.lid), @@ -1857,7 +1864,8 @@ int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len) htonl(m_cm->msg.daddr2.qpn):htonl(m_cm->msg.daddr1.qpn), htons(pkt->dport), system_guid, mcm_map_str(pkt->daddr1.ep_map), htons(pkt->saddr2.lid), htonl(pkt->saddr2.qpn), - htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map)); + htons(pkt->sport), ntohll(pkt->sys_guid), mcm_map_str(pkt->saddr2.ep_map), + m_cm->m_qp->mtu); /* MXF_EP <- HST_EP, host sends WC on RTU, save WRC info */ if (MXF_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2)) { @@ -2099,6 +2107,11 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc m_cm->msg.sys_guid = rand(); #endif + /* Set QP MTU, if negotiated. 2K for compatibility */ + m_cm->m_qp->mtu = m_cm->msg.mtu ? + min(m_cm->msg.mtu, m_cm->md->dev_attr.mtu): + m_cm->md->mtu_env ? m_cm->md->mtu_env : IBV_MTU_2048; + if (qp) { if (mcm_modify_qp(qp, IBV_QPS_RTR, dqpn, dlid, dgid)) goto err; @@ -2114,8 +2127,9 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc goto err; } - /* send RTU on wire, monitor for retries */ + /* send REP on wire, monitor for retries */ m_cm->state = MCM_RTU_PENDING; + m_cm->msg.mtu = m_cm->m_qp->mtu; /* send negotiated MTU */ mpxy_unlock(&m_cm->lock); mcm_cm_rep_out(m_cm); return 0; diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c index 4269ff9..c9029c6 100644 --- a/dapl/svc/mpxyd.c +++ b/dapl/svc/mpxyd.c @@ -799,6 +799,10 @@ found: msg->dev_addr.lid = md->m_lid; memcpy(msg->dev_addr.gid, md->m_gid, 16); } + + /* MTU changed via DAPL_IB_MTU */ + if (msg->hdr.flags & MIX_OP_MTU) + md->mtu_env = md->dev_attr.mtu; err: if (!smd) { mlog(1, " WARN: open failed for %s - %d\n", msg->name, msg->port); @@ -806,6 +810,7 @@ err: } /* send back response */ + msg->hdr.flags = MIX_OP_RSP; ret = scif_send_msg(op_ep, (void*)msg, sizeof(dat_mix_open_t)); if (ret) { mlog(0, " ERR: 
scif_send dev_id %d op_ep %d, closing device %p\n", @@ -817,9 +822,10 @@ err: goto bail; } - mlog(1, " MIC client: mdev[%d] %p smd %p mic%d[%d] -> %s[%d] port %d lid %x %s\n", + mlog(1, " MIC client: mdev[%d] %p->%p mic%d[%d] -> %s[%d] port %d lid %x %s mtu %d (%d)\n", md->smd_list.tid, md, smd, mc->scif_id-1, mc->numa_node, msg->name, - md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map)); + md->numa_node, msg->port, ntohs(msg->dev_addr.lid), mcm_map_str(md->addr.ep_map), + md->dev_attr.mtu, md->mtu_env); bail: mpxy_unlock(&mc->oplock); mpxy_unlock(&mc->cmlock); diff --git a/dapl/svc/mpxyd.h b/dapl/svc/mpxyd.h index 8addaa2..ec31cc0 100644 --- a/dapl/svc/mpxyd.h +++ b/dapl/svc/mpxyd.h @@ -132,6 +132,7 @@ typedef struct mcm_ib_dev { int numa_node; int indata; void *cntrs; + uint8_t mtu_env; } mcm_ib_dev_t; @@ -244,6 +245,7 @@ typedef struct mcm_qp { int sr_len; /* SR WR buffer pool len */ int sr_sz; /* SR WR entry size */ int post_sr; + uint8_t mtu; /* negotiated QP MTU */ #ifdef MCM_PROFILE mcm_qp_prof_t ts; uint32_t last_wr_sig;