From: Arlin Davis
Date: Tue, 9 Dec 2014 23:35:59 +0000 (-0800)
Subject: ucm: add time wait override capability for CM services
X-Git-Tag: dapl-2.1.3~3
X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=0ae79c2236cc3b80f52cb4a3b2a38a40f1cba8b7;p=~ardavis%2Fdapl.git

ucm: add time wait override capability for CM services

New environment variable DAPL_UCM_WAIT_TIME (ms) to override
the default wait_time for CM services. Default setting is 60 seconds.

Signed-off-by: Arlin Davis
---

diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h
index c1b9267..d5b26ec 100644
--- a/dapl/openib_common/dapl_ib_common.h
+++ b/dapl/openib_common/dapl_ib_common.h
@@ -225,6 +225,7 @@ typedef uint16_t ib_hca_port_t;
 #define DCM_RETRY_CNT 10
 #define DCM_REP_TIME 800 /* reply timeout in m_secs */
 #define DCM_RTU_TIME 800 /* rtu timeout in m_secs */
+#define DCM_WAIT_TIME 60000 /* wait timeout in m_secs */
 #define DCM_QP_SIZE 500 /* uCM tx, rx qp size */
 #define DCM_CQ_SIZE 500 /* uCM cq size */
 #define DCM_TX_BURST 50 /* uCM signal, every TX burst msgs posted */
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 141086d..04d5eac 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -231,38 +231,26 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int *timer)
 		*timer = cm->hca->ib_trans.cm_timer;
 		if ((time - cm->timer)/1000 >
 		    (cm->hca->ib_trans.rtu_time << cm->retries)) {
-			dapl_log(DAPL_DBG_TYPE_CM,
-				 " CM_TIMEWAIT %d %p [lid, port, cqp, iqp]:"
-				 " %x %x %x %x -> %x %x %x %x r_pid %x"
-				 " Time(ms) %d > %d\n",
-				 cm->retries+1, cm,
-				 ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
-				 ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
-				 ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
-				 ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
-				 ntohl(cm->msg.d_id),
-				 (time - cm->timer)/1000,
-				 cm->hca->ib_trans.rtu_time << cm->retries);
 			cm->retries++;
-		}
-		if (cm->retries > 2) {
-			dapl_log(DAPL_DBG_TYPE_CM_WARN,
-				 " CM_TIMEWAIT EXPIRED %d %p [lid, port, cqp, iqp]:"
-				 " %x %x %x %x -> %x %x %x %x r_pid %x"
-				 " Time(ms) %d > %d\n",
-				 cm->retries+1, cm,
-				 ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
-				 ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
-				 ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
-				 ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
-				 ntohl(cm->msg.d_id),
-				 (time - cm->timer)/1000,
-				 cm->hca->ib_trans.rtu_time << cm->retries);
-			cm->ah = NULL; /* consumer will free AH */
-			cm->state = DCM_FREE;
-			dapl_os_unlock(&cm->lock);
-			dapl_ep_unlink_cm(cm->ep, cm); /* last CM ref */
-			return;
+			if ((time - cm->timer)/1000 > cm->hca->ib_trans.wait_time) {
+				dapl_log(DAPL_DBG_TYPE_CM_WARN,
+					 " CM_TIMEWAIT EXPIRED %d %p [lid, port, cqp, iqp]:"
+					 " %x %x %x %x -> %x %x %x %x r_pid %x"
+					 " Time(ms) %d > %d\n",
+					 cm->retries+1, cm,
+					 ntohs(cm->msg.saddr.ib.lid), ntohs(cm->msg.sport),
+					 ntohl(cm->msg.sqpn), ntohl(cm->msg.saddr.ib.qpn),
+					 ntohs(cm->msg.daddr.ib.lid), ntohs(cm->msg.dport),
+					 ntohl(cm->msg.dqpn), ntohl(cm->msg.daddr.ib.qpn),
+					 ntohl(cm->msg.d_id),
+					 (time - cm->timer)/1000,
+					 cm->hca->ib_trans.wait_time);
+				cm->ah = NULL; /* consumer will free AH */
+				cm->state = DCM_FREE;
+				dapl_os_unlock(&cm->lock);
+				dapl_ep_unlink_cm(cm->ep, cm); /* last CM ref */
+				return;
+			}
 		}
 		break;
 
@@ -737,7 +725,7 @@ void dapls_cm_release(dp_ib_cm_handle_t cm)
 	dapl_os_lock(&cm->lock);
 	cm->ref_count--;
 	if (cm->ref_count) {
-		if (cm->ref_count == 1)
+		if ((cm->ref_count == 1) && (cm->list_entry.list_head))
 			dapl_os_wait_object_wakeup(&cm->f_event);
 		dapl_os_unlock(&cm->lock);
 		return;
diff --git a/dapl/openib_ucm/dapl_ib_util.h b/dapl/openib_ucm/dapl_ib_util.h
index 69d61a4..a5b9c52 100644
--- a/dapl/openib_ucm/dapl_ib_util.h
+++ b/dapl/openib_ucm/dapl_ib_util.h
@@ -101,6 +101,7 @@ typedef struct _ib_hca_transport
 	int cm_timer;
 	int rep_time;
 	int rtu_time;
+	int wait_time;
 	DAPL_OS_LOCK slock;
 	int s_hd;
 	int s_tl;
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index 75d7306..79796cc 100644
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -504,12 +504,11 @@ static int ucm_service_create(IN DAPL_HCA *hca)
 	int hlen = sizeof(struct ibv_grh); /* hdr included with UD recv */
 	char *rbuf;
 
-	dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " ucm_create: \n");
-
 	/* setup CM timers and queue sizes */
 	tp->retries = dapl_os_get_env_val("DAPL_UCM_RETRY", DCM_RETRY_CNT);
 	tp->rep_time = dapl_os_get_env_val("DAPL_UCM_REP_TIME", DCM_REP_TIME);
 	tp->rtu_time = dapl_os_get_env_val("DAPL_UCM_RTU_TIME", DCM_RTU_TIME);
+	tp->wait_time = dapl_os_get_env_val("DAPL_UCM_WAIT_TIME", DCM_WAIT_TIME);
 	tp->cm_timer = DAPL_MIN(tp->rep_time,tp->rtu_time);
 	tp->qpe = dapl_os_get_env_val("DAPL_UCM_QP_SIZE", DCM_QP_SIZE);
 	tp->cqe = dapl_os_get_env_val("DAPL_UCM_CQ_SIZE", DCM_CQ_SIZE);
@@ -519,8 +518,10 @@ static int ucm_service_create(IN DAPL_HCA *hca)
 		goto bail;
 
 	dapl_log(DAPL_DBG_TYPE_UTIL,
-		 " create_service: pd %p ctx %p handle 0x%x\n",
-		 tp->pd, tp->pd->context, tp->pd->handle);
+		 " UCM: CM service - pd %p ctx %p "
+		 " Timers(ms): req %d rtu %d wait %d\n",
+		 tp->pd, tp->pd->context, tp->rep_time,
+		 tp->rtu_time, tp->wait_time);
 
 	tp->rch = ibv_create_comp_channel(hca->ib_hca_handle);
 	if (!tp->rch)