From 78ae5ca6bc5ca16772d70e4c4e7b76982efac0fa Mon Sep 17 00:00:00 2001 From: ardavis Date: Fri, 24 Sep 2010 10:47:30 -0700 Subject: [PATCH] ibal: delay QP transition during disconnect phase ibal provider calls ib_cm_drep in response to receiving a dreq. The result is that the user's QP is transitioned through the error state, which fails any outstanding send operations and flushes all receives. The disconnect request is then reported to the user. Since a user can receive errors from the QP before they are aware of a pending disconnect request, the application may respond to the errors as, well, actual errors. Fix this by delaying the QP transition until the user responds to the dreq. This fixes an error with Intel MPI running over the ibal dapl provider with a 'spawn' test. Signed-off-by: Sean Hefty --- dapl/ibal/dapl_ibal_cm.c | 212 +++++++++++++++++-------------------- dapl/ibal/dapl_ibal_util.h | 8 ++ dapl/include/dapl.h | 4 - 3 files changed, 105 insertions(+), 119 deletions(-) diff --git a/dapl/ibal/dapl_ibal_cm.c b/dapl/ibal/dapl_ibal_cm.c index e3c12ff..37e158b 100644 --- a/dapl/ibal/dapl_ibal_cm.c +++ b/dapl/ibal/dapl_ibal_cm.c @@ -46,6 +46,9 @@ int g_dapl_loopback_connection = 0; extern dapl_ibal_root_t dapl_ibal_root; +static void dapli_ib_cm_drep_cb ( + IN ib_cm_drep_rec_t *p_cm_drep_rec ); + /* * Prototypes */ @@ -192,6 +195,28 @@ dapli_ib_cm_lap_cb ( "--> DiCLcb: CM callback LAP (Load Alternate Path)\n"); } +static DAT_RETURN dapli_send_ib_cm_dreq(ib_cm_handle_t h_cm) +{ + ib_cm_dreq_t cm_dreq; + + dapl_os_memzero(&cm_dreq, sizeof(ib_cm_dreq_t)); + + cm_dreq.flags = IB_FLAGS_SYNC; + cm_dreq.qp_type = IB_QPT_RELIABLE_CONN; + cm_dreq.h_qp = h_cm.h_qp; + cm_dreq.pfn_cm_drep_cb = dapli_ib_cm_drep_cb; + + return dapl_ib_status_convert(ib_cm_dreq(&cm_dreq)); +} + +static DAT_RETURN dapli_send_ib_cm_drep(ib_cm_handle_t h_cm) +{ + ib_cm_drep_t cm_drep; + + dapl_os_memzero(&cm_drep, sizeof(ib_cm_drep_t)); + return dapl_ib_status_convert(ib_cm_drep(h_cm, &cm_drep)); +} + /* * Connection Disconnect Request callback * We received a DREQ, return a DREP (disconnect reply). @@ -201,9 +226,7 @@ static void dapli_ib_cm_dreq_cb ( IN ib_cm_dreq_rec_t *p_cm_dreq_rec ) { - ib_cm_drep_t cm_drep; DAPL_EP *ep_ptr; - int bail=10; dp_ib_cm_handle_t cm_ptr; dapl_os_assert (p_cm_dreq_rec); @@ -213,25 +236,25 @@ dapli_ib_cm_dreq_cb ( { dapl_dbg_log (DAPL_DBG_TYPE_ERR, "--> %s: BAD_PTR EP %lx\n", __FUNCTION__, ep_ptr); - return; + goto send_drep; } if ( ep_ptr->header.magic != DAPL_MAGIC_EP ) { if ( ep_ptr->header.magic == DAPL_MAGIC_INVALID ) - return; + goto send_drep; dapl_dbg_log (DAPL_DBG_TYPE_ERR, "--> %s: EP %p BAD_EP_MAGIC %x != wanted %x\n", __FUNCTION__, ep_ptr, ep_ptr->header.magic, DAPL_MAGIC_EP ); - return; + goto send_drep; } cm_ptr = dapl_get_cm_from_ep(ep_ptr); if (!cm_ptr) { dapl_dbg_log (DAPL_DBG_TYPE_ERR, "--> %s: !CM_PTR on EP %p\n", __FUNCTION__, ep_ptr); - return; + goto send_drep; } dapl_os_assert(cm_ptr->ib_cm.h_qp == p_cm_dreq_rec->h_cm_dreq.h_qp); @@ -242,45 +265,34 @@ dapli_ib_cm_dreq_cb ( (ep_ptr->sent_discreq == DAT_TRUE ? "TRUE":"FALSE")); dapl_os_lock (&ep_ptr->header.lock); - if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED - /*|| ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECT_PENDING - && ep_ptr->sent_discreq == DAT_TRUE)*/ ) + if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED ) { - dapl_os_unlock (&ep_ptr->header.lock); dapl_dbg_log (DAPL_DBG_TYPE_CM, "--> DiCDcb: EP %lx QP %lx already Disconnected\n", ep_ptr, ep_ptr->qp_handle); - return; + goto unlock; } ep_ptr->param.ep_state = DAT_EP_STATE_DISCONNECT_PENDING; - ep_ptr->recv_discreq = DAT_TRUE; - dapl_os_unlock (&ep_ptr->header.lock); - - dapl_os_memzero (&cm_drep, sizeof(ib_cm_drep_t)); - /* Could fail if we received reply from other side, no need to retry */ - /* Wait for any send ops in process holding reference */ - while (dapls_cb_pending(&ep_ptr->req_buffer) && bail-- > 0 ) + if (cm_ptr->state == IBAL_CM_CONNECT) { - dapl_dbg_log (DAPL_DBG_TYPE_CM, - "--> DiCDcb: WAIT for EP=%lx req_count(%d) != 0\n", - ep_ptr, dapls_cb_pending(&ep_ptr->req_buffer)); - dapl_os_sleep_usec (100); + cm_ptr->state = IBAL_CM_DREQ; + cm_ptr->ib_cm = p_cm_dreq_rec->h_cm_dreq; } + else + { + dapli_send_ib_cm_drep(p_cm_dreq_rec->h_cm_dreq); + } + dapl_os_unlock (&ep_ptr->header.lock); - ib_cm_drep (p_cm_dreq_rec->h_cm_dreq, &cm_drep); - - /* CM puts QP in reset state */ - ep_ptr->qp_state = IB_QPS_RESET; - if (ep_ptr->cr_ptr) { /* passive side */ dapls_cr_callback ( cm_ptr, IB_CME_DISCONNECTED, (void * __ptr64) p_cm_dreq_rec->p_dreq_pdata, - IB_DREQ_PDATA_SIZE, + IB_DREQ_PDATA_SIZE, (void *) (((DAPL_CR *) ep_ptr->cr_ptr)->sp_ptr) ); } else @@ -291,9 +303,15 @@ dapli_ib_cm_dreq_cb ( IB_CME_DISCONNECTED, (void * __ptr64) p_cm_dreq_rec->p_dreq_pdata, - IB_DREQ_PDATA_SIZE, + IB_DREQ_PDATA_SIZE, p_cm_dreq_rec->qp_context ); } + return; + +unlock: + dapl_os_unlock (&ep_ptr->header.lock); +send_drep: + dapli_send_ib_cm_drep(p_cm_dreq_rec->h_cm_dreq); } /* @@ -335,9 +353,8 @@ dapli_ib_cm_drep_cb ( dapl_os_assert(cm_ptr->ib_cm.h_qp == p_cm_drep_rec->h_qp); dapl_dbg_log (DAPL_DBG_TYPE_CM, - "--> DiCDpcb: EP %p state %s cm_hdl %p\n",ep_ptr, - dapl_get_ep_state_str(ep_ptr->param.ep_state), - cm_ptr); + "--> DiCDpcb: EP %p state %s cm_hdl %p\n",ep_ptr, + dapl_get_ep_state_str(ep_ptr->param.ep_state), cm_ptr); if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED ) { @@ -353,7 +370,7 @@ dapli_ib_cm_drep_cb ( dapls_cr_callback ( cm_ptr, IB_CME_DISCONNECTED, (void * __ptr64) p_cm_drep_rec->p_drep_pdata, - IB_DREP_PDATA_SIZE, + IB_DREP_PDATA_SIZE, (void *) (((DAPL_CR *) ep_ptr->cr_ptr)->sp_ptr) ); } else @@ -363,7 +380,7 @@ dapli_ib_cm_drep_cb ( cm_ptr, IB_CME_DISCONNECTED, (void * __ptr64) p_cm_drep_rec->p_drep_pdata, - IB_DREP_PDATA_SIZE, + IB_DREP_PDATA_SIZE, p_cm_drep_rec->qp_context ); } } @@ -1101,88 +1118,59 @@ DAT_RETURN dapls_ib_disconnect ( IN DAPL_EP *ep_ptr, IN DAT_CLOSE_FLAGS disconnect_flags ) { - ib_api_status_t ib_status = IB_SUCCESS; - ib_cm_dreq_t cm_dreq; - dp_ib_cm_handle_t cm_ptr; + DAT_RETURN dat_status = DAT_SUCCESS; + ib_cm_dreq_t cm_dreq; + dp_ib_cm_handle_t cm_ptr; + ib_cm_handle_t h_cm; + enum dapl_ibal_cm_state state; - dapl_os_assert(ep_ptr); + dapl_os_assert(ep_ptr); - if ( DAPL_BAD_HANDLE(ep_ptr, DAPL_MAGIC_EP) ) - { - dapl_dbg_log (DAPL_DBG_TYPE_CM, - "--> %s: BAD EP Magic EP=%lx\n", __FUNCTION__,ep_ptr); - return DAT_SUCCESS; - } - cm_ptr = dapl_get_cm_from_ep(ep_ptr); - if (!cm_ptr) - { - dapl_dbg_log (DAPL_DBG_TYPE_ERR, - "--> %s: !CM_PTR on EP %p\n", __FUNCTION__, ep_ptr); - return DAT_SUCCESS; - } - - dapl_dbg_log (DAPL_DBG_TYPE_CM, - "--> %s() EP %p %s rx_drq %d tx_drq %d Close %s\n", __FUNCTION__, - ep_ptr, dapl_get_ep_state_str(ep_ptr->param.ep_state), - ep_ptr->recv_discreq, ep_ptr->sent_discreq, - (disconnect_flags == DAT_CLOSE_ABRUPT_FLAG ? "Abrupt":"Graceful")); - - if ( disconnect_flags == DAT_CLOSE_ABRUPT_FLAG ) - { - if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED ) - return DAT_SUCCESS; - - if ( ep_ptr->param.ep_state != DAT_EP_STATE_DISCONNECT_PENDING ) - { - dapl_dbg_log(DAPL_DBG_TYPE_CM, - "%s() calling legacy_post_disconnect()\n",__FUNCTION__); - dapl_ep_legacy_post_disconnect(ep_ptr, disconnect_flags); - return DAT_SUCCESS; + if ( DAPL_BAD_HANDLE(ep_ptr, DAPL_MAGIC_EP) ) + { + dapl_dbg_log (DAPL_DBG_TYPE_CM, + "--> %s: BAD EP Magic EP=%lx\n", __FUNCTION__,ep_ptr); + return DAT_SUCCESS; } - } - - dapl_os_memzero(&cm_dreq, sizeof(ib_cm_dreq_t)); - cm_dreq.qp_type = IB_QPT_RELIABLE_CONN; - cm_dreq.h_qp = ep_ptr->qp_handle; - cm_dreq.pfn_cm_drep_cb = dapli_ib_cm_drep_cb; - - /* - * Currently we do not send any disconnect private data to - * the other endpoint because DAT 2.0 does not support it. - */ - cm_dreq.p_dreq_pdata = NULL; - cm_dreq.flags = IB_FLAGS_SYNC; - - /* - * still need to send DREQ (disconnect request)? - */ - if ( (ep_ptr->recv_discreq == DAT_FALSE) - && (ep_ptr->sent_discreq == DAT_FALSE) - && (ep_ptr->qp_state != IB_QPS_RESET) ) - { - ep_ptr->sent_discreq = DAT_TRUE; - ib_status = ib_cm_dreq ( &cm_dreq ); - - if ( ib_status == IB_SUCCESS ) - dapl_dbg_log (DAPL_DBG_TYPE_CM, - "--> DsD: EP %p DREQ SENT\n", ep_ptr); - - /* tolerate INVALID_STATE error as the other side can race ahead and - * generate a DREQ before we do. - */ - if ( ib_status == IB_INVALID_STATE || ib_status == IB_INVALID_HANDLE ) + cm_ptr = dapl_get_cm_from_ep(ep_ptr); + if (!cm_ptr) { - ib_status = IB_SUCCESS; + dapl_dbg_log (DAPL_DBG_TYPE_ERR, + "--> %s: !CM_PTR on EP %p\n", __FUNCTION__, ep_ptr); + return DAT_SUCCESS; } - else if (ib_status) + + dapl_dbg_log (DAPL_DBG_TYPE_CM, + "--> %s() EP %p %s Close %s\n", __FUNCTION__, + ep_ptr, dapl_get_ep_state_str(ep_ptr->param.ep_state), + (disconnect_flags == DAT_CLOSE_ABRUPT_FLAG ? "Abrupt":"Graceful")); + + //if ( disconnect_flags == DAT_CLOSE_ABRUPT_FLAG ) + //{ + // dapl_dbg_log(DAPL_DBG_TYPE_CM, + // "%s() calling legacy_post_disconnect()\n",__FUNCTION__); + // dapl_ep_legacy_post_disconnect(ep_ptr, disconnect_flags); + //} + + dapl_os_lock(&ep_ptr->header.lock); + if (ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED) { - dapl_dbg_log(DAPL_DBG_TYPE_ERR, - "%s() EP %p ib_cm_dreq() status %s\n", - __FUNCTION__,ep_ptr,ib_get_err_str(ib_status)); + dapl_os_unlock(&ep_ptr->header.lock); + return dat_status; } - } - return ib_status; + + h_cm = cm_ptr->ib_cm; + state = cm_ptr->state; + cm_ptr->state = IBAL_CM_DISCONNECT; + dapl_os_unlock(&ep_ptr->header.lock); + + if (state == IBAL_CM_DREQ) + dapli_send_ib_cm_drep(h_cm); + else + dapli_send_ib_cm_dreq(cm_ptr->ib_cm); + + return DAT_SUCCESS; } @@ -1692,19 +1680,13 @@ dapls_ib_disconnect_clean ( } dapl_os_assert ( ep_ptr->header.magic == DAPL_MAGIC_EP ); - ep_ptr->sent_discreq = DAT_FALSE; - ep_ptr->recv_discreq = DAT_FALSE; - - /* - * Query the QP to get the current state */ ib_status = ib_query_qp ( ep_ptr->qp_handle, &qp_attr ); - if ( ib_status != IB_SUCCESS ) { dapl_dbg_log ( DAPL_DBG_TYPE_ERR, ">>>DSCONN_CLEAN(%s): Query QP failed = %s\n", (active?"Act":"Pas"),ib_get_err_str(ib_status) ); - return; + return; } ep_ptr->qp_state = qp_attr.state; diff --git a/dapl/ibal/dapl_ibal_util.h b/dapl/ibal/dapl_ibal_util.h index e28bdbe..37fcbaa 100644 --- a/dapl/ibal/dapl_ibal_util.h +++ b/dapl/ibal/dapl_ibal_util.h @@ -40,6 +40,13 @@ typedef struct ib_cm_handle *dp_ib_cm_handle_t; typedef struct ib_cm_handle *ib_cm_srvc_handle_t; #else +enum dapl_ibal_cm_state +{ + IBAL_CM_CONNECT, + IBAL_CM_DREQ, + IBAL_CM_DISCONNECT +}; + /* EP-CM linking requires list_entry, protected ref counting */ typedef struct dapl_ibal_cm { @@ -49,6 +56,7 @@ typedef struct dapl_ibal_cm DAT_SOCK_ADDR6 dst_ip_addr; ib_cm_handle_t ib_cm; /* h_al, h_qp, cid */ DAPL_EP *ep; + enum dapl_ibal_cm_state state; } dapl_ibal_cm_t; diff --git a/dapl/include/dapl.h b/dapl/include/dapl.h index a522f15..68d0ea4 100755 --- a/dapl/include/dapl.h +++ b/dapl/include/dapl.h @@ -466,10 +466,6 @@ struct dapl_ep struct io_buf_track *ibt_base; DAPL_RING_BUFFER ibt_queue; #endif /* DAPL_DBG_IO_TRC */ -#if defined(_WIN32) || defined(_WIN64) - DAT_BOOLEAN recv_discreq; - DAT_BOOLEAN sent_discreq; -#endif #ifdef DAPL_COUNTERS void *cntrs; #endif -- 2.41.0