From 520a1038414a55e05673cbfab284516e2b5fae04 Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Wed, 7 Jul 2010 11:31:44 -0700 Subject: [PATCH] common: race conditions with DTO error, disconnect and dapl_ep_reset Add locking to dapl_ep_reset to avoid race conditions with disconnect events and DTO errors. During DTO errors in the cqe_to_event call there is no need for manual disconnect processing. During the disconnect phase the CMA provider should wait while the EP is in DISCONNECT_PENDING state. The EP could move directly to UNCONNECTED state with dapl_ep_reset(). Signed-off-by: Arlin Davis --- dapl/common/dapl_ep_reset.c | 5 ++++ dapl/common/dapl_evd_util.c | 45 ++++-------------------------------- dapl/openib_cma/dapl_ib_cm.c | 11 +++++++++ 3 files changed, 20 insertions(+), 41 deletions(-) diff --git a/dapl/common/dapl_ep_reset.c b/dapl/common/dapl_ep_reset.c index e98b115..bd6975f 100644 --- a/dapl/common/dapl_ep_reset.c +++ b/dapl/common/dapl_ep_reset.c @@ -81,18 +81,23 @@ dapl_ep_reset ( goto bail; } + dapl_os_lock(&ep_ptr->header.lock); if ( ep_ptr->param.ep_state != DAT_EP_STATE_UNCONNECTED && ep_ptr->param.ep_state != DAT_EP_STATE_DISCONNECTED ) { dat_status = DAT_ERROR (DAT_INVALID_STATE,dapls_ep_state_subtype (ep_ptr)); + dapl_os_unlock(&ep_ptr->header.lock); goto bail; } if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED ) { + dapl_os_unlock(&ep_ptr->header.lock); dapls_ib_reinit_ep ( ep_ptr ); + dapl_os_lock(&ep_ptr->header.lock); ep_ptr->param.ep_state = DAT_EP_STATE_UNCONNECTED; } + dapl_os_unlock(&ep_ptr->header.lock); bail: return dat_status; diff --git a/dapl/common/dapl_evd_util.c b/dapl/common/dapl_evd_util.c index ccec627..8ea2ce8 100644 --- a/dapl/common/dapl_evd_util.c +++ b/dapl/common/dapl_evd_util.c @@ -1168,49 +1168,12 @@ dapli_evd_cqe_to_event ( /* * Most error DTO ops result in disconnecting the EP. See * IBTA Vol 1.1, Chapter 10,Table 68, for expected effect on - * state. + * state. The QP going to error state will trigger disconnect + * at provider level. 
No need to force disconnect here. Just + * print error log. */ - if ((dto_status != DAT_DTO_SUCCESS) && - (dto_status != DAT_DTO_ERR_FLUSHED)) + if ((dto_status != DAT_DTO_SUCCESS) && (dto_status != DAT_DTO_ERR_FLUSHED)) { - DAPL_EVD *evd_ptr; - - /* - * If we are connected, generate disconnect and generate an - * event. We may be racing with other disconnect ops, so we - * need to check. We may also be racing CM connection events, - * requiring us to check for connection pending states too. - */ - dapl_os_lock ( &ep_ptr->header.lock ); - if (ep_ptr->param.ep_state == DAT_EP_STATE_CONNECTED || - ep_ptr->param.ep_state == DAT_EP_STATE_ACTIVE_CONNECTION_PENDING || - ep_ptr->param.ep_state == DAT_EP_STATE_PASSIVE_CONNECTION_PENDING|| - ep_ptr->param.ep_state == DAT_EP_STATE_COMPLETION_PENDING ) - - { - ep_ptr->param.ep_state = DAT_EP_STATE_DISCONNECTED; - dapl_os_unlock ( &ep_ptr->header.lock ); - dapls_io_trc_dump (ep_ptr, cqe_ptr, dto_status); - - /* Let the other side know we have disconnected */ - (void) dapls_ib_disconnect (ep_ptr, DAT_CLOSE_ABRUPT_FLAG); - - /* ... 
and clean up the local side */ - evd_ptr = (DAPL_EVD *) ep_ptr->param.connect_evd_handle; - if (evd_ptr != NULL) - { - dapls_evd_post_connection_event (evd_ptr, - DAT_CONNECTION_EVENT_BROKEN, - (DAT_HANDLE) ep_ptr, - 0, - 0); - } - } - else - { - dapl_os_unlock ( &ep_ptr->header.lock ); - } - dapl_log(DAPL_DBG_TYPE_ERR, "DTO completion ERR: status %d, op %s, vendor_err 0x%x - %s\n", DAPL_GET_CQE_STATUS(cqe_ptr), diff --git a/dapl/openib_cma/dapl_ib_cm.c b/dapl/openib_cma/dapl_ib_cm.c index f2eb8cb..576e19e 100755 --- a/dapl/openib_cma/dapl_ib_cm.c +++ b/dapl/openib_cma/dapl_ib_cm.c @@ -611,6 +611,17 @@ dapls_ib_disconnect(IN DAPL_EP *ep_ptr, " disconnect: ID %p ret %d\n", ep_ptr->cm_handle, ret); + /* ABRUPT close, wait for callback and !DISCONNECT_PENDING state */ + if (close_flags == DAT_CLOSE_ABRUPT_FLAG) { + dapl_os_lock(&ep_ptr->header.lock); + while (ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECT_PENDING) { + dapl_os_unlock(&ep_ptr->header.lock); + dapl_os_sleep_usec(10000); + dapl_os_lock(&ep_ptr->header.lock); + } + dapl_os_unlock(&ep_ptr->header.lock); + } + /* * DAT event notification occurs from the callback * Note: will fire even if DREQ goes unanswered on timeout -- 2.41.0