git.openfabrics.org - ~ardavis/dapl.git/commitdiff
common: race conditions with DTO error, disconnect and dapl_ep_reset
author: Arlin Davis <arlin.r.davis@intel.com>
Wed, 7 Jul 2010 18:31:44 +0000 (11:31 -0700)
committer: Arlin Davis <arlin.r.davis@intel.com>
Wed, 7 Jul 2010 18:31:44 +0000 (11:31 -0700)
Add locking to dapl_ep_reset() to avoid race conditions with disconnect events
and DTO errors. When a DTO error is seen in dapli_evd_cqe_to_event(), there is
no need for manual disconnect processing. During the disconnect
phase, the CMA provider should wait while the EP is in the DISCONNECT_PENDING state.
The EP could move directly to the UNCONNECTED state via dapl_ep_reset().

Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/common/dapl_ep_reset.c
dapl/common/dapl_evd_util.c
dapl/openib_cma/dapl_ib_cm.c

index e98b115d1fa8ff448552bdd7b5e0cc836126aa9d..bd6975f0e94b77d48dc1ea23e3dd349de717d593 100644 (file)
@@ -81,18 +81,23 @@ dapl_ep_reset (
        goto bail;
     }
 
+    dapl_os_lock(&ep_ptr->header.lock);
     if ( ep_ptr->param.ep_state != DAT_EP_STATE_UNCONNECTED
         && ep_ptr->param.ep_state != DAT_EP_STATE_DISCONNECTED )
     {
        dat_status = DAT_ERROR (DAT_INVALID_STATE,dapls_ep_state_subtype (ep_ptr));
+       dapl_os_unlock(&ep_ptr->header.lock);
        goto bail;
     }
 
     if ( ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED )
     {
+       dapl_os_unlock(&ep_ptr->header.lock);
        dapls_ib_reinit_ep ( ep_ptr );
+       dapl_os_lock(&ep_ptr->header.lock);
        ep_ptr->param.ep_state = DAT_EP_STATE_UNCONNECTED;
     }
+    dapl_os_unlock(&ep_ptr->header.lock);
 
  bail:
     return dat_status;
index ccec627410a6e2948a9981cb62ef50f2dc5395d1..8ea2ce833528915eb252058612a548bcb01b00d8 100644 (file)
@@ -1168,49 +1168,12 @@ dapli_evd_cqe_to_event (
     /*
      * Most error DTO ops result in disconnecting the EP. See
      * IBTA Vol 1.1, Chapter 10,Table 68, for expected effect on
-     * state.
+     * state. The QP going to error state will trigger disconnect
+     * at provider level. No need to force disconnect here. Just
+     * print error log.
      */
-    if ((dto_status != DAT_DTO_SUCCESS) &&
-        (dto_status != DAT_DTO_ERR_FLUSHED))
+    if ((dto_status != DAT_DTO_SUCCESS) && (dto_status != DAT_DTO_ERR_FLUSHED))
     {
-       DAPL_EVD                *evd_ptr;
-
-       /*
-        * If we are connected, generate disconnect and generate an
-        * event. We may be racing with other disconnect ops, so we
-        * need to check. We may also be racing CM connection events,
-        * requiring us to check for connection pending states too.
-        */
-       dapl_os_lock ( &ep_ptr->header.lock );
-       if (ep_ptr->param.ep_state == DAT_EP_STATE_CONNECTED ||
-           ep_ptr->param.ep_state == DAT_EP_STATE_ACTIVE_CONNECTION_PENDING ||
-           ep_ptr->param.ep_state == DAT_EP_STATE_PASSIVE_CONNECTION_PENDING||
-           ep_ptr->param.ep_state == DAT_EP_STATE_COMPLETION_PENDING )
-
-       {
-           ep_ptr->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-           dapl_os_unlock ( &ep_ptr->header.lock );
-           dapls_io_trc_dump (ep_ptr, cqe_ptr, dto_status);
-
-           /* Let the other side know we have disconnected */
-           (void) dapls_ib_disconnect (ep_ptr, DAT_CLOSE_ABRUPT_FLAG);
-
-           /* ... and clean up the local side */
-           evd_ptr = (DAPL_EVD *) ep_ptr->param.connect_evd_handle;
-           if (evd_ptr != NULL)
-           {
-               dapls_evd_post_connection_event (evd_ptr,
-                                               DAT_CONNECTION_EVENT_BROKEN,
-                                               (DAT_HANDLE) ep_ptr,
-                                               0,
-                                               0);
-           }
-       }
-       else
-       {
-           dapl_os_unlock ( &ep_ptr->header.lock );
-       }
-
        dapl_log(DAPL_DBG_TYPE_ERR,
                 "DTO completion ERR: status %d, op %s, vendor_err 0x%x - %s\n",
                 DAPL_GET_CQE_STATUS(cqe_ptr),
index f2eb8cb530df229fc3d30a5f22d3988a5fd8098d..576e19e8eaccecd794a8c1ff4a91e5969e484112 100755 (executable)
@@ -611,6 +611,17 @@ dapls_ib_disconnect(IN DAPL_EP *ep_ptr,
                             " disconnect: ID %p ret %d\n", 
                             ep_ptr->cm_handle, ret);
 
+       /* ABRUPT close, wait for callback and !DISCONNECT_PENDING state */
+       if (close_flags == DAT_CLOSE_ABRUPT_FLAG) {
+               dapl_os_lock(&ep_ptr->header.lock);
+               while (ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECT_PENDING) {
+                       dapl_os_unlock(&ep_ptr->header.lock);
+                       dapl_os_sleep_usec(10000);
+                       dapl_os_lock(&ep_ptr->header.lock);
+               }
+               dapl_os_unlock(&ep_ptr->header.lock);
+       }
+
        /* 
         * DAT event notification occurs from the callback
         * Note: will fire even if DREQ goes unanswered on timeout