git.openfabrics.org - ~shefty/rdma-win.git/commitdiff
ibat/resolve: retry ibat resolution
author Sean Hefty <sean.hefty@intel.com>
Wed, 17 Feb 2010 18:12:23 +0000 (10:12 -0800)
committer Sean Hefty <sean.hefty@intel.com>
Wed, 17 Feb 2010 18:12:23 +0000 (10:12 -0800)
Winverbs ND scale-out testing showed that IBAT::Resolve() can
return E_PENDING, which requires that the resolution be retried.
A similar issue was seen when testing with librdmacm.
Rather than duplicating the retry logic in the winverbs ND provider,
add new functionality to ibat with retry capability.  To
avoid breaking the ibat.dll interface, extend the API with a
new call, ResolvePath(), that takes a timeout value.

ResolvePath() automatically retries Resolve() while the result
is E_PENDING, until the request times out.  Modify the winverbs
ND provider to call ResolvePath().  Also update other places
where Resolve() is called in a loop: the librdmacm and wsd.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
trunk/core/ibat/user/ibat.cpp
trunk/inc/user/iba/ibat.h
trunk/ulp/dapl2/dapl/include/dapl.h
trunk/ulp/dapl2/dapl/openib_common/qp.c
trunk/ulp/dapl2/dapl/openib_scm/cm.c
trunk/ulp/librdmacm/src/cma.cpp
trunk/ulp/netdirect/user/nd_connect.cpp
trunk/ulp/wsd/user/ibsp_ip.c

index 69d918615789e871b287b5a82bbf978f252a7f0e..ede0a5ee19cc19878b479423f3f170f61a50c38a 100644 (file)
@@ -358,9 +358,32 @@ Resolve(
     return S_OK;\r
 }\r
 \r
-#endif\r
+#endif // WINVER >= 0x600\r
+\r
+\r
+HRESULT\r
+ResolvePath(\r
+    __in const struct sockaddr* pSrcAddr,\r
+    __in const struct sockaddr* pDestAddr,\r
+    __out IBAT_PATH_BLOB* pPath,\r
+       __in int Timeout)\r
+{\r
+       HRESULT hr;\r
+\r
+       do {\r
+               hr = Resolve(pSrcAddr, pDestAddr, pPath);\r
+               if( hr != E_PENDING || Timeout <= 0 )\r
+                       break;\r
+\r
+               Timeout -= 10;\r
+               Sleep(10);\r
+       } while( Timeout > 0 );\r
+\r
+       return hr;\r
 }\r
 \r
+} /* IBAT namespace */\r
+\r
 extern "C"\r
 {\r
 \r
@@ -374,4 +397,14 @@ IbatResolve(
     return IBAT::Resolve( pSrcAddr, pDestAddr, pPath );\r
 }\r
 \r
+HRESULT\r
+IbatResolvePath(\r
+    __in const struct sockaddr* pSrcAddr,\r
+    __in const struct sockaddr* pDestAddr,\r
+    __out IBAT_PATH_BLOB* pPath,\r
+       __in const int Timeout)\r
+{\r
+       return IBAT::ResolvePath(pSrcAddr, pDestAddr, pPath, Timeout);\r
+}\r
+\r
 } /* extern "C" */\r
index c9a174059481380202dbc84a342c80bf705f6fcb..e7f2c06c65417e72bc2ec2e924f77ef07499f79f 100644 (file)
@@ -41,6 +41,8 @@ typedef struct _IBAT_PATH_BLOB
 \r
 } IBAT_PATH_BLOB;\r
 \r
+#define IBAT_MAX_TIMEOUT 0x0FFFFFFF\r
+\r
 #ifdef __cplusplus\r
 namespace IBAT\r
 {\r
@@ -52,6 +54,14 @@ Resolve(
     __out IBAT_PATH_BLOB* pPath\r
     );\r
 \r
+HRESULT\r
+ResolvePath(\r
+    __in const struct sockaddr* pSrcAddr,\r
+    __in const struct sockaddr* pDestAddr,\r
+    __out IBAT_PATH_BLOB* pPath,\r
+       __in int Timeout        /* ms */\r
+    );\r
+\r
 }\r
 #else /* __cplusplus */\r
 \r
@@ -62,6 +72,14 @@ IbatResolve(
     __out IBAT_PATH_BLOB* pPath\r
     );\r
 \r
+HRESULT\r
+IbatResolvePath(\r
+    __in const struct sockaddr* pSrcAddr,\r
+    __in const struct sockaddr* pDestAddr,\r
+    __out IBAT_PATH_BLOB* pPath,\r
+       __in int Timeout        /* ms */\r
+    );\r
+\r
 #endif /* __cplusplus */\r
 \r
 #endif // _IBAT_H_
\ No newline at end of file
index a36b110733d561d97fb0d5255ed3e598e10b46bf..91e041c158eb4d39b38330ed8a54ddfde8592610 100644 (file)
 typedef enum dapl_magic
 {
     /* magic number values for verification & debug */
-    DAPL_MAGIC_IA      = 0xCafeF00d,
-    DAPL_MAGIC_EVD     = 0xFeedFace,
-    DAPL_MAGIC_EP      = 0xDeadBabe,
-    DAPL_MAGIC_LMR     = 0xBeefCafe,
-    DAPL_MAGIC_RMR      = 0xABadCafe,
-    DAPL_MAGIC_PZ      = 0xDeafBeef,
-    DAPL_MAGIC_PSP     = 0xBeadeD0c,
-    DAPL_MAGIC_RSP     = 0xFab4Feed,
-    DAPL_MAGIC_SRQ     = 0xC001Babe,
-    DAPL_MAGIC_CR      = 0xBe12Cee1,
-    DAPL_MAGIC_CR_DESTROYED = 0xB12bDead,
-    DAPL_MAGIC_CNO     = 0xDeadF00d,
+    DAPL_MAGIC_IA      = 0x12345678,
+    DAPL_MAGIC_EVD     = 0x02468ace,
+    DAPL_MAGIC_EP      = 0x13579bdf,
+    DAPL_MAGIC_LMR     = 0x2123ab54,
+    DAPL_MAGIC_RMR      = 0x1358bc47,
+    DAPL_MAGIC_PZ      = 0x389d9075,
+    DAPL_MAGIC_PSP     = 0x238e9080,
+    DAPL_MAGIC_RSP     = 0x12390754,
+    DAPL_MAGIC_SRQ     = 0x0ee98434,
+    DAPL_MAGIC_CR      = 0x889f3398,
+    DAPL_MAGIC_CR_DESTROYED = 0x74749009,
+    DAPL_MAGIC_CNO     = 0x78899984,
     DAPL_MAGIC_INVALID  = 0xFFFFFFFF
 } DAPL_MAGIC;
 
index c2b5c69f1caa43fe51550a8fd724382a7783d87c..b0de59800015babdd4a89609b96d2b00d5ced9e8 100644 (file)
@@ -211,6 +211,7 @@ DAT_RETURN dapls_ib_qp_free(IN DAPL_IA * ia_ptr, IN DAPL_EP * ep_ptr)
                     ep_ptr, ep_ptr->qp_handle);\r
 \r
        if (ep_ptr->cm_handle != NULL) {\r
+dapl_log(DAPL_DBG_TYPE_ERR, "dapls_ib_qp_free - calling dapls_ib_cm_free\n");\r
                dapls_ib_cm_free(ep_ptr->cm_handle, ep_ptr);\r
        }\r
        \r
@@ -481,8 +482,13 @@ dapls_modify_qp_state(IN ib_qp_handle_t            qp_handle,
                                qp_attr.pkey_index, qp_attr.port_num,\r
                                qp_attr.qp_access_flags, qp_attr.qkey);\r
                break;\r
-       default:\r
+       case IBV_QPS_RESET:\r
+               break;\r
+       case IBV_QPS_ERR:\r
                break;\r
+       default:\r
+               dapl_log(DAPL_DBG_TYPE_ERR, "invalid QP state 0x%x!\n", qp_state);\r
+               return DAT_SUCCESS;\r
        }\r
 \r
        ret = ibv_modify_qp(qp_handle, &qp_attr, mask);\r
index 1d7a8dc4d28f37a11e63aec7f707d3e39c871575..f95b356035234d152beca402e391f76494e46432 100644 (file)
@@ -311,6 +311,8 @@ void dapls_ib_cm_free(dp_ib_cm_handle_t cm_ptr, DAPL_EP *ep)
 \r
        /* cleanup, never made it to work queue */\r
        dapl_os_lock(&cm_ptr->lock);\r
+if (cm_ptr->state == DCM_DESTROY)\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "dapls_ib_cm_free - destroying twice!\n");\r
        if (cm_ptr->state == DCM_INIT) {\r
                if (cm_ptr->socket != DAPL_INVALID_SOCKET) {\r
                        shutdown(cm_ptr->socket, SHUT_RDWR);\r
@@ -391,7 +393,7 @@ notify_thread:
 /* queue socket for processing CM work */\r
 static void dapli_cm_queue(struct ib_cm_handle *cm_ptr)\r
 {\r
-       DAPL_HCA *hca_ptr = cm_ptr->hca;\r
+       DAPL_HCA *hca = cm_ptr->hca;\r
 \r
        /* add to work queue for cr thread processing */\r
        dapl_llist_init_entry((DAPL_LLIST_ENTRY *) & cm_ptr->entry);\r
@@ -411,12 +413,43 @@ static void dapli_cm_queue(struct ib_cm_handle *cm_ptr)
 DAT_RETURN dapli_socket_disconnect(dp_ib_cm_handle_t cm_ptr)\r
 {\r
        DAPL_EP *ep_ptr = cm_ptr->ep;\r
-       DAT_UINT32 disc_data = htonl(0xdead);\r
+       DAT_UINT32 disc_data = htonl(0xbad);\r
 \r
        if (ep_ptr == NULL)\r
                return DAT_SUCCESS;\r
+dapl_os_lock(&cm_ptr->lock);\r
+if (cm_ptr->ep->header.magic != DAPL_MAGIC_EP) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "bad ep magic!!!\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
+if (cm_ptr->ep->qp_handle->qp_context != cm_ptr->ep) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "bad qp_handle->qp_context!!!\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
+if (ep_ptr->qp_handle->srq) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "qp handle has srq??? likely bad handle\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
+if (ep_ptr->qp_handle->send_cq != ep_ptr->qp_handle->recv_cq) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "qp send/recv cqs do not match\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
+if (ep_ptr->qp_handle->context != ep_ptr->qp_handle->pd->context) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "qp verbs != pd verbs\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
+if (ep_ptr->qp_handle->qp_type != IBV_QPT_RC) {\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "qp type is invalid\n");\r
+  dapl_os_unlock(&cm_ptr->lock);\r
+  return DAT_SUCCESS;\r
+}\r
 \r
-       dapl_os_lock(&cm_ptr->lock);\r
+//     dapl_os_lock(&cm_ptr->lock);\r
        if (cm_ptr->state != DCM_CONNECTED) {\r
                dapl_os_unlock(&cm_ptr->lock);\r
                return DAT_SUCCESS;\r
@@ -655,6 +688,7 @@ static void dapli_socket_connect_rtu(dp_ib_cm_handle_t cm_ptr)
                        dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, \r
                                             ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,\r
                                             ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data);\r
+dapl_log(DAPL_DBG_TYPE_ERR, "dapli_socket_connect_rtu\n");\r
                        dapls_ib_cm_free(cm_ptr, NULL);\r
                        return;\r
                }\r
@@ -1460,6 +1494,8 @@ dapls_ib_remove_conn_listener(IN DAPL_IA * ia_ptr, IN DAPL_SP * sp_ptr)
        if (cm_ptr != NULL) {\r
                /* cr_thread will free */\r
                dapl_os_lock(&cm_ptr->lock);\r
+if (cm_ptr->state == DCM_DESTROY)\r
+  dapl_log(DAPL_DBG_TYPE_ERR, "dapls_ib_remove_conn_listener - destroying twice!\n");\r
                cm_ptr->state = DCM_DESTROY;\r
                sp_ptr->cm_srvc_handle = NULL;\r
                send(cm_ptr->hca->ib_trans.scm[1], "w", sizeof "w", 0);\r
index cde309b881321206bbbb73204bd544d8d9989991..40e741136b9a0f2e4be8859024593033c69b21b7 100644 (file)
@@ -506,25 +506,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
        return 0;\r
 }\r
 \r
-static int\r
-ucma_resolve_ibat_path(struct rdma_cm_id *id, int timeout_ms,\r
-                                          IBAT_PATH_BLOB *path)\r
-{\r
-       HRESULT hr;\r
-\r
-       do {\r
-               hr = IBAT::Resolve(&id->route.addr.src_addr, &id->route.addr.dst_addr,\r
-                                                  path);\r
-               if (hr != E_PENDING || timeout_ms <= 0) {\r
-                       break;\r
-               }\r
-               timeout_ms -= 10;\r
-               Sleep(10);\r
-       } while (timeout_ms > 0);\r
-\r
-       return hr;\r
-}\r
-\r
 __declspec(dllexport)\r
 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)\r
 {\r
@@ -532,7 +513,8 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
        IBAT_PATH_BLOB path;\r
        HRESULT hr;\r
 \r
-       hr = ucma_resolve_ibat_path(id, timeout_ms, &path);\r
+       hr = IBAT::ResolvePath(&id->route.addr.src_addr, &id->route.addr.dst_addr,\r
+                                                  &path, timeout_ms);\r
        if (FAILED(hr)) {\r
                return hr;\r
        }\r
index aa46adac41ce6b8187affee23c5d50b5b9066300..6ee6fa0f90d5aa1e61bb96839edcd5f1dde014d7 100644 (file)
@@ -138,12 +138,13 @@ Connect(INDEndpoint* pEndpoint,
        } else {\r
                addr.Sin6.sin6_port = LocalPort;\r
        }\r
-       hr = m_pWvConnEp->BindAddress(&addr.Sa);\r
+\r
+       hr = IBAT::ResolvePath(&addr.Sa, pAddress, &path, IBAT_MAX_TIMEOUT);\r
        if (FAILED(hr)) {\r
                goto out;\r
        }\r
 \r
-       hr = IBAT::Resolve(&addr.Sa, pAddress, &path);\r
+       hr = m_pWvConnEp->BindAddress(&addr.Sa);\r
        if (FAILED(hr)) {\r
                goto out;\r
        }\r
index a0afe89416cd5c287f143a12a118af65c72d8909..0da0c0e89b0759c758c011ace74ab0ae78f136b0 100644 (file)
@@ -270,20 +270,9 @@ query_guid_address(
        HRESULT hr;\r
 \r
        IBSP_ENTER( IBSP_DBG_HW );\r
+       hr = IbatResolvePath(p_src_addr, p_dest_addr, (IBAT_PATH_BLOB*)&path,\r
+               IBAT_MAX_TIMEOUT);\r
 \r
-       for(;;)\r
-       {\r
-               hr = IbatResolve(\r
-                       p_src_addr,\r
-                       p_dest_addr,\r
-                       (IBAT_PATH_BLOB*)&path\r
-                       );\r
-\r
-               if( hr != E_PENDING )\r
-                       break;\r
-\r
-               Sleep( 100 );\r
-       }\r
        if( hr == S_OK )\r
        {\r
                *port_guid = path.dgid.unicast.interface_id;\r