git.openfabrics.org - ~ardavis/dapl.git/commitdiff
r7755: Use the uCM set_option feature to adjust connect request timeout
author: Arlin Davis <ardavis@ichips.intel.com>
Tue, 6 Jun 2006 21:46:44 +0000 (21:46 +0000)
committer: James Lentini <jlentini@netapp.com>
Tue, 6 Jun 2006 21:46:44 +0000 (21:46 +0000)
and retry values. Also includes a fix to disallow delivering any event after a
disconnect event.
Signed-off-by: Arlin Davis <ardavis@ichips.intel.com>
Signed-off-by: James Lentini <jlentini@netapp.com>
dapl/openib_cma/dapl_ib_cm.c
dapl/openib_cma/dapl_ib_util.c
dapl/openib_cma/dapl_ib_util.h

index 6d8673ba3d6404457ac30e7940210bde7be11b55..3a4bc2312b275daf0cdb87e0bf7a921a1ce603d0 100644 (file)
@@ -58,6 +58,7 @@
 #include "dapl_ib_util.h"
 #include <sys/poll.h>
 #include <signal.h>
+#include <rdma/rdma_cma_ib.h>
 
 extern struct rdma_event_channel *g_cm_events;
 
@@ -85,7 +86,6 @@ static inline uint64_t cpu_to_be64(uint64_t x) { return x; }
     (unsigned short)((SID % IB_PORT_MOD) + IB_PORT_BASE) :\
     (unsigned short)SID)
 
-
 static void dapli_addr_resolve(struct dapl_cm_id *conn)
 {
        int ret;
@@ -114,6 +114,8 @@ static void dapli_addr_resolve(struct dapl_cm_id *conn)
 static void dapli_route_resolve(struct dapl_cm_id *conn)
 {
        int ret;
+       size_t optlen = sizeof(struct ib_cm_req_opt);
+       struct ib_cm_req_opt req_opt;
 #ifdef DAPL_DBG
        struct rdma_addr *ipaddr = &conn->cm_id->route.addr;
        struct ib_addr   *ibaddr = &conn->cm_id->route.addr.addr.ibaddr;
@@ -143,13 +145,43 @@ static void dapli_route_resolve(struct dapl_cm_id *conn)
                        cpu_to_be64(ibaddr->dgid.global.interface_id));
        
        dapl_dbg_log(DAPL_DBG_TYPE_CM, 
-               " rdma_connect: cm_id %p pdata %p plen %d rr %d ind %d\n",
+               " route_resolve: cm_id %p pdata %p plen %d rr %d ind %d\n",
                conn->cm_id,
                conn->params.private_data, 
                conn->params.private_data_len,
                conn->params.responder_resources, 
                conn->params.initiator_depth );
 
+       /* Get default connect request timeout values, and adjust */
+       ret = rdma_get_option(conn->cm_id, RDMA_PROTO_IB, IB_CM_REQ_OPTIONS,
+                             (void*)&req_opt, &optlen);
+       if (ret) {
+               dapl_dbg_log(DAPL_DBG_TYPE_ERR, " rdma_get_option failed: %s\n",
+                            strerror(errno));
+               goto bail;
+       }
+
+       dapl_dbg_log(DAPL_DBG_TYPE_CM, " route_resolve: "
+                    "Set CR times - response %d to %d, retry %d to %d\n",
+                    req_opt.remote_cm_response_timeout, 
+                    conn->hca->ib_trans.max_cm_timeout,
+                    req_opt.max_cm_retries, 
+                    conn->hca->ib_trans.max_cm_retries);
+
+       /* Use hca response time setting for connect requests */
+       req_opt.max_cm_retries = conn->hca->ib_trans.max_cm_retries;
+       req_opt.remote_cm_response_timeout = 
+                               conn->hca->ib_trans.max_cm_timeout;
+       req_opt.local_cm_response_timeout = 
+                               req_opt.remote_cm_response_timeout;
+       ret = rdma_set_option(conn->cm_id, RDMA_PROTO_IB, IB_CM_REQ_OPTIONS,
+                             (void*)&req_opt, optlen);
+       if (ret) {
+               dapl_dbg_log(DAPL_DBG_TYPE_ERR, " rdma_set_option failed: %s\n",
+                            strerror(errno));
+               goto bail;
+       }
+
        ret = rdma_connect(conn->cm_id, &conn->params);
        if (ret) {
                dapl_dbg_log(DAPL_DBG_TYPE_ERR, " rdma_connect failed: %s\n",
@@ -273,14 +305,37 @@ static void dapli_cm_active_cb(struct dapl_cm_id *conn,
        }
        dapl_os_unlock(&conn->lock);
 
+        /* There is a chance that we can get events after
+         * the consumer calls disconnect in a pending state
+         * since the IB CM and uDAPL states are not shared.
+         * In some cases, IB CM could generate either a DCONN
+         * or CONN_ERR after the consumer returned from
+         * dapl_ep_disconnect with a DISCONNECTED event
+         * already queued. Check state here and bail to
+         * avoid any events after a disconnect.
+         */
+        if (DAPL_BAD_HANDLE(conn->ep, DAPL_MAGIC_EP))
+                return;
+
+        dapl_os_lock(&conn->ep->header.lock);
+        if (conn->ep->param.ep_state == DAT_EP_STATE_DISCONNECTED) {
+                dapl_os_unlock(&conn->ep->header.lock);
+                return;
+        }
+        if (event->event == RDMA_CM_EVENT_DISCONNECTED)
+                conn->ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
+
+        dapl_os_unlock(&conn->ep->header.lock);
+
        switch (event->event) {
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_CONNECT_ERROR:
-               dapl_dbg_log(
-                       DAPL_DBG_TYPE_WARN,
-                       " dapli_cm_active_handler: CONN_ERR "
-                       " event=0x%x status=%d\n",      
-                       event->event, event->status);
+                dapl_dbg_log(
+                        DAPL_DBG_TYPE_WARN,
+                        " dapli_cm_active_handler: CONN_ERR "
+                        " event=0x%x status=%d %s\n",
+                        event->event, event->status,
+                        (event->status == -110)?"TIMEOUT":"" );
 
                dapl_evd_connection_callback(conn,
                                             IB_CME_DESTINATION_UNREACHABLE,
@@ -368,25 +423,23 @@ static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
                                          event->private_data, new_conn->sp);
                break;
        case RDMA_CM_EVENT_UNREACHABLE:
-               dapls_cr_callback(conn, IB_CME_DESTINATION_UNREACHABLE,
-                                NULL, conn->sp);
-
        case RDMA_CM_EVENT_CONNECT_ERROR:
 
                dapl_dbg_log(
-                       DAPL_DBG_TYPE_WARN, 
-                       " dapli_cm_passive: CONN_ERR "
-                       " event=0x%x status=%d",
-                       " on SRC 0x%x,0x%x DST 0x%x,0x%x\n",
-                       event->event, event->status,
-                       ntohl(((struct sockaddr_in *)
-                               &ipaddr->src_addr)->sin_addr.s_addr),
-                       ntohs(((struct sockaddr_in *)
-                               &ipaddr->src_addr)->sin_port),
-                       ntohl(((struct sockaddr_in *)
-                               &ipaddr->dst_addr)->sin_addr.s_addr),
-                       ntohs(((struct sockaddr_in *)
-                               &ipaddr->dst_addr)->sin_port));
+                        DAPL_DBG_TYPE_WARN,
+                        " dapli_cm_passive: CONN_ERR "
+                        " event=0x%x status=%d %s"
+                        " on SRC 0x%x,0x%x DST 0x%x,0x%x\n",
+                        event->event, event->status,
+                        (event->status == -110)?"TIMEOUT":"",
+                        ntohl(((struct sockaddr_in *)
+                                &ipaddr->src_addr)->sin_addr.s_addr),
+                        ntohs(((struct sockaddr_in *)
+                                &ipaddr->src_addr)->sin_port),
+                        ntohl(((struct sockaddr_in *)
+                                &ipaddr->dst_addr)->sin_addr.s_addr),
+                        ntohs(((struct sockaddr_in *)
+                                &ipaddr->dst_addr)->sin_port));
 
                dapls_cr_callback(conn, IB_CME_DESTINATION_UNREACHABLE,
                                 NULL, conn->sp);
index b733ebbaddb3174739149dd1e9b8119c458f780f..76a2cdf3f627f3024652a438b364fd5a61330ecb 100644 (file)
@@ -264,7 +264,15 @@ DAT_RETURN dapls_ib_open_hca(IN IB_HCA_NAME hca_name, IN DAPL_HCA *hca_ptr)
        /* set inline max with env or default, get local lid and gid 0 */
        hca_ptr->ib_trans.max_inline_send = 
                dapl_os_get_env_val("DAPL_MAX_INLINE", INLINE_SEND_DEFAULT);
-               
+
+       /* set CM timer defaults */     
+       hca_ptr->ib_trans.max_cm_timeout =
+               dapl_os_get_env_val("DAPL_MAX_CM_RESPONSE_TIME", 
+                                   IB_CM_RESPONSE_TIMEOUT);
+       hca_ptr->ib_trans.max_cm_retries = 
+               dapl_os_get_env_val("DAPL_MAX_CM_RETRIES", 
+                                   IB_CM_RETRIES);
+
        /* EVD events without direct CQ channels, non-blocking */
        hca_ptr->ib_trans.ib_cq = 
                ibv_create_comp_channel(hca_ptr->ib_hca_handle);
index a6f3bdb581228784eb6b923e8051925fa5046f12..5686ddf0a1bcd41f1a30b95f0a1137250ec27b30 100644 (file)
@@ -67,8 +67,8 @@ typedef ib_hca_handle_t               dapl_ibal_ca_t;
 
 #define IB_RC_RETRY_COUNT      7
 #define IB_RNR_RETRY_COUNT     7
-#define IB_CM_RESPONSE_TIMEOUT 18      /* 1 sec */
-#define IB_MAX_CM_RETRIES      7
+#define IB_CM_RESPONSE_TIMEOUT  20     /* 4 sec */
+#define IB_CM_RETRIES           15
 #define IB_REQ_MRA_TIMEOUT     27      /* a little over 9 minutes */
 #define IB_MAX_AT_RETRY                3
 #define IB_TARGET_MAX          4       /* max_qp_ous_rd_atom */
@@ -252,6 +252,8 @@ typedef struct _ib_hca_transport
        ib_async_cq_handler_t   async_cq_error;
        ib_async_dto_handler_t  async_cq;
        ib_async_qp_handler_t   async_qp_error;
+       uint8_t                 max_cm_timeout;
+       uint8_t                 max_cm_retries;
 
 } ib_hca_transport_t;