git.openfabrics.org - ~ardavis/dapl.git/commitdiff
scm: SOCKOPT ERR Connection timed out on large clusters
author Arlin Davis <arlin.r.davis@intel.com>
Thu, 13 May 2010 17:31:17 +0000 (10:31 -0700)
committer Arlin Davis <arlin.r.davis@intel.com>
Thu, 13 May 2010 17:31:17 +0000 (10:31 -0700)
With large-scale all-to-all connections on 1000+ cores,
the listen backlog is reached and SYNs are dropped,
which causes the connect to time out. Retry the connect
on timeout errors.

Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/openib_scm/cm.c

index 746519095d59f4cee034f13b8c3795359527bb39..4c8d4a117b1f1d1d53ba643f054c31e4ed7da17b 100644 (file)
 #include "dapl_ep_util.h"
 #include "dapl_osd.h"
 
+/* forward declarations */
+static DAT_RETURN
+dapli_socket_connect(DAPL_EP * ep_ptr,
+                    DAT_IA_ADDRESS_PTR r_addr,
+                    DAT_CONN_QUAL r_qual, DAT_COUNT p_size, DAT_PVOID p_data);
+
 #ifdef DAPL_DBG
 /* Check for EP linking to IA and proper connect state */
 void dapli_ep_check(DAPL_EP *ep)
@@ -494,13 +500,27 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 
        if (err) {
                dapl_log(DAPL_DBG_TYPE_ERR,
-                        " CONN_PENDING: %s ERR %s -> %s %d\n",
+                        " CONN_PENDING: %s ERR %s -> %s %d - %s\n",
                         err == -1 ? "POLL" : "SOCKOPT",
                         err == -1 ? strerror(dapl_socket_errno()) : strerror(err), 
                         inet_ntoa(((struct sockaddr_in *)
                                &cm_ptr->addr)->sin_addr), 
                         ntohs(((struct sockaddr_in *)
-                               &cm_ptr->addr)->sin_port));
+                               &cm_ptr->addr)->sin_port),
+                        err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+
+               /* retry a timeout */
+               if (err == ETIMEDOUT) {
+                       closesocket(cm_ptr->socket);
+                       cm_ptr->socket = DAPL_INVALID_SOCKET;
+                       dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
+                                            ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,
+                                            ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data);
+                       dapl_ep_unlink_cm(cm_ptr->ep, cm_ptr);
+                       dapli_cm_free(cm_ptr);
+                       return;
+               }
+
                goto bail;
        }