From: Arlin Davis Date: Thu, 13 May 2010 17:31:17 +0000 (-0700) Subject: scm: SOCKOPT ERR Connection timed out on large clusters X-Git-Tag: dapl-2.0.30-1~25 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=4b04afc32940ac42fb2a9bc789a537b527d149fe;p=~ardavis%2Fdapl.git scm: SOCKOPT ERR Connection timed out on large clusters With large-scale all-to-all connections on 1000+ cores, the listen backlog is reached and SYNs are dropped, which causes the connect to time out. Retry connect on timeout errors. Signed-off-by: Arlin Davis --- diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c index 7465190..4c8d4a1 100644 --- a/dapl/openib_scm/cm.c +++ b/dapl/openib_scm/cm.c @@ -60,6 +60,12 @@ #include "dapl_ep_util.h" #include "dapl_osd.h" +/* forward declarations */ +static DAT_RETURN +dapli_socket_connect(DAPL_EP * ep_ptr, + DAT_IA_ADDRESS_PTR r_addr, + DAT_CONN_QUAL r_qual, DAT_COUNT p_size, DAT_PVOID p_data); + #ifdef DAPL_DBG /* Check for EP linking to IA and proper connect state */ void dapli_ep_check(DAPL_EP *ep) @@ -494,13 +500,27 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err) if (err) { dapl_log(DAPL_DBG_TYPE_ERR, - " CONN_PENDING: %s ERR %s -> %s %d\n", + " CONN_PENDING: %s ERR %s -> %s %d - %s\n", err == -1 ? "POLL" : "SOCKOPT", err == -1 ? strerror(dapl_socket_errno()) : strerror(err), inet_ntoa(((struct sockaddr_in *) &cm_ptr->addr)->sin_addr), ntohs(((struct sockaddr_in *) - &cm_ptr->addr)->sin_port)); + &cm_ptr->addr)->sin_port), + err == ETIMEDOUT ? "RETRYING...":"ABORTING"); + + /* retry a timeout */ + if (err == ETIMEDOUT) { + closesocket(cm_ptr->socket); + cm_ptr->socket = DAPL_INVALID_SOCKET; + dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, + ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000, + ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data); + dapl_ep_unlink_cm(cm_ptr->ep, cm_ptr); + dapli_cm_free(cm_ptr); + return; + } + goto bail; }