From: Arlin Davis Date: Fri, 13 Sep 2013 22:12:05 +0000 (-0700) Subject: mpxyd: ERR: stalled, insufficient proxy memory X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=e5e46b1fd0a4d3bf6c0b2353c3bf74a6efb5dd0b;p=~ardavis%2Fdapl.git mpxyd: ERR: stalled, insufficient proxy memory When scaling up/out with lots of QPs using a shared proxy buffer, the RDMA writes can block waiting for memory to free. The signal rate on the posted writes must be reduced to ensure proxy buffers are freed in a more timely manner. Add logic to return failure if stalling becomes excessive. Allow administrator to adjust IB mcm_signal_rate via mpxyd.conf. Default is now 10 instead of 100. Signed-off-by: Arlin Davis --- diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c index 77ca76b..e8b3f01 100644 --- a/dapl/svc/mpxyd.c +++ b/dapl/svc/mpxyd.c @@ -94,7 +94,7 @@ static int mix_max_msg_mb = 16; static int mix_inline_threshold = 256; static int mix_eager_completion = 1; static int mcm_ib_inline = 128; -static int mcm_ib_signal_rate = 20; +static int mcm_ib_signal_rate = 10; static int mcm_counters = 0; /* cm parameters */ @@ -844,7 +844,7 @@ static void mpxy_set_options( int debug_mode ) else if (!strcasecmp("scif_listen_qlen", opt)) scif_listen_qlen = atoi(value); else if (!strcasecmp("mcm_signal_rate", opt)) - mcm_signal = atoi(value); + mcm_ib_signal_rate = atoi(value); else if (!strcasecmp("mcm_req_timeout_ms", opt)) mcm_rep_ms = atoi(value); else if (!strcasecmp("mcm_rep_timeout_ms", opt)) @@ -892,6 +892,7 @@ static void mpxy_log_options(void) mlog(0, "RDMA SCIF inline threshold %d\n", mix_inline_threshold); mlog(0, "RDMA IB inline threshold %d\n", mcm_ib_inline); mlog(0, "RDMA eager completion %d\n", mix_eager_completion); + mlog(0, "RDMA proxy signal rate %d\n", mcm_ib_signal_rate); mlog(0, "Maximum message size %d MB\n", mix_max_msg_mb); mlog(0, "CM msg queue depth %d\n", mcm_depth); mlog(0, "CM msg completion signal rate %d\n", mcm_signal); @@ -2413,7 +2414,7 @@ err: /* create 
new proxy CQ */ static int mix_cq_create(mcm_scif_dev_t *smd, dat_mix_cq_t *pmsg) { - int len, ret; + int len, ret, cq_len; struct mcm_cq *new_mcq; /* hdr already read, get operation data */ @@ -2423,11 +2424,14 @@ static int mix_cq_create(mcm_scif_dev_t *smd, dat_mix_cq_t *pmsg) mlog(0, " ERR: ret %d, exp %d\n", ret, len); return ret; } - mlog(8, " MIX_CQ_CREATE: cq_len = %d, mic_ctx = %Lx\n", pmsg->cq_len, pmsg->cq_ctx); - if (m_cq_create(smd, pmsg->cq_len, &new_mcq)) + cq_len = max(pmsg->cq_len, DAT_MIX_WR_MAX); + mlog(8, " MIX_CQ_CREATE: cq_len = %d,%d mic_ctx = %Lx\n", pmsg->cq_len, cq_len, pmsg->cq_ctx); + + if (m_cq_create(smd, cq_len, &new_mcq)) goto err; new_mcq->cq_ctx = pmsg->cq_ctx; + pmsg->cq_len = cq_len; pmsg->cq_id = new_mcq->cq_id = new_mcq->entry.tid; pmsg->cq_ctx = (uint64_t)new_mcq; pmsg->hdr.status = MIX_SUCCESS; @@ -3413,10 +3417,21 @@ retry_mr: if (l_start < smd->m_tl && l_end > smd->m_tl) { MCNTR(smd->md, MCM_MX_MR_STALL); - if (++retries == 1) + if (++retries == 1) { write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); /* signal tx_thread */ - mlog(0, " ERR: stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d, retries %d\n", - smd->m_hd, smd->m_tl, seg_len, retries); + mlog(0, " Warning: stalled, low proxy memory, %x hd 0x%x tl 0x%x %x," + " need 0x%x-0x%x ln %d, retrying %d\n", + smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + smd->m_len, + l_start, l_end, seg_len, retries); + } + if (retries > 500) { + mlog(0, " ERROR: retries exhuasted, no proxy memory, %x hd 0x%x tl 0x%x %x," + " need 0x%x-0x%x ln %d, retries = %d\n", + smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + smd->m_len, + l_start, l_end, seg_len, retries); + ret = ENOMEM; + goto bail; + } pthread_mutex_unlock(&smd->qplock); sleep_usec(500); pthread_mutex_lock(&smd->qplock); @@ -3434,7 +3449,7 @@ retry_mr: MCNTR(smd->md, MCM_MX_MR_STALL); if (++retries == 1) write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); /* signal tx_thread */ - mlog(0, " ERR: stalled, insufficient proxy 
memory, hd 0x%x, tl 0x%x, len %d retries %d\n", + mlog(0, " ERR: stalled, low proxy memory, hd 0x%x, tl 0x%x, len %d retries %d\n", m_qp->m_hd, m_qp->m_tl, seg_len, retries); pthread_mutex_unlock(&smd->qplock); sleep_usec(500); diff --git a/doc/mpxyd.conf b/doc/mpxyd.conf index 7d80860..077c023 100644 --- a/doc/mpxyd.conf +++ b/doc/mpxyd.conf @@ -64,18 +64,17 @@ mcm_affinity_base_hca 0 mcm_affinity_base_mic 0 # mcm_depth: -# Specifies the number of request queue entries available for RDMA. -# per client/server connection. A larger depth consumes more system -# resources +# Specifies the number of request queue entries available for CM messages. +# A larger depth consumes more system resources, 256 * mcm_depth * ib devices mcm_depth 500 # mcm_signal_rate: -# Specifies the number of request posted before signaling for completions. +# Specifies the number of request segments posted before signaling for completions. # Larger the value reduces interrupts but could increase reserve times on -# buffer resources. +# buffer resources. Default = 10 -mcm_signal_rate 100 +mcm_signal_rate 10 # max_message_mb: # Specifies the maximum message size. The default is 16 (MB).