]> git.openfabrics.org - ~ardavis/dapl.git/commitdiff
mpxyd: ERR: stalled, insufficient proxy memory
author Arlin Davis <arlin.r.davis@intel.com>
Fri, 13 Sep 2013 22:12:05 +0000 (15:12 -0700)
committer Arlin Davis <arlin.r.davis@intel.com>
Fri, 13 Sep 2013 22:12:05 +0000 (15:12 -0700)
When scaling up/out with lots of QPs using a shared
proxy buffer, the RDMA writes can block waiting for
memory to free. The signal rate on the posted
writes must be reduced to ensure proxy buffers
are freed in a more timely manner.

Add logic to return failure if stalling becomes
excessive.

Allow administrator to adjust IB mcm_signal_rate
via mpxyd.conf. Default is now 10 instead of 100.

Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/svc/mpxyd.c
doc/mpxyd.conf

index 77ca76bff17e7f42904a4b256928b4034711ac92..e8b3f01274353d29cabf54b90cb8da593c90eb5c 100644 (file)
@@ -94,7 +94,7 @@ static int mix_max_msg_mb = 16;
 static int mix_inline_threshold = 256;
 static int mix_eager_completion = 1;
 static int mcm_ib_inline = 128;
-static int mcm_ib_signal_rate = 20;
+static int mcm_ib_signal_rate = 10;
 static int mcm_counters = 0;
 
 /* cm parameters */
@@ -844,7 +844,7 @@ static void mpxy_set_options( int debug_mode )
                else if (!strcasecmp("scif_listen_qlen", opt))
                        scif_listen_qlen = atoi(value);
                else if (!strcasecmp("mcm_signal_rate", opt))
-                       mcm_signal = atoi(value);
+                       mcm_ib_signal_rate = atoi(value);
                else if (!strcasecmp("mcm_req_timeout_ms", opt))
                        mcm_rep_ms = atoi(value);
                else if (!strcasecmp("mcm_rep_timeout_ms", opt))
@@ -892,6 +892,7 @@ static void mpxy_log_options(void)
        mlog(0, "RDMA SCIF inline threshold %d\n", mix_inline_threshold);
        mlog(0, "RDMA IB inline threshold %d\n", mcm_ib_inline);
        mlog(0, "RDMA eager completion %d\n", mix_eager_completion);
+       mlog(0, "RDMA proxy signal rate %d\n", mcm_ib_signal_rate);
        mlog(0, "Maximum message size %d MB\n", mix_max_msg_mb);
        mlog(0, "CM msg queue depth %d\n", mcm_depth);
        mlog(0, "CM msg completion signal rate %d\n", mcm_signal);
@@ -2413,7 +2414,7 @@ err:
 /* create new proxy CQ */
 static int mix_cq_create(mcm_scif_dev_t *smd, dat_mix_cq_t *pmsg)
 {
-       int len, ret;
+       int len, ret, cq_len;
        struct mcm_cq *new_mcq;
 
        /* hdr already read, get operation data */
@@ -2423,11 +2424,14 @@ static int mix_cq_create(mcm_scif_dev_t *smd, dat_mix_cq_t *pmsg)
                mlog(0, " ERR: ret %d, exp %d\n", ret, len);
                return ret;
        }
-       mlog(8, " MIX_CQ_CREATE: cq_len = %d, mic_ctx = %Lx\n", pmsg->cq_len, pmsg->cq_ctx);
-       if (m_cq_create(smd, pmsg->cq_len, &new_mcq))
+       cq_len = max(pmsg->cq_len, DAT_MIX_WR_MAX);
+       mlog(8, " MIX_CQ_CREATE: cq_len = %d,%d mic_ctx = %Lx\n", pmsg->cq_len, cq_len, pmsg->cq_ctx);
+
+       if (m_cq_create(smd, cq_len, &new_mcq))
                goto err;
 
        new_mcq->cq_ctx = pmsg->cq_ctx;
+       pmsg->cq_len = cq_len;
        pmsg->cq_id = new_mcq->cq_id = new_mcq->entry.tid;
        pmsg->cq_ctx = (uint64_t)new_mcq;
        pmsg->hdr.status = MIX_SUCCESS;
@@ -3413,10 +3417,21 @@ retry_mr:
 
                                if (l_start < smd->m_tl && l_end > smd->m_tl) {
                                        MCNTR(smd->md, MCM_MX_MR_STALL);
-                                       if (++retries == 1)
+                                       if (++retries == 1) {
                                                write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); /* signal tx_thread */
-                                       mlog(0, " ERR: stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d, retries %d\n",
-                                               smd->m_hd, smd->m_tl, seg_len, retries);
+                                               mlog(0, " Warning: stalled, low proxy memory, %x hd 0x%x tl 0x%x %x,"
+                                                       " need 0x%x-0x%x ln %d, retrying %d\n",
+                                                       smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + smd->m_len,
+                                                       l_start, l_end, seg_len, retries);
+                                       }
+                                       if (retries > 500) {
+                                               mlog(0, " ERROR: retries exhuasted, no proxy memory, %x hd 0x%x tl 0x%x %x,"
+                                                       " need 0x%x-0x%x ln %d, retries = %d\n",
+                                                       smd->m_buf, smd->m_hd, smd->m_tl, smd->m_buf + smd->m_len,
+                                                       l_start, l_end, seg_len, retries);
+                                               ret = ENOMEM;
+                                               goto bail;
+                                       }
                                        pthread_mutex_unlock(&smd->qplock);
                                        sleep_usec(500);
                                        pthread_mutex_lock(&smd->qplock);
@@ -3434,7 +3449,7 @@ retry_mr:
                                        MCNTR(smd->md, MCM_MX_MR_STALL);
                                        if (++retries == 1)
                                                write(smd->md->mc->tx_pipe[1], "w", sizeof("w")); /* signal tx_thread */
-                                       mlog(0, " ERR: stalled, insufficient proxy memory, hd 0x%x, tl 0x%x, len %d retries %d\n",
+                                       mlog(0, " ERR: stalled, low proxy memory, hd 0x%x, tl 0x%x, len %d retries %d\n",
                                                m_qp->m_hd, m_qp->m_tl, seg_len, retries);
                                        pthread_mutex_unlock(&smd->qplock);
                                        sleep_usec(500);
index 7d8086095263165ce206e3224d206a1a038f2a50..077c023569a135f203bbff3e6e8cd36881c94d6e 100644 (file)
@@ -64,18 +64,17 @@ mcm_affinity_base_hca 0
 mcm_affinity_base_mic 0
 
 # mcm_depth:
-# Specifies the number of request queue entries available for RDMA.
-# per client/server connection.  A larger depth consumes more system 
-# resources
+# Specifies the number of request queue entries available for CM messages.
+# A larger depth consumes more system resources, 256 * mcm_depth * ib devices
 
 mcm_depth 500
 
 # mcm_signal_rate:
-# Specifies the number of request posted before signaling for completions.
+# Specifies the number of request segments posted before signaling for completions.
 # Larger the value reduces interrupts but could increase reserve times on 
-# buffer resources. 
+# buffer resources. Default = 10
 
-mcm_signal_rate 100
+mcm_signal_rate 10
 
 # max_message_mb:
 # Specifies the maximum message size. The default is 16 (MB).