]> git.openfabrics.org - ~ardavis/dapl.git/commitdiff
common: add cm, link, and diag event counters in IB extended builds
authorArlin Davis <arlin.r.davis@intel.com>
Fri, 20 Apr 2012 00:15:22 +0000 (17:15 -0700)
committerArlin Davis <arlin.r.davis@intel.com>
Fri, 20 Apr 2012 00:15:22 +0000 (17:15 -0700)
Add additional event monitoring capabilities during runtime to help
isolate issues during scaling in lieu of logging/printing warning
messages. Counters have been added to provider CM services and counters
have been added and mapped to sysfs ib_cm, device port and device
diag counters. ibdev_path is used for device sysfs counters.

uDAPL CM events are tracked on a per IA instance via internal
provider counters. The ib_cm, link, and diag events are tracked on a
per platform basis via sysfs. For these running counters a start
and stop function is provided for sampling and mapping to DAPL
64 bit counters. All counters, along with new start and stop functions,
are provided via dat_ib_extensions.h. New IB extension version is 2.0.7

New DCNT_IA_xx counters include 40 cm, 9 link, and 9 diag types.

To enable new counters (default build is disabled):
./configure --enable-counters

New bitmappings have been added to DAPL_DBG_TYPE environment
variable to automatically start/stop counters and log
errors if counters are enabled. The following will control
CM, LINK, and DIAG respectively:

   DAPL_DBG_TYPE_CM_ERRS = 0x080000,
   DAPL_DBG_TYPE_LINK_ERRS = 0x100000,
   DAPL_DBG_TYPE_DIAG_ERRS = 0x400000,

Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
Makefile.am
configure.in
dapl/common/dapl_debug.c
dapl/common/dapl_ia_open.c
dapl/common/dapl_ia_util.c
dapl/include/dapl_debug.h
dapl/openib_common/dapl_ib_common.h
dapl/openib_common/ib_extensions.c
dapl/udapl/linux/dapl_osd.h
dat/include/dat2/dat_ib_extensions.h

index a9bdedae2f62f68a0767013869550786d1f3f3a6..edff7f8018d18a58655940f33fd096e82d28db7b 100755 (executable)
@@ -20,6 +20,9 @@ XFLAGS = -DDAT_EXTENSIONS
 XPROGRAMS = dapl/openib_common/ib_extensions.c
 XHEADERS =
 XLIBS =
+if DEFINE_COUNTERS
+XFLAGS += -DDAPL_COUNTERS
+endif
 if COLL_TYPE_FCA
 XFLAGS += -DDAT_IB_COLLECTIVES -DDAT_FCA_PROVIDER
 XPROGRAMS += dapl/openib_common/collectives/fca_provider.c
index 71da96c3df5e2d85c93d49cff8d0cd731631d716..d577525f045bab7b3068f59007c45d210e00beba 100644 (file)
@@ -104,6 +104,17 @@ AC_ARG_ENABLE([ucm],
   [ucm=true])
 AM_CONDITIONAL(DEFINE_UCM, test x$ucm = xtrue)
 
+dnl Support to enable/disable IB extended counters (CM,LINK,DIAG)
+AC_ARG_ENABLE([counters], 
+  AS_HELP_STRING([--enable-counters],[enable counters provider build, default=disabled]),
+  [case "${enableval}" in
+    yes) counters=true ;;
+    no)  counters=false ;;
+    *) AC_MSG_ERROR(bad value ${enableval} for --enable-counters) ;; 
+  esac],
+  [counters=false])
+AM_CONDITIONAL(DEFINE_COUNTERS, test x$counters = xtrue)
+
 dnl Support ib_extension build - if enable-ext-type == ib 
 AC_ARG_ENABLE(ext-type,
 [  --enable-ext-type Enable extensions support for library: ib, none, default=ib],
index 7a0a199a4a861676b6e3c518c0b52cc97c18eafc..cb454964231c1474fb304fc0478a4674b143dc71 100644 (file)
@@ -74,6 +74,328 @@ void dapl_internal_dbg_log(DAPL_DBG_TYPE type, const char *fmt, ...)
 
 #ifdef DAPL_COUNTERS
 
+/*
+ * rd_ctr - read one numeric sysfs counter file into a 64-bit DAPL counter.
+ *
+ * dev:   device name (DCNT_IA_CM) or device sysfs path (DCNT_IA_LNK,
+ *        DCNT_IA_DIAG); used as shown in the path formats below
+ * file:  counter file name relative to the per-type directory
+ * port:  port number, used for the CM and LNK paths only
+ * type:  counter family, selects the sysfs location
+ * value: receives the parsed counter; set to 0 on any failure
+ *
+ * Returns 0 on success, -1 if the type is unknown, the path cannot be
+ * built, or the file cannot be opened or read.
+ */
+static int rd_ctr(const char *dev,
+                 const char *file,
+                 int port,
+                 DAT_IA_COUNTER_TYPE type,
+                 DAT_UINT64 *value)
+{
+       char *f_path;
+       int len, fd;
+       char vstr[21];  /* up to 20 digits of a 64-bit value + NUL */
+
+       *value = 0;
+
+       /*
+        * Format the port number directly into the path; a fixed 2-byte
+        * scratch buffer overflows as soon as port >= 10.
+        */
+       switch (type) {
+       case DCNT_IA_CM:
+               if (asprintf(&f_path, "/sys/class/infiniband_cm/%s/%d/%s", dev, port, file) < 0)
+                       return -1;
+               break;
+       case DCNT_IA_LNK:
+               if (asprintf(&f_path, "%s/ports/%d/counters/%s", dev, port, file) < 0)
+                       return -1;
+               break;
+       case DCNT_IA_DIAG:
+               if (asprintf(&f_path, "%s/diag_counters/%s", dev, file) < 0)
+                       return -1;
+               break;
+       default:
+               return -1;
+       }
+
+       fd = open(f_path, O_RDONLY);
+       free(f_path);
+       if (fd < 0)
+               return -1;
+
+       /* leave room for the terminator, read() does not add one */
+       len = read(fd, vstr, sizeof(vstr) - 1);
+       close(fd);
+       if (len <= 0)
+               return -1;
+
+       vstr[len] = '\0';
+
+       /* strtoull stops at the trailing newline and keeps all 64 bits */
+       *value = (DAT_UINT64)strtoull(vstr, NULL, 10);
+       return 0;
+}
+
+#ifdef _OPENIB_CMA_
+/*
+ * ib_cm sysfs counter files, relative to the per-port infiniband_cm
+ * directory, and the DAPL IA counter slot each one is sampled into.
+ * The duplicate counters are read from the cm_rx_duplicates group; the
+ * kernel ib_cm groups are cm_tx_msgs, cm_tx_retries, cm_rx_msgs and
+ * cm_rx_duplicates, so a "cm_tx_duplicates" path never resolves.
+ */
+static const struct {
+       const char *file;
+       int slot;
+} cm_ctr_map[] = {
+       { "cm_tx_msgs/req",             DCNT_IA_CM_REQ_TX },
+       { "cm_tx_msgs/rep",             DCNT_IA_CM_REP_TX },
+       { "cm_tx_msgs/rtu",             DCNT_IA_CM_RTU_TX },
+       { "cm_tx_msgs/rej",             DCNT_IA_CM_USER_REJ_TX },
+       { "cm_tx_msgs/mra",             DCNT_IA_CM_MRA_TX },
+       { "cm_tx_msgs/dreq",            DCNT_IA_CM_DREQ_TX },
+       { "cm_tx_msgs/drep",            DCNT_IA_CM_DREP_TX },
+
+       { "cm_rx_msgs/req",             DCNT_IA_CM_REQ_RX },
+       { "cm_rx_msgs/rep",             DCNT_IA_CM_REP_RX },
+       { "cm_rx_msgs/rtu",             DCNT_IA_CM_RTU_RX },
+       { "cm_rx_msgs/rej",             DCNT_IA_CM_USER_REJ_RX },
+       { "cm_rx_msgs/mra",             DCNT_IA_CM_MRA_RX },
+       { "cm_rx_msgs/dreq",            DCNT_IA_CM_DREQ_RX },
+       { "cm_rx_msgs/drep",            DCNT_IA_CM_DREP_RX },
+
+       { "cm_tx_retries/req",          DCNT_IA_CM_ERR_REQ_RETRY },
+       { "cm_tx_retries/rep",          DCNT_IA_CM_ERR_REP_RETRY },
+       { "cm_tx_retries/rtu",          DCNT_IA_CM_ERR_RTU_RETRY },
+       { "cm_tx_retries/mra",          DCNT_IA_CM_ERR_MRA_RETRY },
+       { "cm_tx_retries/dreq",         DCNT_IA_CM_ERR_DREQ_RETRY },
+       { "cm_tx_retries/drep",         DCNT_IA_CM_ERR_DREP_RETRY },
+
+       { "cm_rx_duplicates/req",       DCNT_IA_CM_ERR_REQ_DUP },
+       { "cm_rx_duplicates/rep",       DCNT_IA_CM_ERR_REP_DUP },
+       { "cm_rx_duplicates/rtu",       DCNT_IA_CM_ERR_RTU_DUP },
+       { "cm_rx_duplicates/mra",       DCNT_IA_CM_ERR_MRA_DUP },
+       { "cm_rx_duplicates/dreq",      DCNT_IA_CM_ERR_DREQ_DUP },
+       { "cm_rx_duplicates/drep",      DCNT_IA_CM_ERR_DREP_DUP },
+};
+
+/*
+ * Store the current ib_cm sysfs values in the IA counter array as the
+ * baseline of a sampling interval. A counter that cannot be read is
+ * stored as 0 (rd_ctr zeroes the value on failure).
+ */
+static void dapl_start_cm_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia = (DAPL_IA *)dh;
+       const char *dev = ibv_get_device_name(ia->hca_ptr->ib_trans.ib_dev);
+       int port = ia->hca_ptr->port_num;
+       DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(cm_ctr_map) / sizeof(cm_ctr_map[0]); n++)
+               rd_ctr(dev, cm_ctr_map[n].file, port, DCNT_IA_CM,
+                      &cntrs[cm_ctr_map[n].slot]);
+}
+
+/*
+ * Replace each stored baseline with the delta between the current ib_cm
+ * sysfs value and that baseline.
+ */
+static void dapl_stop_cm_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia = (DAPL_IA *)dh;
+       const char *dev = ibv_get_device_name(ia->hca_ptr->ib_trans.ib_dev);
+       int port = ia->hca_ptr->port_num;
+       DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(cm_ctr_map) / sizeof(cm_ctr_map[0]); n++) {
+               DAT_UINT64 val = 0;
+
+               rd_ctr(dev, cm_ctr_map[n].file, port, DCNT_IA_CM, &val);
+               cntrs[cm_ctr_map[n].slot] = val - cntrs[cm_ctr_map[n].slot];
+       }
+}
+#endif
+
+/*
+ * Map selected IB port counters to dapl counters.
+ *
+ * File names are relative to <ibdev_path>/ports/<port>/counters. The
+ * constraint counters are spelled port_rcv_constraint_errors and
+ * port_xmit_constraint_errors in sysfs; misspelled names cannot be opened
+ * and leave the counter at 0.
+ */
+static const struct {
+       const char *file;
+       int slot;
+} lnk_ctr_map[] = {
+       { "port_rcv_errors",                    DCNT_IA_LNK_ERR_RCV },
+       { "port_rcv_remote_physical_errors",    DCNT_IA_LNK_ERR_RCV_REM_PHYS },
+       { "port_rcv_constraint_errors",         DCNT_IA_LNK_ERR_RCV_CONSTRAINT },
+       { "port_xmit_discards",                 DCNT_IA_LNK_ERR_XMT_DISCARDS },
+       { "port_xmit_constraint_errors",        DCNT_IA_LNK_ERR_XMT_CONTRAINT },
+       { "local_link_integrity_errors",        DCNT_IA_LNK_ERR_INTEGRITY },
+       { "excessive_buffer_overrun_errors",    DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN },
+       { "port_xmit_wait",                     DCNT_IA_LNK_WARN_XMT_WAIT },
+       { "port_rcv_switch_relay_errors",       DCNT_IA_LNK_WARN_RCV_SW_RELAY },
+};
+
+/*
+ * Store the current port counter values in the IA counter array as the
+ * baseline of a sampling interval. A counter that cannot be read is
+ * stored as 0 (rd_ctr zeroes the value on failure).
+ */
+static void dapl_start_lnk_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia = (DAPL_IA *)dh;
+       char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+       int port = ia->hca_ptr->port_num;
+       DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(lnk_ctr_map) / sizeof(lnk_ctr_map[0]); n++)
+               rd_ctr(dev, lnk_ctr_map[n].file, port, DCNT_IA_LNK,
+                      &cntrs[lnk_ctr_map[n].slot]);
+}
+
+/*
+ * Replace each stored baseline with the delta between the current port
+ * counter value and that baseline.
+ */
+static void dapl_stop_lnk_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia = (DAPL_IA *)dh;
+       char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+       int port = ia->hca_ptr->port_num;
+       DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(lnk_ctr_map) / sizeof(lnk_ctr_map[0]); n++) {
+               DAT_UINT64 val = 0;
+
+               rd_ctr(dev, lnk_ctr_map[n].file, port, DCNT_IA_LNK, &val);
+               cntrs[lnk_ctr_map[n].slot] = val - cntrs[lnk_ctr_map[n].slot];
+       }
+}
+
+/*
+ * Map selected IB diag_counters to dapl counters.
+ *
+ * File names are relative to <ibdev_path>/diag_counters; the port number
+ * is passed to rd_ctr but is not part of the DCNT_IA_DIAG path.
+ */
+static const struct {
+       const char *file;
+       int slot;
+} diag_ctr_map[] = {
+       { "rq_num_rae",         DCNT_IA_DIAG_ERR_RQ_RAE },
+       { "rq_num_oos",         DCNT_IA_DIAG_ERR_RQ_OOS },
+       { "rq_num_rire",        DCNT_IA_DIAG_ERR_RQ_RIRE },
+       { "rq_num_udsdprd",     DCNT_IA_DIAG_ERR_RQ_UDSDPRD },
+       { "sq_num_rae",         DCNT_IA_DIAG_ERR_SQ_RAE },
+       { "sq_num_oos",         DCNT_IA_DIAG_ERR_SQ_OOS },
+       { "sq_num_rire",        DCNT_IA_DIAG_ERR_SQ_RIRE },
+       { "sq_num_rree",        DCNT_IA_DIAG_ERR_SQ_RREE },
+       { "sq_num_tree",        DCNT_IA_DIAG_ERR_SQ_TREE },
+};
+
+/* Store the current diag counter values as the sampling baseline. */
+static void dapl_start_diag_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia_ptr = (DAPL_IA *)dh;
+       char *path = ia_ptr->hca_ptr->ib_hca_handle->device->ibdev_path;
+       int pnum = ia_ptr->hca_ptr->port_num;
+       DAT_UINT64 *base = (DAT_UINT64 *)ia_ptr->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(diag_ctr_map) / sizeof(diag_ctr_map[0]); n++)
+               rd_ctr(path, diag_ctr_map[n].file, pnum, DCNT_IA_DIAG,
+                      &base[diag_ctr_map[n].slot]);
+}
+
+/* Replace each stored baseline with (current value - baseline). */
+static void dapl_stop_diag_cntrs(DAT_HANDLE dh)
+{
+       DAPL_IA *ia_ptr = (DAPL_IA *)dh;
+       char *path = ia_ptr->hca_ptr->ib_hca_handle->device->ibdev_path;
+       int pnum = ia_ptr->hca_ptr->port_num;
+       DAT_UINT64 *base = (DAT_UINT64 *)ia_ptr->cntrs;
+       size_t n;
+
+       for (n = 0; n < sizeof(diag_ctr_map) / sizeof(diag_ctr_map[0]); n++) {
+               DAT_UINT64 now = 0;
+
+               rd_ctr(path, diag_ctr_map[n].file, pnum, DCNT_IA_DIAG, &now);
+               base[diag_ctr_map[n].slot] = now - base[diag_ctr_map[n].slot];
+       }
+}
+
+/*
+ * Begin sampling one family of running sysfs counters for an IA handle.
+ * Unknown types are ignored; DCNT_IA_CM does nothing unless the build
+ * defines _OPENIB_CMA_.
+ */
+void dapl_start_counters(DAT_HANDLE dh, DAT_IA_COUNTER_TYPE type)
+{
+       if (type == DCNT_IA_LNK) {
+               dapl_start_lnk_cntrs(dh);
+               return;
+       }
+       if (type == DCNT_IA_DIAG) {
+               dapl_start_diag_cntrs(dh);
+               return;
+       }
+#ifdef _OPENIB_CMA_
+       if (type == DCNT_IA_CM)
+               dapl_start_cm_cntrs(dh); /* ib cm counters, cma only */
+#endif
+}
+
+/*
+ * Finish sampling one family of running sysfs counters for an IA handle.
+ * Unknown types are ignored; DCNT_IA_CM does nothing unless the build
+ * defines _OPENIB_CMA_.
+ */
+void dapl_stop_counters(DAT_HANDLE dh, DAT_IA_COUNTER_TYPE type)
+{
+       if (type == DCNT_IA_LNK) {
+               dapl_stop_lnk_cntrs(dh);
+               return;
+       }
+       if (type == DCNT_IA_DIAG) {
+               dapl_stop_diag_cntrs(dh);
+               return;
+       }
+#ifdef _OPENIB_CMA_
+       if (type == DCNT_IA_CM)
+               dapl_stop_cm_cntrs(dh);
+#endif
+}
+
+/*
+ * Start sampling every counter family enabled in g_dapl_dbg_type.
+ *
+ * Link counters are sampled when either DAPL_DBG_TYPE_LINK_ERRS or
+ * DAPL_DBG_TYPE_LINK_WARN is set: both the _LNK_ERR and the _LNK_WARN
+ * counters come from the same link sample, so LINK_WARN on its own must
+ * also trigger it or its counters stay at zero and are never printed.
+ */
+void dapli_start_counters(DAT_HANDLE dh)
+{
+#ifdef _OPENIB_CMA_
+       if (g_dapl_dbg_type & (DAPL_DBG_TYPE_CM_ERRS | DAPL_DBG_TYPE_CM_STATS))
+               dapl_start_cm_cntrs(dh);
+#endif
+       if (g_dapl_dbg_type & (DAPL_DBG_TYPE_LINK_ERRS | DAPL_DBG_TYPE_LINK_WARN))
+               dapl_start_lnk_cntrs(dh);
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+               dapl_start_diag_cntrs(dh);
+}
+
+/*
+ * Stop sampling every counter family enabled in g_dapl_dbg_type, then
+ * print and reset the non-zero counters whose names match the enabled
+ * families. The sampling conditions mirror dapli_start_counters.
+ */
+void dapli_stop_counters(DAT_HANDLE dh)
+{
+#ifdef _OPENIB_CMA_
+       if (g_dapl_dbg_type & (DAPL_DBG_TYPE_CM_ERRS | DAPL_DBG_TYPE_CM_STATS))
+               dapl_stop_cm_cntrs(dh);
+#endif
+       if (g_dapl_dbg_type & (DAPL_DBG_TYPE_LINK_ERRS | DAPL_DBG_TYPE_LINK_WARN))
+               dapl_stop_lnk_cntrs(dh);
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+               dapl_stop_diag_cntrs(dh);
+
+       /* CM_STATS prints every CM counter, CM_ERRS only the CM error ones */
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_CM_STATS)
+               dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_CM");
+       else if (g_dapl_dbg_type & DAPL_DBG_TYPE_CM_ERRS)
+               dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_CM_ERR");
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_LINK_ERRS)
+               dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_LNK_ERR");
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_LINK_WARN)
+               dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_LNK_WARN");
+       if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+               dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_DIAG_ERR");
+}
+
 /*
  * The order of this list must match the DAT counter definitions 
  */
@@ -103,9 +425,69 @@ static char *ia_cntr_names[] = {
        "DCNT_IA_MEM_FREE",
        "DCNT_IA_ASYNC_ERROR",
        "DCNT_IA_ASYNC_QP_ERROR",
-       "DCNT_IA_ASYNC_CQ_ERROR"
+       "DCNT_IA_ASYNC_CQ_ERROR",
+       "DCNT_IA_CM_LISTEN",
+       "DCNT_IA_CM_REQ_TX",
+       "DCNT_IA_CM_REQ_RX",
+       "DCNT_IA_CM_REP_TX",
+       "DCNT_IA_CM_REP_RX",
+       "DCNT_IA_CM_RTU_TX",
+       "DCNT_IA_CM_RTU_RX",
+       "DCNT_IA_CM_USER_REJ_TX",
+       "DCNT_IA_CM_USER_REJ_RX",
+       "DCNT_IA_CM_ACTIVE_EST",
+       "DCNT_IA_CM_PASSIVE_EST",
+       "DCNT_IA_CM_AH_REQ_TX",
+       "DCNT_IA_CM_AH_REQ_RX",
+       "DCNT_IA_CM_AH_RESOLVED",
+       "DCNT_IA_CM_DREQ_TX",
+       "DCNT_IA_CM_DREQ_RX",
+       "DCNT_IA_CM_DREP_TX",
+       "DCNT_IA_CM_DREP_RX",
+       "DCNT_IA_CM_MRA_TX",
+       "DCNT_IA_CM_MRA_RX",
+       "DCNT_IA_CM_REQ_FULLQ_POLL",
+       "DCNT_IA_CM_ERR",
+       "DCNT_IA_CM_ERR_REQ_FULLQ",
+       "DCNT_IA_CM_ERR_REQ_DUP",
+       "DCNT_IA_CM_ERR_REQ_RETRY",
+       "DCNT_IA_CM_ERR_REP_DUP",
+       "DCNT_IA_CM_ERR_REP_RETRY",
+       "DCNT_IA_CM_ERR_RTU_DUP",
+       "DCNT_IA_CM_ERR_RTU_RETRY",
+       "DCNT_IA_CM_ERR_REFUSED",
+       "DCNT_IA_CM_ERR_RESET",
+       "DCNT_IA_CM_ERR_TIMEOUT",
+       "DCNT_IA_CM_ERR_REJ_TX",
+       "DCNT_IA_CM_ERR_REJ_RX",
+       "DCNT_IA_CM_ERR_DREQ_DUP",
+       "DCNT_IA_CM_ERR_DREQ_RETRY",
+       "DCNT_IA_CM_ERR_DREP_DUP",
+       "DCNT_IA_CM_ERR_DREP_RETRY",
+       "DCNT_IA_CM_ERR_MRA_DUP",
+       "DCNT_IA_CM_ERR_MRA_RETRY",
+       "DCNT_IA_CM_ERR_UNEXPECTED",
+       "DCNT_IA_LNK_ERR_RCV",
+       "DCNT_IA_LNK_ERR_RCV_REM_PHYS",
+       "DCNT_IA_LNK_ERR_RCV_CONSTRAINT",
+       "DCNT_IA_LNK_ERR_XMT_DISCARDS",
+       "DCNT_IA_LNK_ERR_XMT_CONTRAINT",
+       "DCNT_IA_LNK_ERR_INTEGRITY",
+       "DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN",
+       "DCNT_IA_LNK_WARN_RCV_SW_RELAY",
+       "DCNT_IA_LNK_WARN_XMT_WAIT",
+       "DCNT_IA_DIAG_ERR_RQ_RAE",
+       "DCNT_IA_DIAG_ERR_RQ_OOS",
+       "DCNT_IA_DIAG_ERR_RQ_RIRE",
+       "DCNT_IA_DIAG_ERR_RQ_UDSDPRD",
+       "DCNT_IA_DIAG_ERR_SQ_RAE",
+       "DCNT_IA_DIAG_ERR_SQ_OOS",
+       "DCNT_IA_DIAG_ERR_SQ_RIRE",
+       "DCNT_IA_DIAG_ERR_SQ_RREE",
+       "DCNT_IA_DIAG_ERR_SQ_TREE",
 };
 
+
 static char *ep_cntr_names[] = {
        "DCNT_EP_CONNECT",
        "DCNT_EP_DISCONNECT",
@@ -234,8 +616,9 @@ void dapl_print_counter(DAT_HANDLE dh, int counter, int reset)
 
        for (i = 0; i < max; i++) {
                if ((counter == i) || (counter == max)) {
-                       printf(" %s = " F64u " \n",
-                              dapl_query_counter_name(dh, i), p_cntrs[i]);
+                       printf(" %s:0x%x: %s = " F64u " \n",
+                               _hostname_, dapl_os_getpid(),
+                               dapl_query_counter_name(dh, i), p_cntrs[i]);
                        if (reset)
                                p_cntrs[i] = 0;
                }
@@ -246,7 +629,47 @@ void dapl_print_counter(DAT_HANDLE dh, int counter, int reset)
            (g_dapl_dbg_type & DAPL_DBG_TYPE_CM_LIST)) {
                dapls_print_cm_list((DAPL_IA*)dh);
        }
-       return;
+}
+
+/*
+ * Print the non-zero counters of an IA, EP or EVD handle whose counter
+ * name matches the pattern string (per dapl_os_pstrcmp).
+ *
+ * dh:      IA, EP or EVD handle; any other handle type prints nothing
+ * counter: index of a single counter, or the handle type's
+ *          *_ALL_COUNTERS value to consider every counter
+ * reset:   when non-zero, each printed counter is set back to 0
+ * pattern: string the counter name is matched against
+ */
+void dapl_print_counter_str(DAT_HANDLE dh, int counter, int reset, const char *pattern)
+{
+       DAT_HANDLE_TYPE htype = 0;
+       DAT_UINT64 *vals;
+       int idx, limit;
+
+       dat_get_handle_type(dh, &htype);
+
+       if (htype == DAT_HANDLE_TYPE_IA) {
+               limit = DCNT_IA_ALL_COUNTERS;
+               vals = ((DAPL_IA *) dh)->cntrs;
+       } else if (htype == DAT_HANDLE_TYPE_EP) {
+               limit = DCNT_EP_ALL_COUNTERS;
+               vals = ((DAPL_EP *) dh)->cntrs;
+       } else if (htype == DAT_HANDLE_TYPE_EVD) {
+               limit = DCNT_EVD_ALL_COUNTERS;
+               vals = ((DAPL_EVD *) dh)->cntrs;
+       } else {
+               return;
+       }
+
+       for (idx = 0; idx < limit; idx++) {
+               char *name;
+
+               /* skip counters that were not selected */
+               if (counter != idx && counter != limit)
+                       continue;
+               /* skip counters that are still zero */
+               if (!vals[idx])
+                       continue;
+               /* skip names the pattern does not match */
+               name = dapl_query_counter_name(dh, idx);
+               if (dapl_os_pstrcmp(pattern, name))
+                       continue;
+
+               printf(" %s:0x%x: %s = " F64u " \n",
+                       _hostname_, dapl_os_getpid(),
+                       name, vals[idx]);
+               if (reset)
+                       vals[idx] = 0;
+       }
+}
 
 #endif                         /* DAPL_COUNTERS */
index edead04d54c2f4bb952ef9ff9ec6d8487c61cc0c..e43d78d3e2b5fb226b8ee76c3f972d44311ad53d 100644 (file)
@@ -266,6 +266,10 @@ dapl_ia_open(IN const DAT_NAME_PTR name,
        *ia_handle_ptr = ia_ptr;
        *async_evd_handle_ptr = evd_ptr;
 
+#if DAPL_COUNTERS
+       dapli_start_counters((DAT_HANDLE)ia_ptr);
+#endif
+
       bail:
        if (dat_status != DAT_SUCCESS) {
                if (ia_ptr) {
index 2208c23b26bc630cd5b9c63495ef52e7bae04d8b..6d1b5a83fe9d9e11645d27f41c83afc14bb3fc67 100755 (executable)
@@ -525,6 +525,13 @@ void dapli_ia_release_hca(DAPL_HCA * hca_ptr)
        dapl_os_lock(&hca_ptr->lock);
        dapl_os_atomic_dec(&hca_ptr->handle_ref_count);
        if (dapl_os_atomic_read(&hca_ptr->handle_ref_count) == 0) {
+#ifdef DAPL_COUNTERS
+{
+               DAPL_IA *ia = (DAPL_IA *)dapl_llist_peek_head(&hca_ptr->ia_list_head);
+               dapli_stop_counters(ia);
+               dapl_os_free(ia->cntrs, sizeof(DAT_UINT64) * DCNT_IA_ALL_COUNTERS);
+}
+#endif
                dapls_ib_close_hca(hca_ptr);
                hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
                hca_ptr->async_evd = NULL;
@@ -566,11 +573,6 @@ void dapls_ia_free(DAPL_IA * ia_ptr)
        dapl_hca_unlink_ia(ia_ptr->hca_ptr, ia_ptr);
        ia_ptr->header.magic = DAPL_MAGIC_INVALID;      /* reset magic to prevent reuse */
        dapl_os_lock_destroy(&ia_ptr->header.lock);
-
-#ifdef DAPL_COUNTERS
-       dapl_os_free(ia_ptr->cntrs, sizeof(DAT_UINT64) * DCNT_IA_ALL_COUNTERS);
-#endif                         /* DAPL_COUNTERS */
-
        dapl_os_free(ia_ptr, sizeof(DAPL_IA));
 }
 
index bb11c3ddf40073ab450368ab3cd077162d246947..6cbe0284df7895de2d86b9ef7f49b0aa59d36fa1 100644 (file)
@@ -71,7 +71,11 @@ typedef enum
     DAPL_DBG_TYPE_CM_EST       = 0x8000,
     DAPL_DBG_TYPE_CM_WARN      = 0x10000,
     DAPL_DBG_TYPE_EXTENSION    = 0x20000,
-    DAPL_DBG_TYPE_CM_STATS     = 0x40000
+    DAPL_DBG_TYPE_CM_STATS     = 0x40000,
+    DAPL_DBG_TYPE_CM_ERRS      = 0x80000,
+    DAPL_DBG_TYPE_LINK_ERRS    = 0x100000,
+    DAPL_DBG_TYPE_LINK_WARN    = 0x200000,
+    DAPL_DBG_TYPE_DIAG_ERRS    = 0x400000,
 
 } DAPL_DBG_TYPE;
 
@@ -100,6 +104,7 @@ extern void dapl_internal_dbg_log(DAPL_DBG_TYPE type,  const char *fmt,  ...);
 
 #define DAPL_CNTR(h_ptr, cntr) ((DAT_UINT64*)h_ptr->cntrs)[cntr]++
 #define DAPL_CNTR_DATA(h_ptr, cntr, data) ((DAT_UINT64*)h_ptr->cntrs)[cntr]+= data
+#define DAPL_CNTR_RESET(h_ptr, cntr) ((DAT_UINT64*)h_ptr->cntrs)[cntr] = 0
 
 DAT_RETURN dapl_query_counter(DAT_HANDLE dh, 
                              int counter, 
@@ -107,11 +112,17 @@ DAT_RETURN dapl_query_counter(DAT_HANDLE dh,
                              int reset);
 char *dapl_query_counter_name(DAT_HANDLE dh, int counter);
 void dapl_print_counter(DAT_HANDLE dh, int counter, int reset);
+void dapl_print_counter_str(DAT_HANDLE dh, int counter, int reset, const char *pattern);
+void dapl_start_counters(DAT_HANDLE ia, DAT_IA_COUNTER_TYPE type);
+void dapl_stop_counters(DAT_HANDLE ia, DAT_IA_COUNTER_TYPE type);
+void dapli_start_counters(DAT_HANDLE ia);
+void dapli_stop_counters(DAT_HANDLE ia);
 
 #else
 
 #define DAPL_CNTR(handle, cntr)
 #define DAPL_CNTR_DATA(handle, cntr, data)
+#define DAPL_CNTR_RESET(handle, cntr)
 
 #endif /* DAPL_COUNTERS */
 
index e757b650bb7a897a3a1bef4d72115e62539eb627..ba805d0b4bd2e4c65de50fad97017d259c4f76ea 100644 (file)
@@ -342,7 +342,7 @@ dapl_convert_errno( IN int err, IN const char *str )
     if (!err)  return DAT_SUCCESS;
        
     if ((err != EAGAIN) && (err != ETIMEDOUT))
-       dapl_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err));
+       dapl_log (DAPL_DBG_TYPE_ERR," DAPL ERR %s %s\n", str, strerror(err));
 
     switch( err )
     {
index c85323c6d1c56216d2f5916d591d89c9e423fb94..0952bd52339e5e5c4ce347b0b0b8490ea8082177 100644 (file)
@@ -184,6 +184,32 @@ dapl_extensions(IN DAT_HANDLE dat_handle,
                        status = DAT_SUCCESS;
                        break;
                }
+       case DAT_IB_START_COUNTERS_OP:
+               {
+                       DAT_IA_COUNTER_TYPE type;
+
+                       dapl_dbg_log(DAPL_DBG_TYPE_RTN,
+                                    " Start counter extension call\n");
+
+                       type = va_arg(args, int);
+
+                       dapl_start_counters(dat_handle, type);
+                       status = DAT_SUCCESS;
+                       break;
+               }
+       case DAT_IB_STOP_COUNTERS_OP:
+               {
+                       DAT_IA_COUNTER_TYPE type;
+
+                       dapl_dbg_log(DAPL_DBG_TYPE_RTN,
+                                    " Start counter extension call\n");
+
+                       type = va_arg(args, int);
+
+                       dapl_stop_counters(dat_handle, type);
+                       status = DAT_SUCCESS;
+                       break;
+               }
 #endif                         /* DAPL_COUNTERS */
 #ifdef DAT_IB_COLLECTIVES
        case DAT_IB_COLLECTIVE_CREATE_MEMBER_OP:
index cb61cae6101008aba027fa7beb95e8b64786cf29..71984394f1172bcbe1b5baa2c0694643fe13298a 100644 (file)
@@ -515,6 +515,22 @@ STATIC _INLINE_ char * dapl_os_strdup(const char *str)
     return strdup(str);
 }
 
+/*
+ * dapl_os_pstrcmp - test whether pstr occurs anywhere inside str.
+ *
+ * Returns 0 when str contains pstr as a substring, 1 otherwise.
+ * An empty pattern never matches.
+ */
+STATIC _INLINE_ int dapl_os_pstrcmp(const char *pstr, const char *str)
+{
+       /* an empty pattern has always been reported as "no match" */
+       if (pstr[0] == '\0')
+               return 1;
+
+       /*
+        * strstr retries from every start position. A hand-rolled scan that
+        * advances the string index inside the compare loop skips characters
+        * after a partial match and misses e.g. "aab" inside "aaab".
+        */
+       return strstr(str, pstr) ? 0 : 1;
+}
 
 /*
  * Timer Functions
index ac69fedec3f55015b7b6da855a4cb09b9fd0e4cf..6e3cb9ee91d313b473fa3204d1596a07936b2694 100755 (executable)
  * 2.0.4 - Add DAT_IB_UD_CONNECTION_REJECT_EVENT extended UD event
  * 2.0.5 - Add DAT_IB_UD extended UD connection error events
  * 2.0.6 - Add MPI over IB collective extensions
+ * 2.0.7 - Add new IA counters for dapl CM, device LINK, device DIAG
  *
  */
-#define DAT_IB_EXTENSION_VERSION       206     /* 2.0.6 */
+#define DAT_IB_EXTENSION_VERSION       207     /* 2.0.7 */
 #define DAT_IB_ATTR_COUNTERS           "DAT_COUNTERS"
 #define DAT_IB_ATTR_FETCH_AND_ADD      "DAT_IB_FETCH_AND_ADD"
 #define DAT_IB_ATTR_CMP_AND_SWAP       "DAT_IB_CMP_AND_SWAP"
@@ -151,6 +152,8 @@ typedef enum dat_ib_op
        DAT_IB_COLLECTIVE_SCAN_OP,
        DAT_IB_COLLECTIVE_BROADCAST_OP,
        DAT_IB_COLLECTIVE_BARRIER_OP,
+       DAT_IB_START_COUNTERS_OP,
+       DAT_IB_STOP_COUNTERS_OP,
        
 } DAT_IB_OP;
 
@@ -369,6 +372,65 @@ typedef enum dat_ia_counters
        DCNT_IA_ASYNC_ERROR,
        DCNT_IA_ASYNC_QP_ERROR,
        DCNT_IA_ASYNC_CQ_ERROR,
+       DCNT_IA_CM_LISTEN,
+       DCNT_IA_CM_REQ_TX,
+       DCNT_IA_CM_REQ_RX,
+       DCNT_IA_CM_REP_TX,
+       DCNT_IA_CM_REP_RX,
+       DCNT_IA_CM_RTU_TX,
+       DCNT_IA_CM_RTU_RX,
+       DCNT_IA_CM_USER_REJ_TX,
+       DCNT_IA_CM_USER_REJ_RX,
+       DCNT_IA_CM_ACTIVE_EST,
+       DCNT_IA_CM_PASSIVE_EST,
+       DCNT_IA_CM_AH_REQ_TX,
+       DCNT_IA_CM_AH_REQ_RX,
+       DCNT_IA_CM_AH_RESOLVED,
+       DCNT_IA_CM_DREQ_TX,
+       DCNT_IA_CM_DREQ_RX,
+       DCNT_IA_CM_DREP_TX,
+       DCNT_IA_CM_DREP_RX,
+       DCNT_IA_CM_MRA_TX,
+       DCNT_IA_CM_MRA_RX,
+       DCNT_IA_CM_REQ_FULLQ_POLL,
+       DCNT_IA_CM_ERR,
+       DCNT_IA_CM_ERR_REQ_FULLQ,
+       DCNT_IA_CM_ERR_REQ_DUP,
+       DCNT_IA_CM_ERR_REQ_RETRY,
+       DCNT_IA_CM_ERR_REP_DUP,
+       DCNT_IA_CM_ERR_REP_RETRY,
+       DCNT_IA_CM_ERR_RTU_DUP,
+       DCNT_IA_CM_ERR_RTU_RETRY,
+       DCNT_IA_CM_ERR_REFUSED,
+       DCNT_IA_CM_ERR_RESET,
+       DCNT_IA_CM_ERR_TIMEOUT,
+       DCNT_IA_CM_ERR_REJ_TX,
+       DCNT_IA_CM_ERR_REJ_RX,
+       DCNT_IA_CM_ERR_DREQ_DUP,
+       DCNT_IA_CM_ERR_DREQ_RETRY,
+       DCNT_IA_CM_ERR_DREP_DUP,
+       DCNT_IA_CM_ERR_DREP_RETRY,
+       DCNT_IA_CM_ERR_MRA_DUP,
+       DCNT_IA_CM_ERR_MRA_RETRY,
+       DCNT_IA_CM_ERR_UNEXPECTED,
+       DCNT_IA_LNK_ERR_RCV,
+       DCNT_IA_LNK_ERR_RCV_REM_PHYS,
+       DCNT_IA_LNK_ERR_RCV_CONSTRAINT,
+       DCNT_IA_LNK_ERR_XMT_DISCARDS,
+       DCNT_IA_LNK_ERR_XMT_CONTRAINT,
+       DCNT_IA_LNK_ERR_INTEGRITY,
+       DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN,
+       DCNT_IA_LNK_WARN_RCV_SW_RELAY,
+       DCNT_IA_LNK_WARN_XMT_WAIT,
+       DCNT_IA_DIAG_ERR_RQ_RAE,
+       DCNT_IA_DIAG_ERR_RQ_OOS,
+       DCNT_IA_DIAG_ERR_RQ_RIRE,
+       DCNT_IA_DIAG_ERR_RQ_UDSDPRD,
+       DCNT_IA_DIAG_ERR_SQ_RAE,
+       DCNT_IA_DIAG_ERR_SQ_OOS,
+       DCNT_IA_DIAG_ERR_SQ_RIRE,
+       DCNT_IA_DIAG_ERR_SQ_RREE,
+       DCNT_IA_DIAG_ERR_SQ_TREE,
        DCNT_IA_ALL_COUNTERS,  /* MUST be last */
 
 } DAT_IA_COUNTERS;
@@ -425,6 +487,19 @@ typedef enum dat_evd_counters
 
 } DAT_EVD_COUNTERS;
 
+/*
+ * Definitions IA Counter Types
+ *     for sampling running counters
+ *
+ * Passed to the start/stop counter extension operations to select
+ * which family of running counters is sampled.
+ */
+typedef enum dat_ia_counter_type
+{
+       DCNT_IA_CM,     /* ib_cm counters under /sys/class/infiniband_cm */
+       DCNT_IA_LNK,    /* port counters under <ibdev_path>/ports/<port>/counters */
+       DCNT_IA_DIAG,   /* device counters under <ibdev_path>/diag_counters */
+
+} DAT_IA_COUNTER_TYPE;
+
 /*
  * Data type for reduce operations
  */
@@ -654,6 +729,24 @@ dat_strerror_ext_status (
                IN (int) (cntr), \
                IN (int) (reset))
 
+/*
+ * Start and stop counter(s):
+ * Provide IA, call will start sampling running IB counters
+ *     DAT_HANDLE dat_handle, counter type (cm, link, diag)
+ *
+ */
+/* type is a DAT_IA_COUNTER_TYPE: DCNT_IA_CM, DCNT_IA_LNK or DCNT_IA_DIAG */
+#define dat_ib_start_counter(dat_handle, type) \
+       dat_extension_op(\
+               IN (DAT_HANDLE) dat_handle, \
+               IN (DAT_IB_OP) DAT_IB_START_COUNTERS_OP, \
+               IN (DAT_IA_COUNTER_TYPE) (type))
+
+/* type is a DAT_IA_COUNTER_TYPE: DCNT_IA_CM, DCNT_IA_LNK or DCNT_IA_DIAG */
+#define dat_ib_stop_counter(dat_handle, type) \
+       dat_extension_op(\
+               IN (DAT_HANDLE) dat_handle, \
+               IN (DAT_IB_OP) DAT_IB_STOP_COUNTERS_OP, \
+               IN (DAT_IA_COUNTER_TYPE) (type))
+
 /*
  ************************ MPI IB Collective Functions ***********************
  */