From 49889a544fa254091dcd4acba233d1fc41376c14 Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Fri, 21 Jun 2013 13:43:17 -0700 Subject: [PATCH] mpxyd: add dynamic affinity support Query the mic sysfs files numa_node and local_cpulist to obtain proper thread bindings - host to device. Signed-off-by: Arlin Davis --- dapl/svc/mpxyd.c | 115 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 31 deletions(-) diff --git a/dapl/svc/mpxyd.c b/dapl/svc/mpxyd.c index be2a910..1c1b1cd 100644 --- a/dapl/svc/mpxyd.c +++ b/dapl/svc/mpxyd.c @@ -135,13 +135,12 @@ typedef struct mcm_ib_dev { LLIST_ENTRY smd_list; /* MIC client open instances */ pthread_mutex_t slock; /* SCIF client device lock */ pthread_mutex_t plock; /* port space lock */ - pthread_mutex_t txlock; /* MCM UD tx lock */ + pthread_mutex_t txlock; /* MCM UD CM tx lock */ /* MCM - IB Device Resources */ struct ibv_device *ibdev; struct ibv_context *ibctx; struct mcm_client *mc; /* parent MIC client */ int ref_cnt; - char name[IBV_SYSFS_NAME_MAX]; uint16_t port; /* IB device port */ struct ibv_pd *pd; struct ibv_cq *scq; @@ -166,6 +165,7 @@ typedef struct mcm_ib_dev { int cm_timer; int rep_time; int rtu_time; + int numa_node; void *cntrs; } mcm_ib_dev_t; @@ -316,6 +316,8 @@ typedef struct mcm_scif_dev { #define MCM_CLIENT_MAX 16 typedef struct mcm_client { uint16_t scif_id; + int numa_node; + int cpu_mask; int op_pipe[2]; int tx_pipe[2]; int cm_pipe[2]; @@ -928,6 +930,30 @@ static void mpxyd_release_lock_file( void ) unlink(lock_file); } +static int rd_dev_file(char *path, char *file, char *v_str, int len) +{ + char *f_path; + int fd; + + if (asprintf(&f_path, "%s/%s", path, file) < 0) + return -1; + + fd = open(f_path, O_RDONLY); + if (fd < 0) { + free(f_path); + return -1; + } + + len = read(fd, v_str, len); + + if ((len > 0) && (v_str[--len] == '\n')) + v_str[len] = '\0'; + + close(fd); + free(f_path); + return 0; +} + static inline int scif_send_msg(scif_epd_t ep, 
void *msg, int len) { int ret; @@ -1343,7 +1369,7 @@ static void mcm_dqlisten(mcm_scif_dev_t *smd, mcm_cm_t *cm) /* * Open IB device */ -static struct ibv_context *open_ib_device(char *name, int port) +static struct ibv_context *open_ib_device(struct mcm_ib_dev *md, char *name, int port) { int i, ibcnt; struct ibv_device **iblist; @@ -1368,8 +1394,10 @@ static struct ibv_context *open_ib_device(char *name, int port) mlog(0,"ERR ibv_query, %s\n", strerror(errno)); goto bail; } - else + else { + md->ibdev = iblist[i]; break; + } } else { continue; @@ -1428,7 +1456,6 @@ static void mcm_destroy_md(struct mcm_ib_dev *md) } md->port = 0; - memset((void *)&md->name[0], 0, IBV_SYSFS_NAME_MAX); return; } @@ -1808,7 +1835,21 @@ static mcm_scif_dev_t *mix_open_device(char *name, int port, scif_epd_t op_ep, s /* New MIC node, start up OP and TX threads per node */ if (!mc->scif_id) { + char value[64]; + char path[64]; + mc->scif_id = node; + sprintf(path, "/sys/class/mic/mic%d/device", mc->scif_id - 1); + + if (!rd_dev_file(path, "numa_node", value, sizeof value)) + mc->numa_node = atoi(value); + + if (!rd_dev_file(path, "local_cpulist", value, sizeof value)) + mc->cpu_mask = atoi(value); + + mlog(0, " New MIC device - %s, numa_node %d, cpu %d - %s\n", + path, mc->numa_node, mc->cpu_mask, value); + if (pthread_create(&mc->op_thread, NULL, (void *(*)(void *))mpxy_op_thread, (void*)mc)) { mlog(0, " op pthread_create ERR: %s\n", strerror(errno)); @@ -1831,10 +1872,12 @@ static mcm_scif_dev_t *mix_open_device(char *name, int port, scif_epd_t op_ep, s for (i=0; imdev[i]; - if (!strcmp(md->name, name) && md->port == port) + if (md->ibdev && !strcmp(md->ibdev->name, name) && md->port == port) goto found; - else if (md->ibctx == NULL && new_md == NULL) + else if (md->ibctx == NULL && new_md == NULL) { new_md = md; + break; + } } if (!new_md) @@ -1856,10 +1899,9 @@ static mcm_scif_dev_t *mix_open_device(char *name, int port, scif_epd_t op_ep, s goto err; } memset(md->cntrs, 0, 
sizeof(uint64_t) * MCM_ALL_COUNTERS); - strcpy(md->name, name); md->mc = mc; md->port = port; - md->ibctx = open_ib_device(name, port); + md->ibctx = open_ib_device(md, name, port); if ((!md->ibctx) || init_cm_service(md)) { mcm_destroy_md(md); @@ -4188,7 +4230,7 @@ static void mcm_process_recv(mcm_ib_dev_t *md, dat_mcm_msg_t *msg, mcm_cm_t *cm, " <- op %s, %s spsp %x sqpn %x slid %x\n", mcm_op_str(ntohs(msg->op)), mcm_state_str(cm->state), ntohs(msg->sport), ntohl(msg->sqpn), ntohs(msg->saddr.lid)); - MCNTR(cm->md, MCM_CM_ERR_UNEXPECTED_STATE); + MCNTR(md, MCM_CM_ERR_UNEXPECTED_STATE); pthread_mutex_unlock(&cm->lock); break; } @@ -4372,7 +4414,7 @@ mcm_cm_t *mcm_get_cm(mcm_ib_dev_t *md, dat_mcm_msg_t *msg) } if (ntohs(msg->op) == MCM_DREP) { - MCNTR(cm->md, MCM_CM_ERR_DREP_DUP); + MCNTR(md, MCM_CM_ERR_DREP_DUP); } #ifdef MCM_DEBUG pthread_mutex_lock(&md->slock); @@ -4708,18 +4750,22 @@ void mpxy_tx_thread(void *mic_client) struct mcm_scif_dev *smd; struct mcm_cq *m_cq; struct pollfd set; - int i, data, events; + int i, data, events, cpu_id; char rbuf[2]; if (mcm_affinity) { - CPU_ZERO( &mc->tx_mask ); - CPU_SET( mcm_affinity_base_hca + mc->scif_id, &mc->tx_mask ); - if(sched_setaffinity( 0, sizeof(mc->tx_mask), &mc->tx_mask) == -1) + CPU_ZERO( &mc->op_mask ); + if (mcm_affinity_base_mic) /* static config settings */ + cpu_id = mcm_affinity_base_mic + mc->scif_id; + else + cpu_id = mc->cpu_mask + mc->scif_id; + + CPU_SET(cpu_id, &mc->op_mask ); + if(sched_setaffinity( 0, sizeof(mc->op_mask), &mc->op_mask) == -1) mlog(0, "WARNING: could not set CPU Affinity (%s), continuing...\n", strerror(errno)); } - mlog(0, "TX thread (%x) started for MIC %p node_id %d, CPU_affinity(%s)=%d\n", - pthread_self(), mc, mc->scif_id, mcm_affinity ? "SET":"UNSET", - mcm_affinity ? (mcm_affinity_base_hca + mc->scif_id):0 ); + mlog(0, "TX thread (%x) MIC node_id %d bound to numa_node %d and cpu_id=%d\n", + pthread_self(), mc->scif_id, mc->numa_node, mcm_affinity ? 
cpu_id:0 ); while (!finished) { pthread_mutex_lock(&mc->txlock); @@ -4776,17 +4822,21 @@ void mpxy_op_thread(void *mic_client) struct mcm_ib_dev *md; struct mcm_scif_dev *smd, *next; char rbuf[2]; - int i, ret, time_ms; + int i, ret, time_ms, cpu_id; if (mcm_affinity) { CPU_ZERO( &mc->op_mask ); - CPU_SET( (mcm_affinity_base_mic + mc->scif_id + 1), &mc->op_mask ); + if (mcm_affinity_base_mic) /* static config settings */ + cpu_id = mcm_affinity_base_mic + mc->scif_id + 1; + else + cpu_id = mc->cpu_mask + mc->scif_id + 1; + + CPU_SET(cpu_id, &mc->op_mask ); if(sched_setaffinity( 0, sizeof(mc->op_mask), &mc->op_mask) == -1) mlog(0, "WARNING: could not set CPU Affinity (%s), continuing...\n", strerror(errno)); } - mlog(0, "OP/CM thread (%x) started for MIC %p node_id %d, CPU_affinity(%s)=%d\n", - pthread_self(), mc, mc->scif_id, mcm_affinity ? "SET":"UNSET", - mcm_affinity ? (mcm_affinity_base_mic + mc->scif_id + 1):0 ); + mlog(0, "OP thread (%x) MIC node_id %d bound to numa_node %d and cpu_id=%d\n", + pthread_self(), mc->scif_id, mc->numa_node, mcm_affinity ? 
cpu_id:0 ); /* FD array */ set = mcm_alloc_fd_set(); @@ -4805,7 +4855,6 @@ void mpxy_op_thread(void *mic_client) if (md->ibctx == NULL) continue; - mcm_fd_set(md->ibctx->async_fd, set, POLLIN); mcm_fd_set(md->rch->fd, set, POLLIN); /* all active SCIF MIC clients, OP channels */ @@ -4868,18 +4917,22 @@ void mpxy_cm_thread(void *mic_client) mcm_client_t *mc = (mcm_client_t*)mic_client; struct mcm_ib_dev *md; struct pollfd set[MCM_IB_MAX*3]; - int i, fds; + int i, fds, cpu_id; char rbuf[2]; if (mcm_affinity) { - CPU_ZERO( &mc->cm_mask ); - CPU_SET( mcm_affinity_base_hca + mc->scif_id + 2, &mc->cm_mask ); - if(sched_setaffinity( 0, sizeof(mc->cm_mask), &mc->cm_mask) == -1) + CPU_ZERO( &mc->op_mask ); + if (mcm_affinity_base_mic) /* static config settings */ + cpu_id = mcm_affinity_base_mic + mc->scif_id + 2; + else + cpu_id = mc->cpu_mask + mc->scif_id + 2; + + CPU_SET(cpu_id, &mc->op_mask ); + if(sched_setaffinity( 0, sizeof(mc->op_mask), &mc->op_mask) == -1) mlog(0, "WARNING: could not set CPU Affinity (%s), continuing...\n", strerror(errno)); } - mlog(0, "CM thread (%x) started for MIC %p node_id %d, CPU_affinity(%s)=%d\n", - pthread_self(), mc, mc->scif_id, mcm_affinity ? "SET":"UNSET", - mcm_affinity ? (mcm_affinity_base_hca + mc->scif_id + 2):0 ); + mlog(0, "CM thread (%x) MIC node_id %d bound to numa_node %d and cpu_id=%d\n", + pthread_self(), mc->scif_id, mc->numa_node, mcm_affinity ? cpu_id:0 ); while (!finished) { fds = 0; -- 2.46.0