/* Round (o) up to the next 4 KB page multiple.
 * Arguments are fully parenthesized for macro hygiene, so expressions
 * such as ALIGN_PAGE(a | b) expand correctly. */
#define ALIGN_PAGE(o) (((o) + 4096 - 1) & ~(4096-1))
/* Round a pointer up to the next 4 KB page boundary (result is uintptr_t) */
#define ALIGN_UP_PPAGE(o) ((((uintptr_t)(o)) + 4096 - 1) & ~(4096-1))
/* Round a pointer down to the containing 4 KB page boundary */
#define ALIGN_DOWN_PPAGE(o) (((uintptr_t)(o)) & ~(4096-1))
+/* Bytes of padding needed to round (o) up to the next 64-byte boundary */
+#define ALIGN_64_PAD(o) (ALIGN_64(o) - (o))
static inline char * mcm_qp_state_str(IN int st)
{
MIX_PZ_FREE,
MIX_QUERY_DEVICE,
MIX_QUERY_PORT,
+ MIX_MMAP_ALLOC,
+ MIX_MMAP_FREE,
MIX_LAST_OP, /* Keep last */
} dat_mix_ops_t;
"PZ_FREE",
"QUERY_DEVICE",
"QUERY_PORT",
+ "MMAP_ALLOC",
+ "MMAP_FREE",
};
return ((op < 2 || op >= MIX_LAST_OP) ? "Invalid OP?" : mix_ops[op]);
}
} __attribute__((packed)) dat_mix_dev_attr_t;
+#define DAT_MIX_MMAP_CAP ( 1 << 0 )
+
/**** MIX attributes, 120 bytes *****/
typedef struct dat_mix_prov_attr
{
uint8_t gid_idx;
uint32_t cpu_model;
uint32_t cpu_family;
- uint8_t resv[31];
-
+ uint8_t cap;
+ uint8_t resv[30];
} __attribute__((packed)) dat_mix_prov_attr_t;
/***** MIX open, device address info returned */
} __attribute__((packed)) dat_mix_sr_t;
+/* DAT_MIX_MMAP_CAP support: fast post_send via scif MMAP */
+typedef struct dat_mix_mmap_addr
+{
+ dat_mix_hdr_t hdr;
+ off_t addr;
+} __attribute__((packed)) dat_mix_mmap_addr_t;
+
+#define DAT_MIX_MMAP_WR_MAX 8
+
+typedef struct dat_mix_mmap_wr
+{
+ dat_mix_sr_t msg;
+ uint8_t inline_data[DAT_MIX_INLINE_MAX];
+ volatile uint32_t flags;
+ uint8_t tpad[ALIGN_64_PAD(sizeof(dat_mix_sr_t) + DAT_MIX_INLINE_MAX + sizeof(uint32_t))];
+
+} __attribute__((packed)) dat_mix_mmap_wr_t;
+
typedef union dat_mix_msg
{
dat_mix_open_t op;
dat_mix_wc_t wc;
dat_mix_wr_t wr;
dat_mix_dto_comp_t dto;
- dat_mix_sr_t sr;
-
+ dat_mix_sr_t sr;
+ dat_mix_mmap_addr_t mm_addr;
+ dat_mix_mmap_wr_t mm_wr;
} DAT_MIX_MSG;
#define DAT_MIX_MSG_MAX sizeof(DAT_MIX_MSG)
}
sprintf(tp->fam_str, "%d", tp->pr_attr.cpu_family);
sprintf(tp->mod_str, "%d", tp->pr_attr.cpu_model);
+
+ /* scif_mmap post_sends MIC->HST if supported */
+ if (tp->pr_attr.cap & DAT_MIX_MMAP_CAP) {
+ if (dapli_mix_mmap_alloc(tp)) {
+ dapl_log(DAPL_DBG_TYPE_WARN,
+ "mmap ERR: run compat mode\n");
+ }
+ }
}
#else
tp->na.mode = "DIRECT";
ib_named_attr_t na;
dat_mix_prov_attr_t pr_attr; /* attributes from proxy */
+ /* Direct memory mapping for post_send WR entries, MIC to HOST */
+ dat_mix_mmap_wr_t *mm_s_addr; /* sbuf for post_send WR to host with MMAP */
+ int mm_s_head; /* next location in PEER array to write WR */
+ dat_mix_mmap_wr_t *mm_s_peer_addr; /* writes goto remote MMAP address on host */
+ off_t mm_s_peer_addr_off; /* PEER scif registered memory for sbuf WR array */
+	int *mm_s_place_holder;	/* local scratch memory for scif_mmap, allocated but never referenced directly */
+ volatile int *mm_r_addr; /* host mmap this rbuf mem and updates tail */
+ off_t mm_r_addr_off; /* SCIF registration for rbuf, host writes */
+
} ib_hca_transport_t;
/* prototypes */
/* MIC eXchange (MIX) operations, mix.c */
int dapli_mix_mode(ib_hca_transport_t *tp, char *name);
int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query);
+int dapli_mix_mmap_free(ib_hca_transport_t *tp, uint8_t stat);
+int dapli_mix_mmap_alloc(ib_hca_transport_t *tp);
void dapli_mix_close(ib_hca_transport_t *tp);
int dapli_mix_get_attr(ib_hca_transport_t *tp, dat_mix_prov_attr_t *pr_attr);
int dapli_mix_query_device(ib_hca_transport_t *tp, struct ibv_device_attr *dev_attr);
return 0;
}
+
+/*
+ * MIX_MMAP_FREE
+ *
+ * MIC-side teardown of the scif_mmap fast post_send path.  If the mmap
+ * address exchange previously completed (mm_s_peer_addr_off is valid),
+ * send a MIX_MMAP_FREE request so the host stops accessing our locally
+ * registered rbuf memory, then block until the host acknowledges.  The
+ * caller may only unregister/free that memory after this returns 0.
+ *
+ * tp   - per-HCA transport; provides scif_ep and the mmap state
+ * stat - status forwarded to the host (MIX_SUCCESS on a normal close,
+ *        MIX_ENOMEM when local setup failed and we are aborting)
+ *
+ * Returns 0 on success or when there is nothing to tear down, -1 on a
+ * scif send/recv failure or a malformed reply.
+ */
+int dapli_mix_mmap_free(ib_hca_transport_t *tp, uint8_t stat)
+{
+	dat_mix_mmap_addr_t msg;
+	int len, ret = 0;
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION," MIX_MMAP_FREE\n");
+
+	/* nothing to do unless the mmap exchange completed and the EP is up */
+	if (tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED && tp->scif_ep) {
+
+		msg.hdr.ver = DAT_MIX_VER;
+		msg.hdr.op = MIX_MMAP_FREE;
+		msg.hdr.status = stat;
+		msg.hdr.flags = MIX_OP_REQ;
+		msg.hdr.req_id = dapl_os_getpid();
+
+		len = sizeof(dat_mix_mmap_addr_t);
+		ret = scif_send(tp->scif_ep, &msg, len, SCIF_SEND_BLOCK);
+		if (ret != len) {
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " scif_send ERR %s ret %d, exp %d, err %s\n",
+				 mix_op_str(msg.hdr.op), ret, len,
+				 strerror(errno));
+			return -1;
+		}
+		dapl_log(DAPL_DBG_TYPE_EXTENSION,
+			 " %s ep %d, req_id 0x%x\n",
+			 mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
+
+		/* wait for the other side to set "no access" on our local memory */
+		ret = scif_recv(tp->scif_ep, &msg, len, SCIF_RECV_BLOCK);
+		if (ret != len) {
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " scif_recv ERR %s ret %d, exp %d, err %s\n",
+				 mix_op_str(msg.hdr.op), ret,
+				 len, strerror(errno));
+			return -1;
+		}
+
+		/* reply must echo the op with the RSP flag and MIX_SUCCESS */
+		if (msg.hdr.op != MIX_MMAP_FREE ||
+		    msg.hdr.flags != MIX_OP_RSP ||
+		    msg.hdr.status != MIX_SUCCESS) {
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " reply ERR %s, flags 0x%x, stat 0x%x\n",
+				 mix_op_str(msg.hdr.op),
+				 msg.hdr.flags, msg.hdr.status);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * MIX_MMAP_ALLOC
+ *
+ * MIC-side setup of the scif_mmap fast post_send path.  Sends our rbuf
+ * scif registration offset (mm_r_addr_off) to the host, receives the
+ * host's registered offset for its WR array in reply, and maps that
+ * peer memory locally (mm_s_peer_addr) so post_send WRs can be written
+ * directly into host memory.
+ *
+ * Returns 0 on success.  On failure returns -1 and leaves
+ * mm_s_peer_addr_off = SCIF_REGISTER_FAILED so callers fall back to
+ * the scif_send compatibility path.
+ */
+int dapli_mix_mmap_alloc(ib_hca_transport_t *tp)
+{
+	dat_mix_mmap_addr_t msg;
+	int len, ret;
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION," MIX_MMAP_ALLOC\n");
+
+	/* mix_mmap_init must have allocated the local WR send buffer first */
+	if (!tp->mm_s_addr) {
+		dapl_log(DAPL_DBG_TYPE_WARN,
+			 " WARN: mmap_init err - don't send mmap info\n");
+		return -1;
+	}
+
+	msg.hdr.ver = DAT_MIX_VER;
+	msg.hdr.op = MIX_MMAP_ALLOC;
+	msg.hdr.status = 0;
+	msg.hdr.flags = MIX_OP_REQ;
+	msg.hdr.req_id = dapl_os_getpid();
+	msg.addr = tp->mm_r_addr_off;	/* our rbuf registration, host maps it */
+
+	len = sizeof(dat_mix_mmap_addr_t);
+	ret = scif_send(tp->scif_ep, &msg, len, SCIF_SEND_BLOCK);
+	if (ret != len) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " ERR: %s send on %d, ret %d, exp %d, error %s\n",
+			 mix_op_str(msg.hdr.op),tp->scif_ep, ret,
+			 len, strerror(errno));
+		goto remote_err;
+	}
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " Sent %s request on SCIF EP %d, req_id 0x%x\n",
+		 mix_op_str(msg.hdr.op), tp->scif_ep, ntohl(msg.hdr.req_id));
+
+	/* MIX_SEND_OP_ADDR_EXG: reply includes peer scif address for SEND OP buffer */
+	ret = scif_recv(tp->scif_ep, &msg, len, SCIF_RECV_BLOCK);
+	if (ret != len) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " ERR: send_op_addr_exg ep %d, ret %d, exp %d, error %s\n",
+			 tp->scif_ep, ret, len, strerror(errno));
+		goto remote_err;
+	}
+
+	if (msg.addr == SCIF_REGISTER_FAILED || msg.hdr.op != MIX_MMAP_ALLOC ||
+	    msg.hdr.flags != MIX_OP_RSP || msg.hdr.status != MIX_SUCCESS) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " ERR: send op exg: op %s, flags 0x%x, stat 0x%x peer addr 0x%llx\n",
+			 mix_op_str(msg.hdr.op), msg.hdr.flags, msg.hdr.status, msg.addr);
+		goto remote_err;
+	}
+
+	tp->mm_s_peer_addr_off = msg.addr; /* scif_off from proxy host, WR array */
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " Recv'd %s reply on SCIF EP %d, dev_id %d is 0x%llx\n",
+		 mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id, msg.addr);
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " s_off 0x%llx, r_off 0x%llx, peer_head = 0x%x\n",
+		 tp->mm_s_peer_addr_off, tp->mm_r_addr_off, *tp->mm_r_addr);
+
+	/* page-aligned placeholder handed to scif_mmap as the mapping hint */
+	ret = posix_memalign((void **)&tp->mm_s_place_holder, 4096,
+			     ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t)));
+	if (ret) {
+		/* posix_memalign returns the error code; errno is not set,
+		 * and a failed call leaves the output pointer indeterminate */
+		tp->mm_s_place_holder = NULL;
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " ERR: send op exg: alloc mmap_place_holder. %s\n",
+			 strerror(ret));
+		goto local_err;
+	}
+
+	/* mmap host memory, dat_mix_mmap_wr_t WR array, to write as local
+	 * memory.  Pass the aligned buffer itself as the hint address, not
+	 * the address of the pointer field (which is not page aligned). */
+	tp->mm_s_peer_addr = (dat_mix_mmap_wr_t *)
+		scif_mmap(tp->mm_s_place_holder,
+			  ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t)),
+			  SCIF_PROT_READ | SCIF_PROT_WRITE,
+			  0, tp->scif_ep,
+			  tp->mm_s_peer_addr_off);
+
+	if (tp->mm_s_peer_addr == SCIF_MMAP_FAILED) {
+		dapl_log(DAPL_DBG_TYPE_ERR, " ERR: send op exg: Failed to mmap peer memory\n");
+		goto local_err;
+	}
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " mm_s_place_holder %p, mm_s_peer_addr %p\n",
+		 tp->mm_s_place_holder, tp->mm_s_peer_addr);
+
+	return 0;
+
+local_err:
+	if (tp->mm_s_place_holder) {
+		free(tp->mm_s_place_holder);
+		tp->mm_s_place_holder = NULL;	/* avoid double free in mix_mmap_free */
+	}
+
+	dapli_mix_mmap_free(tp, MIX_ENOMEM); /* Send abort to host */
+
+remote_err:
+	tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+	tp->mm_s_peer_addr = NULL;
+
+	return -1;
+}
+
+
+/*
+ * Allocate and register buffers needed for scif_mmap and fast post_send WR's
+ *
+ * sbuf (mm_s_addr): local staging array of dat_mix_mmap_wr_t WR entries.
+ * rbuf (mm_r_addr): one uint32_t tail index, scif-registered so the host
+ *                   can update it with remote writes (mm_r_addr_off).
+ *
+ * Returns 0 on success, -1 on failure with all mmap state reset so
+ * callers fall back to the scif_send path.
+ */
+static int mix_mmap_init(ib_hca_transport_t *tp)
+{
+	int ret, len;
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION," mix_mmap_init\n");
+
+	/* start from a known-invalid state; updated on successful setup */
+	tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+	tp->mm_r_addr_off = SCIF_REGISTER_FAILED;
+	tp->mm_s_peer_addr = NULL;
+	tp->mm_s_place_holder = NULL;
+
+	len = ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t));
+	ret = posix_memalign((void **)&tp->mm_s_addr, 4096, len);
+	if (ret) {
+		/* posix_memalign returns the error code, it does not set errno */
+		dapl_log(DAPL_DBG_TYPE_WARN,
+			 "mmap_init: ERR sbuf alloc - %s\n", strerror(ret));
+		tp->mm_s_addr = NULL;
+		goto err;
+	}
+	memset(tp->mm_s_addr, 0, len);
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " mmap_init: sbuf %p ln %d\n", tp->mm_s_addr, len);
+
+	len = ALIGN_PAGE(sizeof(uint32_t));
+	ret = posix_memalign((void **)&tp->mm_r_addr, 4096, len);
+	if (ret) {
+		dapl_log(DAPL_DBG_TYPE_WARN,
+			 "mmap_init: ERR rbuf alloc - %s\n", strerror(ret));
+		tp->mm_r_addr = NULL;	/* output is indeterminate on failure */
+		goto err1;
+	}
+	memset((void *)tp->mm_r_addr, 0, len);
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " mmap_init: rbuf %p ln %d\n", tp->mm_r_addr, len);
+
+	/* register rbuf so the host can push tail updates with remote writes */
+	tp->mm_r_addr_off =
+		scif_register(tp->scif_ep, (void *)tp->mm_r_addr, len,
+			      (off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
+
+	if (tp->mm_r_addr_off == SCIF_REGISTER_FAILED) {
+		dapl_log(DAPL_DBG_TYPE_WARN,
+			 "mmap_init: ERR scif_reg - %s\n", strerror(errno));
+		goto err2;
+	}
+
+	dapl_log(DAPL_DBG_TYPE_EXTENSION,
+		 " mmap_init: success - rbuf scif registered off = 0x%llx\n",
+		 tp->mm_r_addr_off);
+
+	tp->mm_s_head = 0;
+	return 0;
+
+err2:
+	free((void *)tp->mm_r_addr);
+	tp->mm_r_addr = NULL;
+
+err1:
+	free((void*)tp->mm_s_addr);
+	tp->mm_s_addr = NULL;
+
+err:
+	return -1;
+}
+
+/*
+ * Free the post_send WR data structures needed for direct scif mmap
+ *
+ * MIC-side cleanup, inverse of mix_mmap_init()/dapli_mix_mmap_alloc().
+ * Order matters: disable the fast path, unmap the peer WR array,
+ * notify the host (dapli_mix_mmap_free) so it drops access to our
+ * memory, and only then unregister and free local buffers.
+ * Safe to call when the mmap path was never (fully) set up.
+ */
+static void mix_mmap_free(ib_hca_transport_t *tp)
+{
+	dapl_log(DAPL_DBG_TYPE_EXTENSION," Clean send OP\n");
+
+	/* disable the fast path first so no new WRs use it */
+	tp->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+
+	if (tp->mm_s_peer_addr) {
+		scif_munmap((void *)tp->mm_s_peer_addr,
+			    ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * sizeof(dat_mix_mmap_wr_t)));
+		tp->mm_s_peer_addr = NULL;
+	}
+
+	if (tp->mm_s_place_holder) {
+		free(tp->mm_s_place_holder);
+		tp->mm_s_place_holder = NULL;
+	}
+
+	/* ask host to unmap before we free our local memory */
+	dapli_mix_mmap_free(tp, MIX_SUCCESS);
+
+	/* Make sure host unmapped this memory before unregister and free.
+	 * NOTE(review): "> 0" assumes a valid registration offset is never 0
+	 * and SCIF_REGISTER_FAILED is negative - confirm against scif.h.
+	 */
+	if (tp->scif_ep && tp->mm_r_addr_off > 0) {
+		scif_unregister(tp->scif_ep, tp->mm_r_addr_off, ALIGN_PAGE(sizeof(uint32_t)));
+		tp->mm_r_addr_off = SCIF_REGISTER_FAILED;
+	}
+
+	if (tp->mm_s_addr) {
+		free(tp->mm_s_addr);
+		tp->mm_s_addr = NULL;
+	}
+
+	if(tp->mm_r_addr) {
+		free((void *)tp->mm_r_addr);
+		tp->mm_r_addr = NULL;
+	}
+}
+
/*
* MIX_IA_OPEN
*/
tp->scif_ep, ret, len, strerror(errno));
return -1;
}
- dapl_log(DAPL_DBG_TYPE_EXTENSION," Recv'd %s reply on SCIF EP %d, dev_id %d\n",
- mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
+ dapl_log(DAPL_DBG_TYPE_EXTENSION,
+ " Recv'd %s reply on SCIF EP %d, dev_id %d\n",
+ mix_op_str(msg.hdr.op), tp->scif_ep, msg.hdr.req_id);
if (msg.hdr.ver != DAT_MIX_VER || msg.hdr.op != MIX_IA_OPEN ||
msg.hdr.flags != MIX_OP_RSP || msg.hdr.status != MIX_SUCCESS) {
tp->ib_cm.mtu = msg.dev_attr.mtu; /* proxy sets active_MTU mode */
tp->dev_id = msg.hdr.req_id;
+ /* We do not use this var in MFO, but use it as a flag to signal success */
if (MFO_EP(&tp->addr))
- /* We do not use this var in MFO, but use it as a flag to signal success */
tp->ib_ctx = (struct ibv_context *)0xdeadbeef;
+ if (mix_mmap_init(tp)) {
+ dapl_log(DAPL_DBG_TYPE_WARN,
+ " WARN: init mmap for send_op failed\n");
+ }
+
dapl_log(DAPL_DBG_TYPE_EXTENSION,
" mix_open reply (msg %p, ln %d) EPs %d %d %d - dev_id %d lid 0x%x\n",
&msg, len, tp->scif_ep, tp->scif_ev_ep,
" MIX_IA_CLOSE: tp %p scif EP's %d,%d,%d dev_id %d\n",
tp, tp->scif_ep, tp->scif_tx_ep, tp->scif_ev_ep, tp->dev_id);
+ mix_mmap_free(tp);
+
if (tp->scif_ep) {
scif_close(tp->scif_ep);
tp->scif_ep = 0;
int dapli_mix_post_send(ib_qp_handle_t m_qp, int txlen, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
{
char cmd[DAT_MIX_MSG_MAX + DAT_MIX_INLINE_MAX];
- dat_mix_sr_t *msg = (dat_mix_sr_t *)cmd;
+ dat_mix_sr_t *msg = (dat_mix_sr_t *) cmd;
scif_epd_t mix_ep = m_qp->tp->scif_ep;
- int ret, i, offset = sizeof(dat_mix_sr_t);
+ int ret, i, stall, off = sizeof(dat_mix_sr_t);
+ ib_hca_transport_t *tp = m_qp->tp;
+ dat_mix_mmap_wr_t *mm_addr;
- dapl_log(DAPL_DBG_TYPE_EXTENSION,
- " mix_post_send: msg=%p sge=%d len=%d op=%d off=%d (%p)raddr %Lx rkey 0x%x, wr_id %LX\n",
- msg, wr->num_sge, txlen, wr->opcode, offset, &wr->wr.rdma.remote_addr,
- wr->wr.rdma.remote_addr, wr->wr.rdma.rkey, wr->wr_id);
+ if (tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+ msg = &tp->mm_s_addr[tp->mm_s_head].msg;
+ tp->mm_s_addr[tp->mm_s_head].flags = 0;
+ }
if (wr->opcode != IBV_WR_SEND &&
wr->opcode != IBV_WR_RDMA_WRITE &&
} else {
msg->hdr.flags |= MIX_OP_INLINE;
for (i=0; i < wr->num_sge; i++) {
- memcpy(&cmd[offset], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
- offset += wr->sg_list[i].length;
+ if(tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+ memcpy(&((char *)msg)[off], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
+ } else {
+ memcpy(&cmd[off], (void*)wr->sg_list[i].addr, wr->sg_list[i].length);
+ }
+ off += wr->sg_list[i].length;
}
}
- ret = scif_send(mix_ep, msg, offset, SCIF_SEND_BLOCK);
- if (ret != offset) {
- dapl_log(1, " ERR: %s on %d, ret %d, exp %d, error %s\n",
- mix_op_str(msg->hdr.op), mix_ep, ret,
- offset, strerror(errno));
- return -1;
- }
+ if (tp->mm_s_peer_addr_off != SCIF_REGISTER_FAILED) {
+ stall=0;
+ while (((tp->mm_s_head + 1) % DAT_MIX_MMAP_WR_MAX) == *tp->mm_r_addr) {
+ if(!stall) {
+ dapl_log(DAPL_DBG_TYPE_EXTENSION,
+ "post_send mmap: WR qfull. hd %d tl %d\n",
+ tp->mm_s_head, *tp->mm_r_addr);
+ }
+ stall++;
+ usleep(1);
+ }
- dapl_log(DAPL_DBG_TYPE_EXTENSION," Sent MIX_SEND on SCIF EP %d, mlen=%d\n", mix_ep, offset);
+ /* Copy WR + inline via mmap, sync data, notify peer */
+ mm_addr = tp->mm_s_peer_addr + tp->mm_s_head;
+
+ memcpy((void *)mm_addr, (void *)msg, ALIGN_64(off));
+ __sync_synchronize();
+
+ *((uint32_t *)(((char *)mm_addr) + offsetof(dat_mix_mmap_wr_t, flags))) = 1;
+ tp->mm_s_head = (tp->mm_s_head + 1) % DAT_MIX_MMAP_WR_MAX; /* next */
+
+ } else {
+ ret = scif_send(mix_ep, msg, off, SCIF_SEND_BLOCK);
+ if (ret != off) {
+ dapl_log(1, " ERR: %s on %d, ret %d, exp %d, error %s\n",
+ mix_op_str(msg->hdr.op), mix_ep, ret,
+ off, strerror(errno));
+ return -1;
+ }
+ }
return 0;
+
}
int dapli_mix_post_recv(ib_qp_handle_t m_qp, int len, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr)
pr_attr->system_guid = system_guid;
pr_attr->cpu_model = mcm_cpu_model;
pr_attr->cpu_family = mcm_cpu_family;
+ pr_attr->cap |= DAT_MIX_MMAP_CAP;
}
/* close MCM device, MIC client, md->slock held */
return (scif_send_msg(smd->scif_op_ep, (void*)pmsg, len));
}
+
+/*
+ * MIX_MMAP_FREE, host side.
+ *
+ * The MIC asks us to stop accessing its scif-registered memory before
+ * it unregisters/frees it.  A non-MIX_SUCCESS request status means the
+ * MIC aborted setup after our ADDR_EXG ack; fall back to scif_send.
+ * Replies with MIX_OP_RSP.
+ * NOTE(review): on success this returns the scif_send_msg() result
+ * rather than 0 - confirm callers treat any non-error return as OK.
+ */
+static int mix_mmap_free(mcm_scif_dev_t *smd, dat_mix_mmap_addr_t *pmsg)
+{
+	int ret, len;
+
+	/* hdr already read, get operation data */
+	len = sizeof(dat_mix_mmap_addr_t) - sizeof(dat_mix_hdr_t);
+	ret = scif_recv(smd->scif_op_ep, ((char*)pmsg + sizeof(dat_mix_hdr_t)), len, SCIF_RECV_BLOCK);
+	if (ret != len) {
+		mlog(0, " ERR: ret %d, exp %d %s\n", ret, len, strerror(errno));
+		return ret;
+	}
+
+	if (pmsg->hdr.status != MIX_SUCCESS ) {
+		/* MIC could not init after host ACK ADDR_EXG - fall back to scif_send */
+		smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+		mlog(0, " WARN: MIC failed to init SEND_OP via mmap memory\n");
+	}
+
+	pmsg->hdr.status = MIX_SUCCESS;
+
+	/* the MIC side wants us to stop accessing its memory */
+	if (smd->mm_s_peer_addr) {
+		ret = scif_munmap((void *)smd->mm_s_peer_addr, ALIGN_PAGE(sizeof(int)));
+		if (ret < 0) {
+			mlog(0, " ERR: scif_munmap %s\n", strerror(errno));
+			pmsg->hdr.status = MIX_EINVAL;
+		}
+	}
+
+	/* A valid mm_s_peer_addr_off with a NULL mm_s_peer_addr marks that we
+	 * stopped using the mmap memory while closing down, yet still process
+	 * pending SEND_OPs.
+	 */
+	smd->mm_s_peer_addr = NULL;
+
+	if (smd->mm_s_place_holder) {
+		free(smd->mm_s_place_holder);
+		smd->mm_s_place_holder = NULL;
+	}
+
+	pmsg->hdr.flags = MIX_OP_RSP;
+	len = sizeof(dat_mix_mmap_addr_t);
+
+	/* send back response */
+	if (smd->scif_op_ep) {
+		ret = scif_send_msg(smd->scif_op_ep, (void*)pmsg, len);
+		if (ret != len) {
+			mlog(0, " ERR: ret %d, exp %d %s\n",
+			     ret, len, strerror(errno));
+			return ret;
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * MIX_MMAP_ALLOC, host side.
+ *
+ * Completes the mmap address exchange started by the MIC: read the
+ * MIC's scif registration offset for its tail word, map it locally
+ * (mm_s_peer_addr) so tail updates become plain stores, and reply with
+ * our own WR-array registration offset (mm_r_addr_off).  On any
+ * failure a non-MIX_SUCCESS status is returned so both sides fall
+ * back to the scif_send path.
+ */
+static int mix_mmap_alloc(mcm_scif_dev_t *smd, dat_mix_mmap_addr_t *pmsg)
+{
+	int ret, len;
+
+	/* hdr already read, get operation data */
+	len = sizeof(dat_mix_mmap_addr_t) - sizeof(dat_mix_hdr_t);
+	ret = scif_recv(smd->scif_op_ep, ((char*)pmsg + sizeof(dat_mix_hdr_t)), len, SCIF_RECV_BLOCK);
+	if (ret != len) {
+		mlog(0, " ERR: ret %d, exp %d %s\n", ret, len, strerror(errno));
+		return ret;
+	}
+
+	/* assume failure until the mapping is fully set up */
+	pmsg->hdr.status = MIX_EINVAL;
+
+	mlog(8, " mm_s_peer_addr_off from MIC 0x%llx\n", pmsg->addr);
+
+	if (pmsg->addr == SCIF_REGISTER_FAILED) {
+		mlog(0, " ERR: op send got invalid input 0x%llx\n", pmsg->addr);
+		goto resp;
+	}
+
+	if(!smd->mm_r_addr) {
+		/* init_smd_send_op_mmap failed - fall back to reg OP */
+		mlog(8, " init_smd_send_op_mmap failed - send MIC ENOMEM\n");
+		pmsg->hdr.status = MIX_ENOMEM;
+		goto resp;
+	}
+
+	smd->mm_s_peer_addr_off = pmsg->addr;
+
+	/* page-aligned placeholder handed to scif_mmap as the mapping hint */
+	ret = posix_memalign((void **)&smd->mm_s_place_holder, 4096, ALIGN_PAGE(sizeof(int)));
+	if (ret) {
+		mlog(0, " ERR: alloc mm_s_place_holder, ret=%d\n", ret);
+		smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+		smd->mm_s_place_holder = NULL;
+		goto resp;
+	}
+
+	/* mmap peer buffer so we can write into it like a reg memory */
+	smd->mm_s_peer_addr = (volatile int *)
+			scif_mmap(smd->mm_s_place_holder,
+				  ALIGN_PAGE(sizeof(int)),
+				  SCIF_PROT_READ | SCIF_PROT_WRITE, 0,
+				  smd->scif_op_ep, smd->mm_s_peer_addr_off);
+
+	if (smd->mm_s_peer_addr == SCIF_MMAP_FAILED) {
+		smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+		smd->mm_s_peer_addr = NULL;
+		free(smd->mm_s_place_holder);
+		smd->mm_s_place_holder = NULL;
+		mlog(0, " ERR: scif_mmap m_s_peer_addr %s", strerror(errno));
+		goto resp;
+	}
+
+	/* Send other side our scif base address of the SEND OP array */
+	pmsg->addr = smd->mm_r_addr_off;
+	pmsg->hdr.status = MIX_SUCCESS;
+
+	/* Initialize the other side, WR tail */
+	*smd->mm_s_peer_addr = 0;
+
+	mlog(8, " mmap done: mm_s_peer_off 0x%llx, mm_r_addr_off 0x%llx\n",
+	     smd->mm_s_peer_addr_off, smd->mm_r_addr_off);
+resp:
+	/* send back response */
+	pmsg->hdr.flags = MIX_OP_RSP;
+	len = sizeof(dat_mix_mmap_addr_t);
+
+	return (scif_send_msg(smd->scif_op_ep, (void*)pmsg, len));
+}
+
/* create new proxy-out PZ */
static int mix_pz_create(mcm_scif_dev_t *smd, dat_mix_pz_t *pmsg)
{
m_wr->wr.sg_list = m_wr->sg;
m_wr->wr.num_sge = len ? 1:0;
- mlog(4, " INLINE m_wr[%d] %p raddr %p rkey 0x%x, ib_wr raddr %p rkey 0x%x %d bytes\n",
- m_qp->wr_hd, m_wr, pmsg->wr.wr.rdma.remote_addr, pmsg->wr.wr.rdma.rkey,
- m_wr->wr.wr.rdma.remote_addr, m_wr->wr.wr.rdma.rkey, len);
-
/* M_WR */
m_wr->org_id = pmsg->wr.wr_id;
m_wr->m_idx = 0;
mpxy_unlock(&smd->tblock);
goto bail;
}
- mlog(0x10, "[%d:%d:%d] %s_INLINE_post_sig: qp %p wr %p wr_id %p flgs 0x%x,"
- " pcnt %d sg_rate %d hd %d tl %d sz %d m_idx %x\n",
- m_qp->smd->md->mc->scif_id, m_qp->smd->entry.tid,
- m_qp->r_entry.tid,
- (MXF_EP(&m_qp->cm->msg.daddr1)) ? "po_pi":"po_direct",
- m_qp, m_wr, m_wr->wr.wr_id, m_wr->wr.send_flags,
- m_qp->post_cnt, mcm_rw_signal, m_qp->wr_hd, m_qp->wr_tl,
- m_wr->wr.sg_list->length, m_wr->m_idx);
}
mpxy_unlock(&smd->tblock);
if (len) {
/* copy data into proxy buffer, signal TX thread via wr_id */
- ret = scif_recv(smd->scif_op_ep, (void*)m_wr->sg->addr, len, SCIF_RECV_BLOCK);
- if (ret != len) {
- mlog(0, " ERR: scif_recv inline DATA, ret %d, exp %d\n", ret, len);
- ret = errno;
- len = 0;
- goto bail;
+ if (smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED && pmsg->hdr.op == MIX_SEND) {
+ /* inline data is after the msg */
+ memcpy((void*)m_wr->sg->addr, pmsg + 1, len);
+ } else {
+ ret = scif_recv(smd->scif_op_ep, (void*)m_wr->sg->addr, len, SCIF_RECV_BLOCK);
+ if (ret != len) {
+ mlog(0, " ERR: scif_recv inline DATA, ret %d, exp %d\n", ret, len);
+ ret = errno;
+ len = 0;
+ goto bail;
+ }
}
}
struct dat_mix_wc wc;
char dbuf[DAT_MIX_INLINE_MAX];
- if (len) /* drain inline data */
+ if (len && !(smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED && pmsg->hdr.op == MIX_SEND)) {
+ /* drain inline data */
scif_recv(smd->scif_op_ep, dbuf, len, SCIF_RECV_BLOCK);
+ }
wc.wr_id = pmsg->wr.wr_id;
wc.byte_len = len;
return ret;
}
-/* Post SEND message request, IB send or rdma write, operation channel */
+/* Post SEND message request, IB send or rdma write, operation channel via scif_send */
static int mix_post_send(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg)
{
int len, ret;
return (mix_proxy_out(smd, pmsg, m_qp));
}
+/* Post SEND message request, IB send or rdma write, operation channel
+ * via scif_mmap memory.
+ *
+ * Drains up to 32 WR entries the MIC wrote directly into our mmap'd
+ * rbuf ring (smd->mm_r_addr).  Each entry's flags word is set non-zero
+ * by the MIC after the WR plus inline data are written; we clear it
+ * once consumed and publish the new tail back to the MIC through
+ * mm_s_peer_addr.
+ *
+ * Returns 0 when idle or all entries are consumed, -1 on a bad opcode,
+ * POLLERR when QP lookup fails (async error, device should close), or
+ * the mix_proxy_out() error code.
+ */
+int mix_post_send_ext(mcm_scif_dev_t *smd)
+{
+	int ret, retry, max_io = 32;
+	struct mcm_qp *m_qp;
+	dat_mix_mmap_wr_t *mm_wr_entry;
+	volatile dat_mix_sr_t *pmsg;
+
+	while (max_io--) {
+		retry = 100;
+		mm_wr_entry = &smd->mm_r_addr[smd->mm_r_head];
+		pmsg = &mm_wr_entry->msg;
+
+		/* wait a little for a SEND OP msg; give up (return 0) if
+		 * nothing shows before retry runs out */
+		while(retry-- && !mm_wr_entry->flags) {
+			if(!retry)
+				return 0;
+			sched_yield();
+		}
+
+		/* NOTE(review): no explicit read barrier between the flags
+		 * check and reading the WR body; relies on the ordering of
+		 * remote scif writes plus volatile access - confirm on HW. */
+		if (pmsg->hdr.op != MIX_SEND) {
+			mlog(0, " ERR: no MIX_SEND OP CODE? Got %d, exp %d\n",
+			     pmsg->hdr.op, MIX_SEND);
+			return -1;
+		}
+
+		/* get QP by ID */
+		m_qp = mix_get_qp(smd, pmsg->qp_id);
+		if (!m_qp || !m_qp->ib_qp2) {
+			mlog(0, " ERR: mix_get_qp id %d not found\n",
+			     pmsg->qp_id);
+			return POLLERR; /* async err, no QP to report */
+		}
+
+		ret = mix_proxy_out(smd, (dat_mix_sr_t *)pmsg, m_qp);
+
+		/* Mark entry empty, update local head and MIC head */
+		mm_wr_entry->flags = 0;
+		smd->mm_r_head = ((smd->mm_r_head + 1) % smd->mm_r_last);
+
+		/* publish new tail to the MIC via the mmap'd tail word */
+		if(smd->mm_s_peer_addr)
+			*smd->mm_s_peer_addr = smd->mm_r_head;
+
+		if (ret) {
+			mlog(0, " ERR: failed mix proxy out. ret %d\n", ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
/* Post RECV message request on Proxy-RX channel */
static int mix_post_recv(mcm_scif_dev_t *smd, dat_mix_sr_t *pmsg)
{
case MIX_CM_DISC:
ret = mix_cm_disc_out(smd, (dat_mix_cm_t *)phdr, scif_ep);
break;
+ case MIX_MMAP_ALLOC:
+ ret = mix_mmap_alloc(smd, (dat_mix_mmap_addr_t *)phdr);
+ break;
+ case MIX_MMAP_FREE:
+ ret = mix_mmap_free(smd, (dat_mix_mmap_addr_t *)phdr);
+ break;
case MIX_CM_DREP:
default:
mlog(0, " ERR: smd %p unknown msg->op: %d, close dev_id %d\n",
return;
}
+/*
+ * Host-side cleanup of the scif_mmap fast post_send resources.
+ * Unregisters and frees the WR-array rbuf, unmaps the peer tail word
+ * and releases the scif_mmap placeholder allocation.  Safe to call
+ * when the mmap path was never set up (all fields NULL / FAILED).
+ */
+void destroy_smd_send_op_mmap(mcm_scif_dev_t *smd)
+{
+	if (smd->mm_r_addr_off != SCIF_REGISTER_FAILED && smd->scif_op_ep) {
+		scif_unregister(smd->scif_op_ep, smd->mm_r_addr_off, smd->mm_r_len);
+		smd->mm_r_addr_off = SCIF_REGISTER_FAILED;
+	}
+
+	free(smd->mm_r_addr);	/* free(NULL) is a no-op */
+	smd->mm_r_addr = NULL;
+
+	/* plain non-NULL test; relational comparison of a pointer against
+	 * (int *)0 is undefined behavior in C */
+	if (smd->mm_s_peer_addr != NULL && smd->scif_op_ep) {
+		scif_munmap((void *)smd->mm_s_peer_addr, ALIGN_PAGE(sizeof(int)));
+		smd->mm_s_peer_addr = NULL;
+	}
+
+	free(smd->mm_s_place_holder);
+	smd->mm_s_place_holder = NULL;
+}
+
void mpxy_destroy_bpool(mcm_scif_dev_t *smd)
{
if (smd->m_offset && smd->scif_tx_ep)
if (smd->ref_cnt)
mlog(0, " WARNING: ref_cnt not 0, = %d \n", smd->ref_cnt);
+ destroy_smd_send_op_mmap(smd);
+ mlog(8, " send op via scif wt destroyed\n");
+
mpxy_destroy_bpool(smd);
mlog(8, " proxy buffer pools destroyed \n");
free(smd);
}
+/*
+ * Host-side setup for the scif_mmap fast post_send path: allocate the
+ * WR receive ring the MIC writes into and register it with SCIF so
+ * the MIC can map it.  All state is reset to "not available" first,
+ * so any failure leaves the device on the scif_send fallback path.
+ * Returns 0 on success, -1 on allocation/registration failure.
+ */
+static int init_smd_send_op_mmap(mcm_scif_dev_t *smd)
+{
+	int rc, nbytes;
+
+	/* defaults: fast path disabled until setup completes */
+	smd->mm_r_head = 0;
+	smd->mm_r_last = DAT_MIX_MMAP_WR_MAX;
+	smd->mm_s_peer_addr = NULL;
+	smd->mm_s_place_holder = NULL;
+	smd->mm_s_peer_addr_off = SCIF_REGISTER_FAILED;
+	smd->mm_r_addr_off = SCIF_REGISTER_FAILED;
+
+	nbytes = ALIGN_PAGE(DAT_MIX_MMAP_WR_MAX * (sizeof(dat_mix_mmap_wr_t)));
+	smd->mm_r_len = nbytes;
+
+	rc = posix_memalign((void **)&smd->mm_r_addr, 4096, nbytes);
+	if (rc) {
+		mlog(0, " ERR: alloc r_addr ln=%d, %s\n", nbytes, strerror(errno));
+		smd->mm_r_addr = NULL;
+		return -1;
+	}
+	memset(smd->mm_r_addr, 0, nbytes);
+
+	mlog(8, " MMAP send_op: buf %p len %d\n", smd->mm_r_addr, nbytes);
+
+	/* register the ring so the MIC can scif_mmap and write WRs into it */
+	smd->mm_r_addr_off = scif_register(smd->scif_op_ep, smd->mm_r_addr, nbytes,
+					   (off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
+	if (smd->mm_r_addr_off == SCIF_REGISTER_FAILED) {
+		mlog(0, " ERR: scif_register addr=%p,%d ret=%s\n", smd->mm_r_addr, nbytes, strerror(errno));
+		free(smd->mm_r_addr);
+		smd->mm_r_addr = NULL;
+		return -1;
+	}
+
+	mlog(8, " MMAP send_op: addr=%p, off=0x%llx, len %d\n",
+	     smd->mm_r_addr, smd->mm_r_addr_off, nbytes);
+
+	return 0;
+}
+
+
static int create_smd_bpool(mcm_scif_dev_t *smd)
{
int ret;
if (!smd->cm_id)
goto err;
+ /* no need to check ret val - in case of failure we fall back to reg OP */
+ init_smd_send_op_mmap(smd);
+
if (create_smd_bpool(smd))
goto err;
return smd;
err:
if (smd) {
+ if (smd->mm_r_addr)
+ free(smd->mm_r_addr);
if (smd->cmd_buf)
free(smd->cmd_buf);
if (smd->ports)
}
mc->tx_busy = data;
time_ms = (data) ? 0:-1;
+
+ if (time_ms && mcm_op_poll)
+ time_ms = 0;
+
mpxy_unlock(&mc->txlock);
if (time_ms == -1) mlog(0x10," sleep\n");
mcm_select(set, time_ms);
smd->th_ref_cnt++;
mpxy_unlock(&md->slock);
+ ret = 0;
+ if (smd->mm_s_peer_addr_off != SCIF_REGISTER_FAILED)
+ ret = mix_post_send_ext(smd); /* mmap operation */
+
+ if (ret == POLLERR)
+ mix_close_device(md, smd);
+
ret = mcm_poll(smd->scif_op_ep, POLLIN); /* operations */
if (ret == POLLIN)
ret = mix_scif_recv(smd, smd->scif_op_ep);
smd = next;
}
mpxy_unlock(&md->slock);
- sched_yield();
+
+ if (smd)
+ sched_yield();
}
mpxy_unlock(&mc->oplock);
/* data-path, loop if busy or device open & single core */
if ((mc->tx_busy || mc->rx_busy) || (smd_cnt && mcm_op_poll))
time_ms = 0;
- mcm_select(set, time_ms);
+ else {
+ time_ms = 0;
+ }
+
+ mcm_select(set, time_ms); /* Another sched yield */
if (time_ms == -1) mlog(0x10," OP wake\n");
if (mcm_poll(mc->op_pipe[0], POLLIN) == POLLIN)
read(mc->op_pipe[0], rbuf, 2);
}
mc->rx_busy = data;
time_ms = data ? 0:-1;
+
+ if (time_ms && mcm_op_poll)
+ time_ms = 0;
+
mpxy_unlock(&mc->rxlock);
if (time_ms == -1) mlog(0x10," RX sleep\n");
mcm_select(set, time_ms);
char *cmd_buf; /* operation command buffer */
struct dat_mix_dev_attr dev_attr; /* Manage attributes per MIC client open */
uint8_t mtu_env; /* mtu override with DAPL_IB_MTU */
+
+ dat_mix_mmap_wr_t *mm_r_addr; /* Address of post_send WR array, updated from MIC via writes */
+ int mm_r_len; /* total bytes of WR array */
+ off_t mm_r_addr_off; /* WR entry rcv buffer, map from MIC via mmap, SCIF registration */
+ int mm_r_head; /* location for new posted WR entries from MIC */
+ int mm_r_last; /* mmap WR array size */
+ off_t mm_s_peer_addr_off; /* peer scif address for tail update */
+ volatile int *mm_s_peer_addr; /* writing to this address is writing to remote mem */
+ int *mm_s_place_holder; /* alloc local memory for scif_mmap, not referenced */
+
#ifdef MCM_PROFILE
uint16_t m_hd_ro; /* HD,TL tracking */
uint16_t m_tl_ro;
void m_qp_free(struct mcm_qp *m_qp);
void m_mr_free(struct mcm_mr *m_mr);
int mix_scif_recv(mcm_scif_dev_t *smd, scif_epd_t scif_ep);
+int mix_post_send_ext(mcm_scif_dev_t *smd);
int mix_cm_disc_in(mcm_cm_t *m_cm);
int mix_cm_rtu_in(mcm_cm_t *m_cm, dat_mcm_msg_t *pkt, int pkt_len);
int mix_cm_req_in(mcm_cm_t *cm, dat_mcm_msg_t *pkt, int pkt_len);
#
# The default is 1
-mcm_affinity 2
+mcm_affinity 1
+mcm_op_poll 1
# mcm_affinity_base_mic:
# Specifies a hard binding for CPU id base value used for affinity support of