-I$(srcdir)/dat/include/ -I$(srcdir)/dapl/include/ \
-I$(srcdir)/dapl/common -I$(srcdir)/dapl/udapl/linux \
-I$(srcdir)/dapl/openib_common \
+ -I$(srcdir)/dapl/svc \
-I$(srcdir)/dapl/openib_mcm \
-I$(srcdir)/dapl/openib_mcm/linux
endif
dapl/openib_common/util.c \
dapl/openib_mcm/cm.c \
dapl/openib_mcm/mix.c \
+ dapl/openib_mcm/proxy.c \
dapl/openib_mcm/device.c $(XPROGRAMS)
dapl_udapl_libdaplomcm_la_LDFLAGS = -version-info 2:0:0 $(daplomcm_version_script) \
dapl/openib_ucm/linux/openib_osd.h \
dapl/openib_mcm/dapl_ib_util.h \
dapl/openib_mcm/linux/openib_osd.h \
+ dapl/svc/mpxy.h \
dapl/svc/mpxyd.h \
dat/udat/libdat2.map \
dapl/udapl/libdaplofa.map \
OP_RDMA_WRITE);
dapl_dbg_log(DAPL_DBG_TYPE_RTN,
- "dapl_ep_post_rdma_write () returns 0x%x", dat_status);
+ "dapl_ep_post_rdma_write () returns 0x%x\n", dat_status);
return dat_status;
}
{
#ifdef DAPL_DBG
dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t >>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<\n");
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t dapl_evd_dto_callback : CQE \n");
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t\t work_req_id %lli\n", DAPL_GET_CQE_WRID(cqe_ptr));
- if (DAPL_GET_CQE_STATUS(cqe_ptr) == 0) {
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t\t op_type: %s\n",
- DAPL_GET_CQE_OP_STR(cqe_ptr));
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t\t bytes_num %d\n",
- DAPL_GET_CQE_BYTESNUM(cqe_ptr));
- }
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t\t status %d vendor_err 0x%x\n",
+ "DTO CQE: WR 0x%llx op %s ln %d stat %d vn 0x%x\n",
+ DAPL_GET_CQE_WRID(cqe_ptr),
+ DAPL_GET_CQE_OP_STR(cqe_ptr),
+ DAPL_GET_CQE_BYTESNUM(cqe_ptr),
DAPL_GET_CQE_STATUS(cqe_ptr),
DAPL_GET_CQE_VENDOR_ERR(cqe_ptr));
- dapl_dbg_log(DAPL_DBG_TYPE_CALLBACK,
- "\t >>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<\n");
#endif
return;
}
DAT_RETURN dapls_evd_cqe_to_event(DAPL_EVD * evd_ptr, ib_work_completion_t *cqe)
{
- DAT_RETURN dat_status;
DAT_EVENT *event;
if (evd_ptr->ib_cq_handle == IB_INVALID_HANDLE)
if (hca_ptr->ib_trans.ib_cq_empty) {
struct ibv_comp_channel *channel;
- channel = hca_ptr->ib_trans.ib_cq_empty->ib_cq->channel;
- ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->ib_cq);
+ channel = hca_ptr->ib_trans.ib_cq_empty->cq->channel;
+ ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->cq);
ibv_destroy_comp_channel(channel);
}
IN DAPL_EVD * evd_ptr, IN DAT_COUNT * cqlen)
{
struct ibv_comp_channel *channel = NULL;
- int ret = ENOMEM;
+ int opts, ret = ENOMEM;
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
"dapls_ib_cq_alloc: evd %p cqlen=%d \n", evd_ptr, *cqlen);
if (MXS_EP(&ia_ptr->hca_ptr->ib_trans.addr))
return DAT_SUCCESS;
}
+ dapl_llist_init_entry(&evd_ptr->ib_cq_handle->entry);
#endif
if (!evd_ptr->cno_ptr)
channel = ibv_create_comp_channel(ia_ptr->hca_ptr->ib_hca_handle);
if (!channel)
goto err;
- evd_ptr->ib_cq_handle->ib_cq =
+ /* move channel FD to non-blocking */
+ opts = fcntl(channel->fd, F_GETFL);
+ if (opts < 0 || fcntl(channel->fd, F_SETFL, opts | O_NONBLOCK) < 0) {
+ dapl_log(DAPL_DBG_TYPE_ERR,
+ " dapls_config_fd: fcntl on channel->fd %d ERR %d %s\n",
+ channel->fd, opts, strerror(errno));
+ goto err;
+ }
+ evd_ptr->ib_cq_handle->cq =
ibv_create_cq(ia_ptr->hca_ptr->ib_hca_handle,
*cqlen, evd_ptr, channel, 0);
- if (!evd_ptr->ib_cq_handle->ib_cq)
+ if (!evd_ptr->ib_cq_handle->cq)
goto err;
/* arm cq for events */
dapls_set_cq_notify(ia_ptr, evd_ptr);
/* update with returned cq entry size */
- *cqlen = evd_ptr->ib_cq_handle->ib_cq->cqe;
+ *cqlen = evd_ptr->ib_cq_handle->cq->cqe;
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
"dapls_ib_cq_alloc: new_cq %p cqlen=%d \n",
if (evd_ptr->ib_cq_handle != IB_INVALID_HANDLE) {
#ifdef _OPENIB_MCM_
+ /* remove from device PI processing list */
+ dapl_os_lock(&ia_ptr->hca_ptr->ib_trans.cqlock);
+ if (evd_ptr->ib_cq_handle->entry.list_head)
+ dapl_llist_remove_entry(&ia_ptr->hca_ptr->ib_trans.cqlist,
+ &evd_ptr->ib_cq_handle->entry);
+ dapl_os_unlock(&ia_ptr->hca_ptr->ib_trans.cqlock);
+
/* shadow support, MPXYD */
if (ia_ptr->hca_ptr->ib_trans.scif_ep) {
dapli_mix_cq_free(evd_ptr->ib_cq_handle);
- if (!evd_ptr->ib_cq_handle->ib_cq) {
+ if (!evd_ptr->ib_cq_handle->cq) {
dapl_os_free(evd_ptr->ib_cq_handle,
sizeof(struct dcm_ib_cq));
evd_ptr->ib_cq_handle = IB_INVALID_HANDLE;
}
#endif
/* pull off CQ and EVD entries and toss */
- while (ibv_poll_cq(evd_ptr->ib_cq_handle->ib_cq, 1, &wc) == 1) ;
+ while (ibv_poll_cq(evd_ptr->ib_cq_handle->cq, 1, &wc) == 1) ;
while (dapl_evd_dequeue(evd_ptr, &event) == DAT_SUCCESS) ;
- channel = evd_ptr->ib_cq_handle->ib_cq->channel;
- if (ibv_destroy_cq(evd_ptr->ib_cq_handle->ib_cq))
+ channel = evd_ptr->ib_cq_handle->cq->channel;
+ if (ibv_destroy_cq(evd_ptr->ib_cq_handle->cq))
return (dapl_convert_errno(errno, "ibv_destroy_cq"));
if (!evd_ptr->cno_ptr)
ibv_destroy_comp_channel(channel);
DAT_RETURN
dapls_evd_dto_wakeup(IN DAPL_EVD * evd_ptr)
{
- dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " cq_object_wakeup: evd=%p\n", evd_ptr);
+ dapl_log(DAPL_DBG_TYPE_EVD, " cq_object_wakeup: EVD %p CQ %p\n",
+ evd_ptr, evd_ptr->ib_cq_handle);
/* EVD with CNO; waiting on OS wait object */
- if (evd_ptr->cno_ptr)
+ if (evd_ptr->cno_ptr) {
dapl_os_wait_object_wakeup(&evd_ptr->wait_object);
+ return DAT_SUCCESS;
+ }
#ifdef _OPENIB_MCM_
- if (evd_ptr->ib_cq_handle->tp->scif_ep &&
- ((evd_ptr->ib_cq_handle->type & DCM_CQ_SND) ||
- (!evd_ptr->ib_cq_handle->ib_cq))) {
+{
+ int flags = evd_ptr->ib_cq_handle->flags;
+ if (((flags & DCM_CQ_TX) && (flags & DCM_CQ_TX_INDIRECT)) ||
+ ((flags & DCM_CQ_RX) && (flags & DCM_CQ_RX_INDIRECT)))
dapl_os_wait_object_wakeup(&evd_ptr->wait_object);
- }
+}
#endif
/* otherwise, no wake up mechanism */
return DAT_SUCCESS;
void *context;
int status;
- dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " cq_object_wait: EVD %p time %d\n",
- evd_ptr, timeout);
+ dapl_log(DAPL_DBG_TYPE_EVD,
+ " cq_object_wait: EVD %p CQ %p time %d\n",
+ evd_ptr, evd_ptr->ib_cq_handle, timeout);
#ifdef _OPENIB_MCM_
- if (evd_ptr->ib_cq_handle->tp->scif_ep &&
- ((evd_ptr->ib_cq_handle->type & DCM_CQ_SND) ||
- (!evd_ptr->ib_cq_handle->ib_cq))) {
+{
+ int flags = evd_ptr->ib_cq_handle->flags;
+ if (((flags & DCM_CQ_TX) && (flags & DCM_CQ_TX_INDIRECT)) ||
+ ((flags & DCM_CQ_RX) && (flags & DCM_CQ_RX_INDIRECT))) {
return (dapl_os_wait_object_wait(&evd_ptr->wait_object, timeout));
}
+}
#endif
- channel = evd_ptr->ib_cq_handle->ib_cq->channel;
+
+ channel = evd_ptr->ib_cq_handle->cq->channel;
status = dapls_wait_comp_channel(channel, timeout);
if (!status) {
if (!ibv_get_cq_event(channel, &ibv_cq, &context)) {
*/
DAT_RETURN dapls_set_cq_notify(IN DAPL_IA * ia_ptr, IN DAPL_EVD * evd_ptr)
{
- if (evd_ptr->ib_cq_handle->ib_cq &&
- ibv_req_notify_cq(evd_ptr->ib_cq_handle->ib_cq, 0))
+ if (evd_ptr->ib_cq_handle->cq &&
+ ibv_req_notify_cq(evd_ptr->ib_cq_handle->cq, 0))
return (dapl_convert_errno(errno, "notify_cq"));
else
return DAT_SUCCESS;
IN DAPL_EVD * evd_ptr,
IN ib_notification_type_t type)
{
- if (evd_ptr->ib_cq_handle->ib_cq &&
- ibv_req_notify_cq(evd_ptr->ib_cq_handle->ib_cq, type))
+ if (evd_ptr->ib_cq_handle->cq &&
+ ibv_req_notify_cq(evd_ptr->ib_cq_handle->cq, type))
return (dapl_convert_errno(errno, "notify_cq_type"));
else
return DAT_SUCCESS;
int ret;
#ifdef _OPENIB_MCM_
- if (evd_ptr->ib_cq_handle->tp->scif_ep &&
- ((evd_ptr->ib_cq_handle->type & DCM_CQ_SND) ||
- (!evd_ptr->ib_cq_handle->ib_cq))) {
- ret = dapli_mix_cq_poll(evd_ptr->ib_cq_handle, wc_ptr);
- if (ret == 1)
- return DAT_SUCCESS;
- else
- return DAT_QUEUE_EMPTY;
+ if (!evd_ptr->ib_cq_handle->cq) /* proxy service, no direct CQ */
+ return DAT_QUEUE_EMPTY;
+#endif
+ ret = ibv_poll_cq(evd_ptr->ib_cq_handle->cq, 1, wc_ptr);
+
+#ifdef _OPENIB_MCM_
+ /*
+ * HST->MXS, we need to intercept direct TX WC in flight
+ * because there is no way to know if indirect or direct CQ
+ * service is needed at the time of QP creation.
+ */
+ if (ret==1) {
+ DAPL_EP *ep_ptr;
+ DAPL_COOKIE *cookie = (DAPL_COOKIE *)(uintptr_t) DAPL_GET_CQE_WRID(wc_ptr);
+
+ dapl_os_assert((NULL != cookie));
+ ep_ptr = cookie->ep;
+ dapl_os_assert((NULL != ep_ptr));
+ if ((!ep_ptr->qp_handle->tp->scif_ep) &&
+ (ep_ptr->qp_handle->ep_map == MIC_XSOCK_DEV) &&
+ (DAPL_GET_CQE_OPTYPE(wc_ptr) == OP_RDMA_WRITE_IMM)) {
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " MCM_ib_completion_poll: RW_imm: evd %p ep %p st %d op %s ln %d\n",
+ evd_ptr, ep_ptr,
+ DAPL_GET_CQE_STATUS(wc_ptr),
+ DAPL_GET_CQE_OP_STR(wc_ptr),
+ DAPL_GET_CQE_BYTESNUM(wc_ptr));
+ ret = 0; /* WR RW_imm to PI, WC pending from PI */
+ }
}
#endif
- ret = ibv_poll_cq(evd_ptr->ib_cq_handle->ib_cq, 1, wc_ptr);
if (ret == 1)
return DAT_SUCCESS;
#define true 1
#endif /*__cplusplus */
-#define DCM_CQ_SND 0x1
-#define DCM_CQ_RCV 0x2
-
/* Typedefs to map common DAPL provider types to IB verbs */
struct dcm_ib_qp {
- uint64_t qp_ctx; /* local */
- uint64_t sqp_ctx; /* shadow */
struct _ib_hca_transport *tp;
- struct dapl_ep *ep;
- struct ibv_qp *qp; /* local */
- struct ibv_qp *sqp; /* shadow */
- uint32_t qp_id; /* local */
- uint32_t sqp_id; /* shadow */
- uint64_t m_off; /* MPXYD buf pool, SCIF remote */
- uint32_t m_hd; /* xfer head offset */
- uint32_t m_tl; /* xfer tail offset */
- uint32_t m_len; /* total size in bytes */
- uint32_t m_seg; /* segment size */
- uint64_t wr_off; /* MPXYD buf pool, SCIF remote */
- uint64_t wr_hd; /* work request head offset */
- uint64_t wr_tl; /* work request tail offset */
- uint32_t wr_len; /* size */
- uint32_t m_inline; /* SCIF dma inline threshold */
+ struct dapl_ep *ep;
+ struct ibv_qp *qp; /* local QP1 snd-rcv or rcv from PO */
+#ifdef _OPENIB_MCM_
+ struct dcm_ib_cq *req_cq; /* ref to req CQ for HST->MXS */
+ struct dcm_ib_cq *rcv_cq; /* ref to rcv CQ for HST->MXS */
+ struct ibv_qp *qp2; /* local QP2 snd-rcv to-from PI */
+ uint32_t qp_id; /* proxy, SND or RCV */
+ uint32_t m_inline; /* proxy inline threshold */
+ uint32_t wr_hd; /* PO,PI work request head */
+ uint32_t wr_tl; /* PO,PI work request tail */
+ uint32_t wc_tl; /* local WC tail */
+ uint64_t wc_addr; /* WC queue for remote PI */
+ struct ibv_mr *wc_mr; /* WC IB mr info */
+ struct mcm_wrc_info wrc; /* local WC info */
+ struct mcm_wrc_info wrc_rem; /* remote WR info */
+ DAPL_OS_LOCK lock; /* Proxy WR and WC queues */
+ uint8_t ep_map; /* Peer EP mapping, MXS, MSS, HST */
+#endif
};
+#define DCM_CQ_TX 0x1
+#define DCM_CQ_RX 0x2
+#define DCM_CQ_TX_INDIRECT 0x4
+#define DCM_CQ_RX_INDIRECT 0x8
+
struct dcm_ib_cq {
- uint64_t cq_ctx; /* local */
- uint64_t scq_ctx; /* shadow */
struct _ib_hca_transport *tp;
- struct dapl_evd *evd;
- struct ibv_cq *ib_cq;
- struct ibv_comp_channel *ib_ch;
- uint32_t cq_id; /* local */
- uint32_t scq_id; /* shadow */
- int type;
+ struct dapl_evd *evd;
+ struct ibv_cq *cq; /* CQ -> QP1, local rcv, direct */
+ int flags;
+#ifdef _OPENIB_MCM_
+ struct dapl_llist_entry entry;
+ uint32_t cq_id; /* proxy, SND or RCV */
+#endif
};
typedef struct dcm_ib_cq *ib_cq_handle_t;
#define _DAPL_IB_DTO_H_
#include "dapl_ib_util.h"
+#include "dapl_ep_util.h"
#ifdef DAT_EXTENSIONS
#include <dat2/dat_ib_extensions.h>
int ret;
dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " post_snd: ep %p op %d ck %p u_ck %llx sgs %d l_iov %p r_iov %p f %d\n",
- ep_ptr, op_type, cookie,
- cookie->val.dto.cookie.as_64, segments, local_iov,
- remote_iov, completion_flags);
+ " post_snd: %s ep %p op %d ck %p u_ck %llx sgs %d l_iov %p r_iov %p f %d\n",
+ PROVIDER_NAME, ep_ptr, op_type, cookie,
+ cookie->val.dto.cookie.as_64, segments, local_iov,
+ remote_iov, completion_flags);
#ifdef DAT_EXTENSIONS
if (ep_ptr->param.ep_attr.service_type != DAT_SERVICE_TYPE_RC)
if (cookie != NULL) {
for (i = 0; i < segments; i++ ) {
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
+ dapl_dbg_log(DAPL_DBG_TYPE_EP,
" post_snd: lkey 0x%x va %p len %d\n",
ds->lkey, ds->addr, ds->length );
total_len += ds->length;
(op_type == OP_RDMA_WRITE || op_type == OP_RDMA_READ)) {
wr.wr.rdma.remote_addr = remote_iov->virtual_address;
wr.wr.rdma.rkey = remote_iov->rmr_context;
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
+ dapl_dbg_log(DAPL_DBG_TYPE_EP,
" post_snd_rdma: rkey 0x%x va %#016Lx\n",
wr.wr.rdma.rkey, wr.wr.rdma.remote_addr);
}
wr.send_flags |= (DAT_COMPLETION_SOLICITED_WAIT_FLAG &
completion_flags) ? IBV_SEND_SOLICITED : 0;
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
+ dapl_log(DAPL_DBG_TYPE_EP,
" post_snd: op 0x%x flags 0x%x sglist %p, %d\n",
wr.opcode, wr.send_flags, wr.sg_list, wr.num_sge);
#ifdef _OPENIB_MCM_
if (ep_ptr->qp_handle->tp->scif_ep)
ret = dapli_mix_post_send(ep_ptr->qp_handle, total_len, &wr, &bad_wr);
+ else if (ep_ptr->qp_handle->ep_map == MIC_XSOCK_DEV)
+ ret = mcm_send_pi(ep_ptr->qp_handle, total_len, &wr, &bad_wr);
else
- ret = ibv_post_send(ep_ptr->qp_handle->sqp, &wr, &bad_wr);
+ ret = ibv_post_send(ep_ptr->qp_handle->qp2, &wr, &bad_wr);
#else
ret = ibv_post_send(ep_ptr->qp_handle->qp, &wr, &bad_wr);
#endif
dapl_dbg_log(DAPL_DBG_TYPE_EP,
" post_ext_snd: ep %p op %d ck %p sgs",
- "%d l_iov %p r_iov %p f %d\n",
+ "%d l_iov %p r_iov %p f %d ah %p\n",
ep_ptr, op_type, cookie, segments, local_iov,
remote_iov, completion_flags, remote_ah);
wr.opcode, wr.send_flags, wr.sg_list, wr.num_sge);
#ifdef _OPENIB_MCM_
- if (op_type != OP_RDMA_WRITE_IMM)
- return DAT_NOT_IMPLEMENTED;
-
if (ep_ptr->qp_handle->tp->scif_ep)
ret = dapli_mix_post_send(ep_ptr->qp_handle, total_len, &wr, &bad_wr);
+ else if (ep_ptr->qp_handle->ep_map == MIC_XSOCK_DEV)
+ ret = mcm_send_pi(ep_ptr->qp_handle, total_len, &wr, &bad_wr);
else
- ret = ibv_post_send(ep_ptr->qp_handle->sqp, &wr, &bad_wr);
+ ret = ibv_post_send(ep_ptr->qp_handle->qp2, &wr, &bad_wr);
#else
ret = ibv_post_send(ep_ptr->qp_handle->qp, &wr, &bad_wr);
#endif
if (ret)
- return( dapl_convert_errno(errno,"ibv_send") );
+ return( dapl_convert_errno(errno,"ibv_send_ext") );
#ifdef DAPL_COUNTERS
switch (op_type) {
IN DAT_MEM_PRIV_FLAGS privileges, IN DAT_VA_TYPE va_type)
{
struct ibv_device *ibv_dev = ia_ptr->hca_ptr->ib_hca_handle->device;
+ int ib_access = dapls_convert_privileges(privileges);
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
" mr_register: ia=%p, lmr=%p va=%p ln=%d pv=0x%x\n",
return DAT_ERROR(DAT_NOT_IMPLEMENTED, DAT_NO_SUBTYPE);
}
+#ifdef _OPENIB_MCM_
+ ib_access |= IBV_ACCESS_REMOTE_READ; /* HST->MXS, peer PI RR */
+#endif
+
/* local read is default on IB */
lmr->mr_handle =
ibv_reg_mr(((DAPL_PZ *) lmr->param.pz_handle)->pd_handle,
- virt_addr, length, dapls_convert_privileges(privileges));
+ virt_addr, length, ib_access);
if (!lmr->mr_handle)
return (dapl_convert_errno(ENOMEM, "reg_mr"));
}
#endif
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " mr_register: ia=%p, lmr=%p va=%p ln=%d return\n",
+ " mr_register: ia=%p, lmr=%p va=%p ln=%d priv=%d return\n",
ia_ptr, lmr, virt_addr, length, privileges);
return DAT_SUCCESS;
ib_cq_handle_t rcv_cq, req_cq;
ib_pd_handle_t ib_pd_handle;
int ret = EINVAL;
+ int max_inline = ia_ptr->hca_ptr->ib_trans.max_inline_send;
struct ibv_qp_init_attr qp_create;
#ifdef _OPENIB_CMA_
dp_ib_cm_handle_t conn;
dapl_os_memzero(rcv_cq, sizeof(struct dcm_ib_cq));
- rcv_cq->ib_cq = ibv_create_cq(ia_ptr->hca_ptr->ib_hca_handle,
+ rcv_cq->cq = ibv_create_cq(ia_ptr->hca_ptr->ib_hca_handle,
1, NULL, channel, 0);
- if (!rcv_cq->ib_cq) {
+ if (!rcv_cq->cq) {
ibv_destroy_comp_channel(channel);
return (dapl_convert_errno(ENOMEM, "QP create_cq"));
}
#endif
/* Setup attributes and create qp */
dapl_os_memzero((void *)&qp_create, sizeof(qp_create));
- qp_create.recv_cq = rcv_cq->ib_cq;
+ qp_create.recv_cq = rcv_cq->cq;
qp_create.cap.max_recv_wr = rcv_evd ? attr->max_recv_dtos:0;
qp_create.cap.max_recv_sge = rcv_evd ? attr->max_recv_iov:0;
- qp_create.send_cq = req_cq->ib_cq;
+ qp_create.send_cq = req_cq->cq;
qp_create.cap.max_send_wr = req_evd ? attr->max_request_dtos:0;
qp_create.cap.max_send_sge = req_evd ? attr->max_request_iov:0;
- qp_create.cap.max_inline_data = ia_ptr->hca_ptr->ib_trans.max_inline_send;
+ qp_create.cap.max_inline_data = max_inline;
qp_create.qp_type = IBV_QPT_RC;
qp_create.qp_context = (void *)ep_ptr;
#else
#ifdef _OPENIB_MCM_
- /* mark type of CQ */
- req_cq->type = DCM_CQ_SND;
- rcv_cq->type |= DCM_CQ_RCV;
+ if (dapl_os_lock_init(&ep_ptr->qp_handle->lock))
+ goto err;
+
+ /* mark type of CQ, ref to QP */
+ req_cq->flags |= DCM_CQ_TX;
+ rcv_cq->flags |= DCM_CQ_RX;
/* save resources, 1st QP is receiver, 2nd is sender */
if (ia_ptr->hca_ptr->ib_trans.scif_ep) {
/* Don't create any QP if MIC xsocket, QPt and QPr both on MPXYD */
if (!ia_ptr->hca_ptr->ib_trans.scif_ep ||
(ia_ptr->hca_ptr->ib_trans.scif_ep &&
- !MXS_EP(&ia_ptr->hca_ptr->ib_trans.addr)))
+ !MXS_EP(&ia_ptr->hca_ptr->ib_trans.addr))) {
+ /* QP1 needed for RX only, set QP1 req_cq empty */
+ qp_create.send_cq = ia_ptr->hca_ptr->ib_trans.ib_cq_empty->cq;
+ } else {
+ goto skip_qp;
+ }
#endif
- {
- ep_ptr->qp_handle->qp = ibv_create_qp(ib_pd_handle, &qp_create);
- if (!ep_ptr->qp_handle->qp) {
- dapl_log(1," qp_alloc ERR %d %s line %d on device %s\n",
- errno, strerror(errno), __LINE__ ,
- ibv_get_device_name(ia_ptr->hca_ptr->ib_trans.ib_dev));
- ret = errno;
- goto err;
- }
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " QP_ALLOC: QPr 0x%x sq %d,%d rq %d,%d\n",
- ep_ptr->qp_handle->qp->qp_num,
- qp_create.cap.max_send_wr,
- qp_create.cap.max_send_sge,
- qp_create.cap.max_recv_wr,
- qp_create.cap.max_recv_sge);
+ ep_ptr->qp_handle->qp = ibv_create_qp(ib_pd_handle, &qp_create);
+ if (!ep_ptr->qp_handle->qp) {
+ dapl_log(1," qp_alloc ERR %d %s line %d on device %s\n",
+ errno, strerror(errno), __LINE__ ,
+ ibv_get_device_name(ia_ptr->hca_ptr->ib_trans.ib_dev));
+ ret = errno;
+ goto err;
}
+ dapl_dbg_log(DAPL_DBG_TYPE_EP,
+ " QP_ALLOC: QPr %p-0x%x SQ %d,%d cq %p, RQ %d,%d cq %p\n",
+ ep_ptr->qp_handle->qp, ep_ptr->qp_handle->qp->qp_num,
+ qp_create.cap.max_send_wr, qp_create.cap.max_send_sge,
+ req_cq, qp_create.cap.max_recv_wr,
+ qp_create.cap.max_recv_sge, rcv_cq);
+
#ifdef _OPENIB_MCM_
+skip_qp:
/* shadow support, MPXYD */
- ep_ptr->qp_handle->qp_ctx = (uint64_t)ep_ptr;
ep_ptr->qp_handle->qp_id = 0;
if (ia_ptr->hca_ptr->ib_trans.scif_ep) { /* MIC: shadow QPt on proxy */
+ req_cq->flags |= DCM_CQ_TX_INDIRECT;
+ qp_create.send_cq = req_cq->cq;
qp_create.cap.max_inline_data = 32; /* setup for bw not latency */
qp_create.cap.max_send_wr = attr->max_request_dtos;
qp_create.cap.max_send_sge = attr->max_request_iov;
- if (ep_ptr->qp_handle->qp) {
- qp_create.cap.max_recv_wr = 1; /* MIC: unused shadow QPr on proxy */
+ if (ep_ptr->qp_handle->qp) { /* MIC: unused shadow QPr on proxy */
+ qp_create.cap.max_recv_wr = 1;
qp_create.cap.max_recv_sge = 1;
- } else {
- qp_create.cap.max_recv_wr = attr->max_recv_dtos; /* MIC: shadow QPr on proxy */
+ } else { /* MIC: shadow QPr on proxy */
+ rcv_cq->flags |= DCM_CQ_RX_INDIRECT;
+ qp_create.cap.max_recv_wr = attr->max_recv_dtos;
qp_create.cap.max_recv_sge = attr->max_recv_iov;
}
dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " QP_ALLOC: QPt -> (MPXYD) sq %d,%d %s rq %d,%d\n",
+ " QP_ALLOC: QPt (MPXYD) SQ %d,%d %s RQ %d,%d\n",
qp_create.cap.max_send_wr, qp_create.cap.max_send_sge,
ep_ptr->qp_handle->qp ? "":"QPr",
qp_create.cap.max_recv_wr, qp_create.cap.max_recv_sge);
if (ret)
goto err;
} else {
- /* NON-MIC: need QPt, in case of shadowed QP's from MIC's */
- qp_create.cap.max_recv_wr = 1;
+ /* NON-MIC: need QPt, in case of shadowed QP's on remote MIC's */
+ /* Prep for HST -> MXS: xfers via remote PI instead of direct */
+ ia_ptr->hca_ptr->ib_trans.max_inline_send =
+ DAPL_MAX(sizeof(struct mcm_wr_rx), max_inline);
+ /* create CQ for peer PI, HST->MXS case */
+ if (mcm_create_pi_cq(ep_ptr->qp_handle, MCM_WRC_QLEN))
+ goto err;
+ qp_create.recv_cq = ep_ptr->qp_handle->rcv_cq->cq;
+ qp_create.send_cq = req_cq->cq;
+ qp_create.cap.max_recv_wr = MCM_WRC_QLEN;
qp_create.cap.max_recv_sge = 1;
- ep_ptr->qp_handle->sqp = ibv_create_qp(ib_pd_handle, &qp_create);
- if (!ep_ptr->qp_handle->sqp) {
+ qp_create.cap.max_send_wr = DAPL_MAX(MCM_WRC_QLEN, attr->max_request_dtos);
+ qp_create.cap.max_send_sge = attr->max_request_iov;
+ qp_create.cap.max_inline_data = ia_ptr->hca_ptr->ib_trans.max_inline_send;
+
+ ep_ptr->qp_handle->qp2 = ibv_create_qp(ib_pd_handle, &qp_create);
+ if (!ep_ptr->qp_handle->qp2) {
ret = errno;
goto err;
}
- if (dapls_modify_qp_state(ep_ptr->qp_handle->sqp,
+ /* enable RR from remote PI */
+ if (dapls_modify_qp_state(ep_ptr->qp_handle->qp2,
IBV_QPS_INIT, 0, 0, 0) != DAT_SUCCESS) {
- ibv_destroy_qp(ep_ptr->qp_handle->sqp);
ret = errno;
goto err;
}
+ ep_ptr->qp_handle->req_cq = req_cq;
+
dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " 3 - QP_ALLOC: QP (LOCAL) QPt 0x%x sq %d,%d QPr rq %d,%d\n",
- ep_ptr->qp_handle->sqp->qp_num,
+ " QP_ALLOC: QPt %p-0x%x SQ %d,%d cq %p, RQ %d,%d cq %p, il %d\n",
+ ep_ptr->qp_handle->qp2, ep_ptr->qp_handle->qp2->qp_num,
qp_create.cap.max_send_wr, qp_create.cap.max_send_sge,
- qp_create.cap.max_recv_wr, qp_create.cap.max_recv_sge);
+ ep_ptr->qp_handle->rcv_cq, qp_create.cap.max_recv_wr,
+ qp_create.cap.max_recv_sge, ep_ptr->qp_handle->req_cq,
+ qp_create.cap.max_inline_data);
}
if (!ep_ptr->qp_handle->qp) { /* QPr and QPs both shadowed */
ep_ptr->qp_state = IBV_QPS_INIT;
/* Setup QP attributes for INIT state on the way out */
if (dapls_modify_qp_state(ep_ptr->qp_handle->qp,
IBV_QPS_INIT, 0, 0, 0) != DAT_SUCCESS) {
- ibv_destroy_qp(ep_ptr->qp_handle->qp);
ret = errno;
goto err;
}
return DAT_SUCCESS;
err:
- if (ep_ptr->qp_handle)
+ if (ep_ptr->qp_handle) {
+ if (ep_ptr->qp_handle->qp)
+ ibv_destroy_qp(ep_ptr->qp_handle->qp);
+#ifdef _OPENIB_MCM_
+ if (ep_ptr->qp_handle->qp2)
+ ibv_destroy_qp(ep_ptr->qp_handle->qp2);
+#endif
dapl_os_free(ep_ptr->qp_handle, sizeof(struct dcm_ib_qp));
-
+ }
ep_ptr->qp_handle = IB_INVALID_HANDLE;
return (dapl_convert_errno(ret, "create_qp"));
}
struct ibv_qp *qp;
struct ibv_qp_attr qp_attr;
+ dapl_log(DAPL_DBG_TYPE_EP,
+ " dapls_ib_qp_free: ep_ptr %p qp_handle %p\n",
+ ep_ptr, ep_ptr->qp_handle);
+
#ifdef _OPENIB_CMA_
dp_ib_cm_handle_t cm_ptr = dapl_get_cm_from_ep(ep_ptr);
if (!cm_ptr)
dapli_mix_qp_free(ep_ptr->qp_handle);
else /* NON MIC: local shadow queue */
- ibv_destroy_qp(ep_ptr->qp_handle->sqp);
+ ibv_destroy_qp(ep_ptr->qp_handle->qp2);
+
+ dapl_os_lock_destroy(&ep_ptr->qp_handle->lock);
+ mcm_destroy_pi_cq(ep_ptr->qp_handle);
+ mcm_destroy_wc_q(ep_ptr->qp_handle);
#endif
} else {
dapl_os_unlock(&ep_ptr->header.lock);
switch (qp_state) {
case IBV_QPS_RTR:
- dapl_dbg_log(DAPL_DBG_TYPE_EP,
- " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
- " port %d ep %p qp_state %d rd_atomic %d\n",
- qp_handle->qp_type, qp_handle->qp_num,
- ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
- ep_ptr, ep_ptr->qp_state,
- ep_ptr->param.ep_attr.max_rdma_read_in);
-
mask |= IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
qp_attr.dest_qp_num = ntohl(qpn);
qp_attr.rq_psn = 1;
qp_attr.path_mtu = ia_ptr->hca_ptr->ib_trans.mtu;
+ qp_attr.min_rnr_timer = ia_ptr->hca_ptr->ib_trans.rnr_timer;
#ifdef _OPENIB_MCM_
- qp_attr.max_dest_rd_atomic = 4;
+ qp_attr.max_dest_rd_atomic = ia_ptr->hca_ptr->ib_trans.rd_atom_in;
#else
qp_attr.max_dest_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_in;
#endif
- qp_attr.min_rnr_timer = ia_ptr->hca_ptr->ib_trans.rnr_timer;
+ dapl_dbg_log(DAPL_DBG_TYPE_EP,
+ " QPS_RTR: type %d l_qpn %x qpn %x lid 0x%x"
+ " port %d ep %p qp_state %d rd_atomic %d\n",
+ qp_handle->qp_type, qp_handle->qp_num,
+ ntohl(qpn), ntohs(lid), ia_ptr->hca_ptr->port_num,
+ ep_ptr, ep_ptr->qp_state, qp_attr.max_dest_rd_atomic);
/* address handle. RC and UD */
qp_attr.ah_attr.dlid = ntohs(lid);
qp_attr.rnr_retry =
ia_ptr->hca_ptr->ib_trans.rnr_retry;
#ifdef _OPENIB_MCM_
- qp_attr.max_rd_atomic = 4;
+ qp_attr.max_rd_atomic = ia_ptr->hca_ptr->ib_trans.rd_atom_out;
#else
- qp_attr.max_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_out;
+ qp_attr.max_rd_atomic = ep_ptr->param.ep_attr.max_rdma_read_out;
#endif
}
/* RC and UD */
#ifdef _OPENIB_MCM_
/* Adjust for CCL Proxy; limited sge's, no READ support, reduce QP and RDMA limits */
- if (hca_ptr->ib_trans.scif_ep) {
- dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
- dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
- dapl_os_get_env_val("DAPL_MCM_WR_MAX", DAT_MIX_WR_MAX));
- port_attr.max_msg_sz = DAPL_MIN(port_attr.max_msg_sz,
- dapl_os_get_env_val("DAPL_MCM_MSG_MAX", DAT_MIX_RDMA_MAX));
- }
+ dev_attr.max_sge = DAPL_MIN(dev_attr.max_sge, DAT_MIX_SGE_MAX);
+ dev_attr.max_qp_wr = DAPL_MIN(dev_attr.max_qp_wr,
+ dapl_os_get_env_val("DAPL_MCM_WR_MAX", DAT_MIX_WR_MAX));
+ port_attr.max_msg_sz = DAPL_MIN(port_attr.max_msg_sz,
+ dapl_os_get_env_val("DAPL_MCM_MSG_MAX", DAT_MIX_RDMA_MAX));
#endif
if (ia_attr != NULL) {
static int mcm_send(ib_hca_transport_t *tp, dat_mcm_msg_t *msg, DAT_PVOID p_data, DAT_COUNT p_size);
DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm);
DAT_RETURN dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm);
+static void mcm_log_addrs(int lvl, struct dat_mcm_msg *msg, int state, int in);
/* Service ids - port space */
static uint16_t mcm_get_port(ib_hca_transport_t *tp, uint16_t port)
msg = (dat_mcm_msg_t*) (uintptr_t) wc[i].wr_id;
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " mcm_recv: stat=%d op=%s ln=%d id=%p sqp=%x\n",
+ " mcm_recv: stat=%d op=%s ln=%d id=%p qp2=%x\n",
wc[i].status, dapl_cm_op_str(ntohs(msg->op)),
wc[i].byte_len,
(void*)wc[i].wr_id, wc[i].src_qp);
goto bail;
}
cm->msg.sqpn = htonl(hca->ib_trans.qp->qp_num); /* ucm */
- cm->msg.saddr2.qpn = htonl(ep->qp_handle->sqp->qp_num); /* QPt */
+ cm->msg.saddr2.qpn = htonl(ep->qp_handle->qp2->qp_num); /* QPt */
cm->msg.saddr2.qp_type = ep->qp_handle->qp->qp_type;
cm->msg.saddr2.lid = hca->ib_trans.addr.lid;
cm->msg.saddr2.ep_map = hca->ib_trans.addr.ep_map;
return;
}
- /* save remote address information to EP and CM */
+ /* CM_REP: save remote address information to EP and CM */
cm->msg.d_id = msg->s_id;
dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&cm->msg.daddr1, &msg->saddr1, sizeof(dat_mcm_addr_t));
+ dapl_os_memcpy(&cm->msg.p_proxy, &msg->p_proxy, DAT_MCM_PROXY_DATA);
/* validate private data size, and copy if necessary */
if (msg->p_size) {
}
/* QP to RTR-RTS with remote QPr (daddr1) info */
- if (!cm->tp->scif_ep) { /* NON-MIC, sQP is local and not on MPXYD */
+ if (!cm->tp->scif_ep) { /* NON-MIC, qp2 is local and not on MPXYD */
ret = dapls_modify_qp_rtu(
- cm->ep->qp_handle->sqp,
+ cm->ep->qp_handle->qp2,
cm->msg.daddr1.qpn,
cm->msg.daddr1.lid,
(ib_gid_handle_t)cm->msg.daddr1.gid);
event = IB_CME_LOCAL_FAILURE;
goto bail;
}
+ /* MXS peer: setup PI WC and save peer WR queue info */
+ if (MXS_EP(&cm->msg.daddr1)) {
+ /* save PI WR info, create local WC_q, send back WC info */
+ mcm_ntoh_wrc(&ep->qp_handle->wrc_rem, (mcm_wrc_info_t*)cm->msg.p_proxy);
+ mcm_create_wc_q(ep->qp_handle, MCM_WRC_QLEN);
+ mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
+ ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+
+ /* post 0-byte rcv for inbound WC's via RW_imm */
+ if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
+ goto bail;
+
+ dapl_log(DAPL_DBG_TYPE_CM,
+ "CONN_RTU: WR_rem %p sz %d, WC %p sz %d\n",
+ ep->qp_handle->wrc_rem.wr_addr,
+ ep->qp_handle->wrc_rem.wr_end+1,
+ ep->qp_handle->wrc.wc_addr,
+ ep->qp_handle->wrc.wc_end+1);
+ }
}
dapl_os_unlock(&cm->ep->header.lock);
cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x\n",
+ " mcm_ACTIVE_CONN %p %d [lid port qpn] %x %x %x -> %x %x %x %s\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+
+ mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 0);
+
return;
bail:
dapl_evd_connection_callback(NULL, event, cm->msg.p_data, ntohs(cm->msg.p_size), cm->ep);
/* CR saddr1 is CM daddr1 info, need EP for local saddr1 */
dapl_os_memcpy(&acm->msg.daddr1, &msg->saddr1, sizeof(dat_mcm_addr_t));
dapl_os_memcpy(&acm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
+ dapl_os_memcpy(&acm->msg.p_proxy, &msg->p_proxy, DAT_MCM_PROXY_DATA);
dapl_log(DAPL_DBG_TYPE_CM,
" accept: DST port=%x lid=%x, iqp=%x, iqp2=%x, psize=%d\n",
dapls_cr_callback(cm, IB_CME_CONNECTED, NULL, 0, cm->sp);
dapl_log(DAPL_DBG_TYPE_CM_EST,
- " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x\n",
+ " PASSIVE_CONN %p %d [lid port qpn] %x %x %x <- %x %x %x %s\n",
cm->hca, cm->retries, ntohs(cm->msg.saddr1.lid),
ntohs(cm->msg.sport), ntohl(cm->msg.saddr1.qpn),
ntohs(cm->msg.daddr1.lid), ntohs(cm->msg.dport),
- ntohl(cm->msg.dqpn));
+ ntohl(cm->msg.dqpn), mcm_map_str(cm->msg.daddr1.ep_map));
+
+ mcm_log_addrs(DAPL_DBG_TYPE_CM_EST, &cm->msg, cm->state, 1);
return;
bail:
DAPL_CNTR(((DAPL_IA *)dapl_llist_peek_head(&cm->hca->ia_list_head)), DCNT_IA_CM_ERR);
dapl_log(DAPL_DBG_TYPE_CM,
" MCM_ACCEPT_USR: ep %p cm %p QPt %p QPr %p p_data %p p_size %d\n",
- ep, cm, ep->qp_handle->sqp, ep->qp_handle->qp, p_data, p_size);
+ ep, cm, ep->qp_handle->qp2, ep->qp_handle->qp, p_data, p_size);
dapl_log(DAPL_DBG_TYPE_CM, " MCM_ACCEPT_USR: ep %p cm %p %s refs=%d"
- " %x %x i_%x i2_%x <- %x %x i1_%x i2_%x l_pid %x r_pid %x\n",
+ " %x %x i_%x i2_%x %s <- %x %x i1_%x i2_%x l_pid %x r_pid %x %s\n",
ep, cm, dapl_cm_state_str(cm->state), cm->ref_count,
htons(cm->hca->ib_trans.addr.lid), htons(cm->msg.sport),
ep->qp_handle->qp ? ep->qp_handle->qp->qp_num:0,
- ep->qp_handle->sqp ? ep->qp_handle->sqp->qp_num:0,
+ ep->qp_handle->qp2 ? ep->qp_handle->qp2->qp_num:0,
+ mcm_map_str(cm->hca->ib_trans.addr.ep_map),
htons(cm->msg.daddr1.lid), htons(cm->msg.dport),
htonl(cm->msg.daddr1.qpn), htonl(cm->msg.daddr2.qpn),
- ntohl(cm->msg.s_id), ntohl(cm->msg.d_id));
+ ntohl(cm->msg.s_id), ntohl(cm->msg.d_id),
+ mcm_map_str(cm->msg.daddr1.ep_map));
if (p_size > DAT_MCM_PDATA_SIZE)
return DAT_LENGTH_ERROR;
cm->ep, cm, dapl_cm_state_str(cm->state), cm->ref_count,
htons(cm->hca->ib_trans.addr.lid), htons(cm->msg.sport),
ep->qp_handle->qp ? ep->qp_handle->qp->qp_num:0,
- ep->qp_handle->sqp ? ep->qp_handle->sqp->qp_num:0,
+ ep->qp_handle->qp2 ? ep->qp_handle->qp2->qp_num:0,
htons(cm->msg.daddr1.lid), htons(cm->msg.dport),
htonl(cm->msg.daddr1.qpn), htonl(cm->msg.daddr2.qpn),
ntohl(cm->msg.s_id), ntohl(cm->msg.d_id));
goto bail;
}
}
- /* modify QPt to RTR and then to RTS, QPt (sqp) to remote QPr (daddr1) */
- if (!cm->tp->scif_ep) { /* NON-MIC, sQP is local and not on MPXYD */
- ret = dapls_modify_qp_rtu(ep->qp_handle->sqp,
+ /* modify QPt to RTR and then to RTS, QPt (qp2) to remote QPr (daddr1) */
+ if (!cm->tp->scif_ep) { /* NON-MIC, qp2 is local and not on MPXYD */
+ ret = dapls_modify_qp_rtu(ep->qp_handle->qp2,
cm->msg.daddr1.qpn,
cm->msg.daddr1.lid,
(ib_gid_handle_t)cm->msg.daddr1.gid);
dapl_os_unlock(&ep->header.lock);
goto bail;
}
- cm->msg.saddr2.qpn = htonl(ep->qp_handle->sqp->qp_num);
+ cm->msg.saddr2.qpn = htonl(ep->qp_handle->qp2->qp_num);
cm->msg.saddr2.lid = cm->hca->ib_trans.addr.lid;
cm->msg.saddr2.qp_type = ep->qp_handle->qp->qp_type;
+ cm->msg.saddr2.ep_map = cm->hca->ib_trans.addr.ep_map;
dapl_os_memcpy(&cm->msg.saddr2.gid[0],
&cm->hca->ib_trans.addr.gid, 16);
+
+ /* MXS peer: setup PI WC and save peer WR queue info */
+ if (MXS_EP(&cm->msg.daddr1)) {
+ /* save PI WR info, create local WC_q, send back WC info */
+ mcm_ntoh_wrc(&ep->qp_handle->wrc_rem, (mcm_wrc_info_t*)cm->msg.p_proxy);
+ mcm_create_wc_q(ep->qp_handle, MCM_WRC_QLEN);
+ mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
+ ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+
+ /* post 0-byte rcv for inbound WC's via RW_imm */
+ if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
+ goto bail;
+
+ dapl_log(DAPL_DBG_TYPE_CM,
+ "ACCEPT_USR: WR_rem %p sz %d, WC %p sz %d\n",
+ ep->qp_handle->wrc_rem.wr_addr,
+ ep->qp_handle->wrc_rem.wr_end+1,
+ ep->qp_handle->wrc.wc_addr,
+ ep->qp_handle->wrc.wc_end+1);
+ }
}
dapl_os_unlock(&ep->header.lock);
cm->msg.saddr1.qpn = htonl(ep->qp_handle->qp->qp_num);
cm->msg.saddr1.qp_type = ep->qp_handle->qp->qp_type;
cm->msg.saddr1.lid = cm->hca->ib_trans.addr.lid;
+ cm->msg.saddr1.ep_map = cm->hca->ib_trans.addr.ep_map;
dapl_os_memcpy(&cm->msg.saddr1.gid[0],
&cm->hca->ib_trans.addr.gid, 16);
}
dapls_modify_qp_state(ep_ptr->qp_handle->qp, IBV_QPS_ERR,0,0,0);
} else { /* QPt and QPr local */
dapli_cm_disconnect(cm_ptr);
- dapls_modify_qp_state(ep_ptr->qp_handle->sqp, IBV_QPS_ERR,0,0,0);
+ dapls_modify_qp_state(ep_ptr->qp_handle->qp2, IBV_QPS_ERR,0,0,0);
}
return DAT_SUCCESS;
{
struct dapl_hca *hca = arg;
dp_ib_cm_handle_t cm, next;
+ ib_cq_handle_t m_cq;
struct dapl_fd_set *set;
char rbuf[2];
int time_ms, ret;
dapl_fd_set(hca->ib_trans.scif_ev_ep, set, DAPL_FD_READ);
dapl_fd_set(hca->ib_trans.ib_cq->fd, set, DAPL_FD_READ);
+ dapl_os_lock(&hca->ib_trans.cqlock); /* CQt for HST->MXS */
+ if (!dapl_llist_is_empty(&hca->ib_trans.cqlist))
+ m_cq = dapl_llist_peek_head(&hca->ib_trans.cqlist);
+ else
+ m_cq = NULL;
+
+ while (m_cq) {
+ dapl_fd_set(m_cq->cq->channel->fd, set, DAPL_FD_READ);
+ dapl_log(DAPL_DBG_TYPE_CM, " cm_thread: mcm_rcv_pi_event(%p)\n", m_cq);
+ mcm_rcv_pi_event(m_cq);
+ m_cq = dapl_llist_next_entry(
+ &hca->ib_trans.cqlist,
+ (DAPL_LLIST_ENTRY *)&m_cq->entry);
+ }
+ dapl_os_unlock(&hca->ib_trans.cqlock);
+
if (!dapl_llist_is_empty(&hca->ib_trans.list))
next = dapl_llist_peek_head(&hca->ib_trans.list);
else
hca->ib_trans.cm_state != IB_THREAD_RUN) {
dapl_os_unlock(&cm->lock);
dapl_log(DAPL_DBG_TYPE_CM,
- " CM FREE: cm %p ep %p st=%s refs=%d\n",
+ " CM destroy: cm %p ep %p st=%s refs=%d\n",
cm, cm->ep, mcm_state_str(cm->state),
cm->ref_count);
dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " cm_thread(hca %p) exit\n", hca);
}
+/*
+ * mcm_log_addrs - debug helper: log the CM address pairs carried in a MCM
+ * wire message.
+ *
+ * lvl   - dapl_log level/type mask
+ * msg   - CM message; lid/qpn/port fields are in network byte order, so they
+ *         are byte-swapped for display (htons/htonl act as their own inverse)
+ * state - CM state, rendered via mcm_state_str()
+ * in    - non-zero: inbound message, print "local <- remote";
+ *         zero: outbound message, print "local -> remote"
+ *
+ * When both endpoints are MXS (MIC cross-socket, per MXS_EP()) only the
+ * proxy addr2 pair is in use, so a single line is logged; otherwise both
+ * the QPr/addr1 and QPt/addr2 pairings are logged.
+ */
+static void mcm_log_addrs(int lvl, struct dat_mcm_msg *msg, int state, int in)
+{
+	if (in) {
+		if (MXS_EP(&msg->daddr1) && MXS_EP(&msg->saddr1)) {
+			dapl_log(lvl, " QPr_t addr2: %s 0x%x %x 0x%x %s <- QPt_r addr2: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state), htons(msg->daddr2.lid),
+				 htonl(msg->daddr2.qpn), htons(msg->dport),
+				 mcm_map_str(msg->daddr2.ep_map),
+				 htons(msg->saddr2.lid), htonl(msg->saddr2.qpn),
+				 htons(msg->sport), mcm_map_str(msg->saddr2.ep_map));
+		} else {
+			dapl_log(lvl, " QPr addr1: %s 0x%x %x 0x%x %s <- QPt addr2: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state), htons(msg->daddr1.lid),
+				 htonl(msg->daddr1.qpn), htons(msg->dport),
+				 mcm_map_str(msg->daddr1.ep_map),
+				 htons(msg->saddr2.lid), htonl(msg->saddr2.qpn),
+				 htons(msg->sport), mcm_map_str(msg->saddr2.ep_map));
+			dapl_log(lvl, " QPt addr2: %s 0x%x %x 0x%x %s <- QPr addr1: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state),htons(msg->daddr2.lid),
+				 htonl(msg->daddr2.qpn), htons(msg->dport),
+				 mcm_map_str(msg->daddr2.ep_map),
+				 htons(msg->saddr1.lid), htonl(msg->saddr1.qpn),
+				 htons(msg->sport), mcm_map_str(msg->saddr1.ep_map));
+		}
+	} else {
+		if (MXS_EP(&msg->saddr1) && MXS_EP(&msg->daddr1)) {
+			dapl_log(lvl, " QPr_t addr2: %s 0x%x %x 0x%x %s -> QPt_r addr2: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state), htons(msg->saddr2.lid),
+				 htonl(msg->saddr2.qpn), htons(msg->sport),
+				 mcm_map_str(msg->saddr2.ep_map),
+				 htons(msg->daddr2.lid), htonl(msg->daddr2.qpn),
+				 htons(msg->dport), mcm_map_str(msg->daddr2.ep_map));
+		} else {
+			dapl_log(lvl, " QPr addr1: %s 0x%x %x 0x%x %s -> QPt addr2: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state), htons(msg->saddr1.lid),
+				 htonl(msg->saddr1.qpn), htons(msg->sport),
+				 mcm_map_str(msg->saddr1.ep_map),
+				 htons(msg->daddr2.lid), htonl(msg->daddr2.qpn),
+				 htons(msg->dport), mcm_map_str(msg->daddr2.ep_map));
+			dapl_log(lvl, " QPt addr2: %s 0x%x %x 0x%x %s -> QPr addr1: 0x%x %x 0x%x %s\n",
+				 mcm_state_str(state), htons(msg->saddr2.lid),
+				 htonl(msg->saddr2.qpn), htons(msg->sport),
+				 mcm_map_str(msg->saddr2.ep_map),
+				 htons(msg->daddr1.lid), htonl(msg->daddr1.qpn),
+				 htons(msg->dport), mcm_map_str(msg->daddr1.ep_map));
+		}
+	}
+}
+
#ifdef DAPL_COUNTERS
static char _ctr_host_[128];
/* Debug aid: List all Connections in process and state */
#define _OPENIB_MCM_
#include <infiniband/verbs.h>
+#include <dat2/dat_mic_extensions.h>
#include <scif.h>
+#include "mpxy.h"
#include "openib_osd.h"
#include "dapl_ib_common.h"
-#include <dat2/dat_mic_extensions.h>
-
/* DAPL CM objects MUST include list_entry, ref_count, event for EP linking */
struct ib_cm_handle
struct dapl_llist_entry *list;
DAPL_OS_LOCK llock; /* listen list */
struct dapl_llist_entry *llist;
+ DAPL_OS_LOCK cqlock; /* CQ list for PI WC's */
+ struct dapl_llist_entry *cqlist;
ib_async_handler_t async_unafiliated;
void *async_un_ctx;
ib_async_cq_handler_t async_cq_error;
ib_async_dto_handler_t async_cq;
ib_async_qp_handler_t async_qp_error;
struct dat_mcm_addr addr; /* lid, port, qp_num, gid */
- DAT_NAMED_ATTR named_attr;
struct dapl_thread_signal signal;
/* dat_mix_dev_attr_t */
uint8_t ack_timer;
dp_ib_cm_handle_t dapls_cm_create(DAPL_HCA *hca, DAPL_EP *ep);
DAT_RETURN dapls_modify_qp_rtu(struct ibv_qp *qp, uint32_t qpn, uint16_t lid, ib_gid_handle_t gid);
-/* MIC eXchange (MIX) operations */
+/* HST->MXS (MIC xsocket) remote PI communication, proxy.c */
+int mcm_send_pi(ib_qp_handle_t m_qp, int len, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr);
+int mcm_post_rcv_wc(struct dcm_ib_qp *m_qp, int cnt);
+void mcm_rcv_pi_event(struct dcm_ib_cq *m_cq);
+int mcm_create_wc_q(struct dcm_ib_qp *m_qp, int entries);
+void mcm_destroy_wc_q(struct dcm_ib_qp *m_qp);
+int mcm_create_pi_cq(struct dcm_ib_qp *m_qp, int len);
+void mcm_destroy_pi_cq(struct dcm_ib_qp *m_qp);
+
+/* MIC eXchange (MIX) operations, mix.c */
int dapli_mix_open(ib_hca_transport_t *tp, char *name, int port, int query);
void dapli_mix_close(ib_hca_transport_t *tp);
int dapli_mix_listen(dp_ib_cm_handle_t cm, uint16_t sid);
fd, opts, strerror(errno));
return errno;
}
-
return 0;
}
return dapls_config_fd(channel->fd);
}
+/*
+ * Need a CQ for shadow QPs where only one half (send or recv) is used.
+ *
+ * Allocates a dcm_ib_cq wrapper and a minimal 1-entry verbs CQ with no
+ * completion channel (channel arg NULL), to satisfy QP creation for the
+ * unused direction.  Returns NULL on allocation or ibv_create_cq failure;
+ * caller owns the wrapper and must destroy cq then free the wrapper.
+ */
+static ib_cq_handle_t dapls_create_empty_cq(struct ibv_context *ib_ctx)
+{
+	struct dcm_ib_cq *empty_cq;
+
+	empty_cq = dapl_os_alloc(sizeof(struct dcm_ib_cq));
+	if (!empty_cq)
+		return NULL;
+	dapl_os_memzero(empty_cq, sizeof(struct dcm_ib_cq));
+
+	/* 1 entry is enough: nothing is ever posted to this half of the QP */
+	empty_cq->cq = ibv_create_cq(ib_ctx, 1, NULL, NULL, 0);
+	if (!empty_cq->cq) {
+		dapl_os_free(empty_cq, sizeof(struct dcm_ib_cq));
+		return NULL;
+	}
+	return empty_cq;
+}
+
/*
* dapls_ib_init, dapls_ib_release
*
if ((dapl_os_lock_init(&hca_ptr->ib_trans.plock)) != DAT_SUCCESS)
goto bail;
+ if ((dapl_os_lock_init(&hca_ptr->ib_trans.cqlock)) != DAT_SUCCESS)
+ goto bail;
+
/* EVD events without direct CQ channels, CNO support */
hca_ptr->ib_trans.ib_cq =
ibv_create_comp_channel(hca_ptr->ib_hca_handle);
}
dapls_config_comp_channel(hca_ptr->ib_trans.ib_cq);
+ /* EVD to indirect CQ's, need empty CQ for half QP that is not used */
+ hca_ptr->ib_trans.ib_cq_empty = dapls_create_empty_cq(hca_ptr->ib_hca_handle);
+ if (hca_ptr->ib_trans.ib_cq_empty == NULL) {
+ dapl_log(DAPL_DBG_TYPE_ERR,
+ " open_hca: ERR: create_empty_cq = %s\n",
+ strerror(errno));
+ goto bail;
+ }
+
/* initialize CM and listen lists on this HCA uCM QP */
dapl_llist_init_head(&hca_ptr->ib_trans.list);
dapl_llist_init_head(&hca_ptr->ib_trans.llist);
+ dapl_llist_init_head(&hca_ptr->ib_trans.cqlist);
/* create uCM qp services */
if (mcm_service_create(hca_ptr))
dapli_mix_close(&hca_ptr->ib_trans);
dapl_os_lock_destroy(&hca_ptr->ib_trans.lock);
dapl_os_lock_destroy(&hca_ptr->ib_trans.llock);
+ dapl_os_lock_destroy(&hca_ptr->ib_trans.cqlock);
destroy_os_signal(hca_ptr);
mcm_service_destroy(hca_ptr);
done:
if (hca_ptr->ib_trans.ib_cq_empty) {
struct ibv_comp_channel *channel;
- channel = hca_ptr->ib_trans.ib_cq_empty->ib_cq->channel;
- ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->ib_cq);
- ibv_destroy_comp_channel(channel);
+ channel = hca_ptr->ib_trans.ib_cq_empty->cq->channel;
+ ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->cq);
+ if (channel)
+ ibv_destroy_comp_channel(channel);
}
if (hca_ptr->ib_hca_handle != IB_INVALID_HANDLE) {
#define DAPL_SOCKET int
#define DAPL_INVALID_SOCKET -1
-#define DAPL_FD_SETSIZE 16
+#define DAPL_FD_SETSIZE 8192
#define closesocket close
#include "dapl_ep_util.h"
#include "dapl_osd.h"
-static inline void const_mix_wr(struct dat_mix_wr *mwr, struct ibv_send_wr *iwr)
-{
- memset((void*)mwr, 0, sizeof(*mwr));
- mwr->wr_id = iwr->wr_id;
- mwr->num_sge = iwr->num_sge;
- mwr->opcode = iwr->opcode;
- mwr->send_flags = iwr->send_flags;
- mwr->imm_data = iwr->imm_data;
- mwr->wr.rdma.remote_addr = iwr->wr.rdma.remote_addr;
- mwr->wr.rdma.rkey = iwr->wr.rdma.rkey;
-}
-
-static inline void const_ib_wc(struct ibv_wc *iwc, struct dat_mix_wc *mwc, int entries)
-{
- int i;
-
- for (i=0;i<entries;i++) {
- memset((void*)&iwc[i].wr_id, 0, sizeof(*iwc));
- iwc[i].wr_id = mwc[i].wr_id;
- iwc[i].status = mwc[i].status;
- iwc[i].opcode = mwc[i].opcode;
- iwc[i].vendor_err = mwc[i].vendor_err;
- iwc[i].byte_len = mwc[i].byte_len;
- iwc[i].imm_data = mwc[i].imm_data;
- iwc[i].qp_num = mwc[i].qp_num;
- iwc[i].src_qp = mwc[i].src_qp;
- iwc[i].wc_flags = mwc[i].wc_flags;
- iwc[i].pkey_index = mwc[i].pkey_index;
- iwc[i].slid = mwc[i].slid;
- iwc[i].sl = mwc[i].sl;
- iwc[i].dlid_path_bits = mwc[i].dlid_path_bits;
- }
-}
-
/*
* CM proxy services, MCM on MIC to MPXYD via SCIF
*
}
dapl_log(DAPL_DBG_TYPE_EXTENSION," SCIF node_id: %d\n", (uint16_t)tp->self.node);
+ if (tp->self.node == 0)
+ tp->addr.ep_map = HOST_SOCK_DEV; /* non-MIC mapping */
+
if (query_only || (tp->self.node == 0 && !always_proxy)){
dapl_log(DAPL_DBG_TYPE_EXTENSION," Not running on MIC, no MPXY connect required\n");
tp->scif_ep = 0;
msg.qp_r.qp_num = m_qp->qp->qp_num;
msg.qp_r.qp_type = m_qp->qp->qp_type;
msg.qp_r.state = m_qp->qp->state;
- msg.qp_r.rcq_id = rcv_cq->cq_id;
} else { /* QP_r shadowed on proxy */
msg.qp_r.qp_num = 0;
msg.qp_r.qp_type = 0;
msg.qp_r.state = 0;
- msg.qp_r.rcq_id = rcv_cq->scq_id;
}
+ msg.qp_r.rcq_id = rcv_cq->cq_id;
msg.qp_r.ctx = (uint64_t)m_qp;
msg.qp_r.qp_id = 0; /* for now */
msg.qp_r.qp_type = attr->qp_type;
msg.qp_t.max_send_sge = attr->cap.max_send_sge;
msg.qp_t.max_recv_wr = attr->cap.max_recv_wr;
msg.qp_t.max_recv_sge = attr->cap.max_recv_sge;
- msg.qp_t.scq_id = req_cq->scq_id;
+ msg.qp_t.scq_id = req_cq->cq_id; /* QP_t always shadowed on proxy */
dapl_log(DAPL_DBG_TYPE_EXTENSION,
" MIX_QP_CREATE: QP_r - qpn 0x%x, ctx %p, rq %d,%d sq %d,%d rcq_id %d,%p\n",
return EINVAL;
}
- /* save QP_t id, ctx, and proxy buffer and wr pools. used on post_writes */
- m_qp->sqp_id = msg.qp_t.qp_id;
- m_qp->sqp_ctx = msg.qp_t.ctx;
- m_qp->m_seg = msg.m_seg;
- m_qp->wr_off = msg.wr_off;
- m_qp->wr_len = msg.wr_len;
- m_qp->wr_hd = m_qp->wr_tl = 0;
+ /* save QP_t id, QP is shadowed TX */
+ m_qp->qp_id = msg.qp_t.qp_id;
m_qp->m_inline = msg.m_inline;
dapl_log(DAPL_DBG_TYPE_EXTENSION,
- " MIX_QP_CREATE: reply, proxy qp_id 0x%x, ctx %p, wr_len %d\n",
- m_qp->sqp_id, (void*)m_qp->sqp_ctx, m_qp->wr_len);
+ " MIX_QP_CREATE: reply, proxy qp_id 0x%x\n", m_qp->qp_id);
return 0;
}
msg.op = MIX_QP_FREE;
msg.status = 0;
msg.flags = MIX_OP_REQ;
- msg.req_id = m_qp->sqp_id;
+ msg.req_id = m_qp->qp_id; /* shadowed QP */
len = sizeof(dat_mix_hdr_t);
ret = scif_send(mix_ep, &msg, len, SCIF_SEND_BLOCK);
return -1;
}
- /* save CQ_t id and ctx, needed for polling */
- m_cq->scq_id = msg.cq_id;
- m_cq->scq_ctx = msg.cq_ctx;
+ /* save id from proxy CQ create */
+ m_cq->cq_id = msg.cq_id;
+ dapl_log(DAPL_DBG_TYPE_EXTENSION,
+ " MIX_CQ_CREATE: reply, proxy cq_id 0x%x\n", m_cq->cq_id);
return 0;
}
scif_epd_t mix_ep = m_cq->tp->scif_ep;
int ret, len;
- if (!m_cq->scq_id)
- return 0;
-
/* request */
msg.ver = DAT_MIX_VER;
msg.op = MIX_CQ_FREE;
msg.status = 0;
msg.flags = MIX_OP_REQ;
- msg.req_id = m_cq->scq_id;
+ msg.req_id = m_cq->cq_id;
len = sizeof(dat_mix_hdr_t);
ret = scif_send(mix_ep, &msg, len, SCIF_SEND_BLOCK);
return 0;
}
-/* TODO: change for aperture/mapped memory, optimize */
+/* TODO: change for aperture/mapped memory ?? optimize */
int dapli_mix_cq_poll(ib_cq_handle_t m_cq, struct ibv_wc *wc)
{
- dat_mix_dto_comp_t msg;
- scif_epd_t mix_ep = m_cq->tp->scif_ep;
- DAPL_COOKIE *cookie;
- int ret, len;
-
/* MPXYD will send event and update EVD, return empty to avoid unnecessary SCIF traffic */
return 0;
-
- /* request */
- msg.hdr.ver = DAT_MIX_VER;
- msg.hdr.op = MIX_CQ_POLL;
- msg.hdr.status = 0;
- msg.hdr.flags = MIX_OP_REQ;
- msg.cq_id = m_cq->scq_id;
- msg.cq_ctx = m_cq->scq_ctx;
- msg.wc_cnt = 1;
-
- len = sizeof(dat_mix_dto_comp_t);
- ret = scif_send(mix_ep, &msg, len, SCIF_SEND_BLOCK);
- if (ret != len) {
- dapl_log(1, " ERR: %s send on %d, ret %d, exp %d, error %s\n",
- mix_op_str(msg.hdr.op), mix_ep, ret, len, strerror(errno));
- }
- dapl_log(DAPL_DBG_TYPE_EXTENSION," Sent %s request on SCIF EP\n", mix_op_str(msg.hdr.op));
-
- /* response */
- ret = scif_recv(mix_ep, &msg, len, SCIF_RECV_BLOCK);
- if (ret != len) {
- dapl_log(1, " ERR: rcv on new_ep %d, ret %d, exp %d, error %s\n", mix_ep, ret, len, strerror(errno));
- return -1;
- }
- if (msg.hdr.ver != DAT_MIX_VER || msg.hdr.op != MIX_CQ_POLL ||
- msg.hdr.flags != MIX_OP_RSP || msg.hdr.status != MIX_SUCCESS) {
- dapl_log(1, " MIX msg ver %d, op %d, flags %d, or stat %d ERR \n",
- msg.hdr.ver, msg.hdr.op, msg.hdr.flags, msg.hdr.status);
- return -1;
- }
- if (msg.wc_cnt == 1) {
- memcpy(wc, msg.wc, sizeof(*wc));
- /* possible segmentation on mpxyd side, update length if success */
- if (wc->status == 0) {
- cookie = (DAPL_COOKIE *) (uintptr_t) wc->wr_id;
- wc->byte_len = cookie->val.dto.size;
- }
- }
-
- dapl_log(DAPL_DBG_TYPE_EXTENSION," received reply on SCIF EP, result = %d\n", msg.wc_cnt);
- return msg.wc_cnt;
}
-
/* SCIF DMA outbound writes and inbound msg receives; translate to scif_off via LMR */
/* TODO: faster translation for post_send? */
static inline int mix_proxy_data(ib_qp_handle_t m_qp, dat_mix_sr_t *msg, struct ibv_sge *sglist, int txlen, int mix_ep)
msg->hdr.status = 0;
msg->hdr.flags = MIX_OP_REQ;
msg->len = txlen;
- msg->qp_id = m_qp->sqp_id;
- msg->qp_ctx = m_qp->sqp_ctx;
- const_mix_wr(&msg->wr, wr);
+ msg->qp_id = m_qp->qp_id;
+ mcm_const_mix_wr(&msg->wr, wr);
if (txlen > m_qp->m_inline) {
if (mix_proxy_data(m_qp, msg, wr->sg_list, txlen, mix_ep))
msg->hdr.status = 0;
msg->hdr.flags = MIX_OP_REQ;
msg->len = len;
- msg->qp_id = m_qp->sqp_id; /* shadowed QP */
- msg->qp_ctx = m_qp->sqp_ctx;
+ msg->qp_id = m_qp->qp_id; /* shadowed RX */
/* setup work request */
memset((void*)&msg->wr, 0, sizeof(dat_mix_wr_t));
msg.hdr.op = MIX_CM_ACCEPT;
msg.hdr.status = 0;
msg.hdr.flags = MIX_OP_REQ;
- msg.qp_id = m_cm->ep->qp_handle->sqp_id;
+ msg.qp_id = m_cm->ep->qp_handle->qp_id; /* QP2 shadowed TX */
msg.cm_id = m_cm->cm_id;
msg.cm_ctx = (uint64_t)m_cm->cm_ctx;
msg.sp_ctx = (uint64_t)m_cm; /* send back my cm_ctx */
msg.hdr.op = MIX_CM_REQ;
msg.hdr.status = 0;
msg.hdr.flags = MIX_OP_REQ;
- msg.qp_id = m_qp->sqp_id;
+ msg.qp_id = m_qp->qp_id; /* shadowed TX */
msg.cm_id = m_cm->cm_id;
msg.cm_ctx = (uint64_t)m_cm;
memcpy(&msg.msg, &m_cm->msg, sizeof(dat_mcm_msg_t));
}
pmsg->wc[i].byte_len = cookie->val.dto.size;
}
- const_ib_wc(&ib_wc, &pmsg->wc[i], 1);
+ mcm_const_ib_wc(&ib_wc, &pmsg->wc[i], 1);
dapls_evd_cqe_to_event(m_cq->evd, &ib_wc);
}
--- /dev/null
+/*
+ * Copyright (c) 2009 Intel Corporation. All rights reserved.
+ *
+ * This Software is licensed under one of the following licenses:
+ *
+ * 1) under the terms of the "Common Public License 1.0" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/cpl.php.
+ *
+ * 2) under the terms of the "The BSD License" a copy of which is
+ * available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/bsd-license.php.
+ *
+ * 3) under the terms of the "GNU General Public License (GPL) Version 2" a
+ * copy of which is available from the Open Source Initiative, see
+ * http://www.opensource.org/licenses/gpl-license.php.
+ *
+ * Licensee has the right to choose one of the above licenses.
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice and one of the license notices.
+ *
+ * Redistributions in binary form must reproduce both the above copyright
+ * notice, one of the license notices in the documentation
+ * and/or other materials provided with the distribution.
+ */
+#include "dapl.h"
+#include "dapl_adapter_util.h"
+#include "dapl_ib_util.h"
+#include "dapl_evd_util.h"
+#include "dapl_ep_util.h"
+#include "dapl_osd.h"
+
+/*
+ * HST -> MXS - proxy-out (PO) to proxy-in (PI)
+ *
+ * non-MIC host to MIC cross socket EP needs to send WR to remote PI service
+ * instead of direct IB send or write. Inbound traffic from remote MXS will still be
+ * be direct so there is no need for PI service on this MCM providers host side.
+ *
+ * NOTE: Initial design with no segmentation, set frequent PI MP signal rate
+ * This will avoid creation and management of a local PO WR queue for segments
+ */
+#define MCM_MP_SIG_RATE 5
+
+/*
+ * mcm_send_pi - forward a post_send to the remote proxy-in (PI) service.
+ *
+ * Instead of posting the caller's WR directly, each sge of the WR is packed
+ * into a network-byte-order mcm_wr_rx descriptor and RDMA-written (with
+ * immediate data) into the peer's WR ring (wrc_rem), where the remote PI
+ * engine executes it.  The immediate data (wrc_idata) carries the ring slot
+ * so the peer knows which descriptor landed.
+ *
+ * m_qp    - shadowed QP; qp2 is the local QP connected to the peer PI
+ * len     - total transfer length (logged only in this path)
+ * wr      - caller's ibverbs send WR (up to DAT_MIX_SGE_MAX sges)
+ * bad_wr  - passed through to ibv_post_send
+ *
+ * Returns 0, or EINVAL (too many sges), ENOMEM (remote WR ring full), or
+ * the ibv_post_send error.  NOTE(review): on ENOMEM no retry/backpressure is
+ * done here — presumably the caller retries; confirm against callers.
+ */
+int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
+{
+	struct ibv_send_wr wr_imm;
+	struct ibv_sge sge;
+	struct mcm_wr_rx m_wr_rx;
+	int i, ret = 0, wr_idx;
+	struct wrc_idata wrc;
+	uint32_t wr_flags, offset=0;
+
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 " mcm_send_pi: len %d ib_wr %p, WR: tl %d hd %d end %d\n",
+		 len, wr, m_qp->wr_tl, m_qp->wr_hd, m_qp->wrc_rem.wr_end);
+
+	if (wr->num_sge > DAT_MIX_SGE_MAX) {
+		ret = EINVAL;
+		goto bail;
+	}
+	/* one WR per IB sge, no additional segmentation */
+	for (i=0;i<wr->num_sge;i++) {
+		/* first/last-segment flags; completion signal rides on last sge */
+		wr_flags = M_SEND_DIRECT | M_SEND_PI;
+		if (i==0) wr_flags |= M_SEND_FS;
+		if (i==(wr->num_sge-1)) {
+			wr_flags |= M_SEND_LS;
+			if (wr->send_flags & IBV_SEND_SIGNALED)
+				wr_flags |= M_SEND_CN_SIG;
+		}
+		/* reserve next slot in the remote WR ring; wr_end is a
+		 * power-of-2 mask (ring full when hd+1 would hit tl) */
+		dapl_os_lock(&m_qp->lock);
+		if (((m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end) == m_qp->wr_tl) { /* full */
+			ret = ENOMEM;
+			dapl_os_unlock(&m_qp->lock);
+			goto bail;
+		}
+		m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end; /* move hd */
+		wr_idx = m_qp->wr_hd;
+		/* request a local HW completion every MCM_MP_SIG_RATE slots,
+		 * or when the consumer asked for a signaled completion */
+		if (!(wr_idx % MCM_MP_SIG_RATE) || (wr_flags & M_SEND_CN_SIG))
+			wr_flags |= M_SEND_MP_SIG;
+		dapl_os_unlock(&m_qp->lock);
+
+		dapl_log(DAPL_DBG_TYPE_EP,
+			 " mcm_send_pi[%d]: ln %d wr_idx %d, tl %d hd %d\n",
+			 i, wr->sg_list[i].length, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
+
+		/* build local m_wr_rx for remote PI */
+		memset((void*)&m_wr_rx, 0, sizeof(struct mcm_wr_rx));
+		m_wr_rx.org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
+		m_wr_rx.flags = htonl(wr_flags);
+		m_wr_rx.w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
+		m_wr_rx.wr.num_sge = htonl(wr->num_sge);
+		m_wr_rx.wr.opcode = htonl(wr->opcode);
+		m_wr_rx.wr.send_flags = htonl(wr->send_flags);
+		m_wr_rx.wr.imm_data = htonl(wr->imm_data);
+		m_wr_rx.sg[0].addr = htonll(wr->sg_list[i].addr);
+		m_wr_rx.sg[0].lkey = htonl(wr->sg_list[i].lkey);
+		m_wr_rx.sg[0].length = htonl(wr->sg_list[i].length);
+
+		/* RDMA writes: advance the remote address per-sge so each
+		 * descriptor targets its own slice of the destination */
+		if ((wr->opcode == IBV_WR_RDMA_WRITE) ||
+		    (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)) {
+			m_wr_rx.wr.wr.rdma.remote_addr = htonll(wr->wr.rdma.remote_addr + offset);
+			m_wr_rx.wr.wr.rdma.rkey = htonl(wr->wr.rdma.rkey);
+			offset += wr->sg_list[i].length;
+		}
+
+		/* setup imm_data for PI rcv engine */
+		wrc.id = (uint16_t)wr_idx;
+		wrc.type = M_WR_TYPE;
+		wrc.flags = 0;
+
+		/* setup local WR for wr_rx transfer - RW_imm inline */
+		wr_imm.wr_id = (uint64_t)(uintptr_t)m_qp;
+		wr_imm.next = 0;
+		wr_imm.sg_list = &sge;
+		wr_imm.num_sge = 1;
+		wr_imm.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+		wr_imm.send_flags = IBV_SEND_INLINE; /* m_wr_rx, 148 bytes */
+		if (wr_flags & M_SEND_MP_SIG)
+			wr_imm.send_flags |= IBV_SEND_SIGNALED;
+		wr_imm.imm_data = htonl(*(uint32_t *)&wrc);
+		wr_imm.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
+		/* destination: slot wr_idx of the peer's registered WR ring */
+		wr_imm.wr.rdma.remote_addr =
+			(uint64_t)(uintptr_t)
+			((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
+
+		sge.addr = (uint64_t)(uintptr_t) &m_wr_rx;
+		sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
+		sge.lkey = 0; /* inline doesn't need registered */
+
+		dapl_log(DAPL_DBG_TYPE_EP,
+			 " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
+			 " imm %x raddr %p rkey %x ln %d\n",
+			 i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
+			 wr_flags, ntohl(wr_imm.imm_data),
+			 wr_imm.wr.rdma.remote_addr, wr_imm.wr.rdma.rkey,
+			 sizeof(struct mcm_wr_rx));
+		dapl_log(DAPL_DBG_TYPE_EP,
+			 " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
+			 " imm %x raddr %p rkey %x ln %d tl %d me %d hd %d\n",
+			 i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
+			 wr->send_flags, wr->imm_data, wr->wr.rdma.remote_addr,
+			 wr->wr.rdma.rkey, wr->sg_list[i].length,
+			 m_qp->wr_tl, wr_idx, m_qp->wr_hd);
+
+		ret = ibv_post_send(m_qp->qp2, &wr_imm, bad_wr); /* QP2: QPtx - QPrx PI */
+		if (ret) {
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " mcm_send_pi ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
+				 " tl %d hd %d\n",
+				 m_wr_rx, wr_idx, wr->sg_list[0].addr,
+				 wr->sg_list[0].length, wr->sg_list[0].lkey,
+				 m_wr_rx.flags, m_qp->wr_tl, m_qp->wr_hd);
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " mcm_send_pi ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
+				 " idata 0x%x raddr %p rkey %x \n",
+				 m_wr_rx.wr.wr_id, wr->sg_list,
+				 m_wr_rx.wr.num_sge, m_wr_rx.wr.opcode,
+				 m_wr_rx.wr.send_flags, m_wr_rx.wr.imm_data,
+				 m_wr_rx.wr.wr.rdma.remote_addr,
+				 m_wr_rx.wr.wr.rdma.rkey);
+			goto bail;
+		}
+	}
+bail:
+	return ret;
+}
+
+/*
+ * mcm_rcv_wc - consume a work completion pushed by the remote PI service.
+ *
+ * Work completion of RW data to remote PI, remote RR completion.  The peer
+ * RDMA-writes a mcm_wc_rx into slot wrc->id of our local WC ring (wrc);
+ * we byte-swap it in place, advance our WR/WC tails from it, and — when the
+ * consumer requested a signaled send (M_SEND_CN_SIG) — surface an ibv_wc to
+ * the request EVD.
+ *
+ * m_cq is currently unused here (kept for symmetry with the PI event path).
+ */
+static inline void mcm_rcv_wc(struct dcm_ib_cq *m_cq, struct dcm_ib_qp *m_qp, struct wrc_idata *wrc)
+{
+	struct mcm_wc_rx *m_wc;
+
+	/* slot index must be inside our WC ring */
+	if (wrc->id > m_qp->wrc.wc_end) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " RX imm_data: WC id out of range %x > %x \n",
+			 wrc->id, m_qp->wrc.wc_end);
+		return;
+	}
+	m_wc = (struct mcm_wc_rx *)(m_qp->wrc.wc_addr + (m_qp->wrc.wc_sz * wrc->id));
+	mcm_ntoh_wc_rx(m_wc); /* convert WC contents, pushed via wire */
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 " mcm_rcv_wc: WC id %d m_wc %p wr_id %Lx org_id %Lx flgs 0x%x\n",
+		 wrc->id, m_wc, m_wc->wc.wr_id, m_wc->org_id, m_wc->flags);
+	dapl_os_lock(&m_qp->lock);
+	m_qp->wr_tl = m_wc->wr_tl;   /* peer reports how far it consumed our WR ring */
+	m_qp->wc_tl = wrc->id; /* move wc_tl, for wc_tl_rem on peer PI service */
+	dapl_os_unlock(&m_qp->lock);
+	if (m_wc->flags & M_SEND_CN_SIG) {
+		struct ibv_wc ib_wc;
+		mcm_const_ib_wc(&ib_wc, &m_wc->wc, 1);
+		dapls_evd_cqe_to_event(m_qp->req_cq->evd, &ib_wc);
+	}
+	/* fix: log wr_hd for the hd field (was printing wr_tl twice) */
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 " mcm_rcv_wc: m_qp %p wr_tl %d wr_hd %d wc_tl %d \n",
+		 m_qp, m_qp->wr_tl, m_qp->wr_hd, m_qp->wc_tl);
+}
+
+/*
+ * mcm_post_rcv_wc - post cnt zero-byte receive WRs on QP2.
+ *
+ * These empty receives absorb the peer PI's RDMA-write-with-immediate
+ * operations (the immediate data carries the WC ring slot; no payload
+ * buffer is needed, so sg_list is NULL).
+ *
+ * Returns 0 on success, or the nonzero ibv_post_recv error.
+ */
+int mcm_post_rcv_wc(struct dcm_ib_qp *m_qp, int cnt)
+{
+	struct ibv_recv_wr r_wr, *r_err;
+	int err, i;
+
+	r_wr.next = NULL; /* re-post message */
+	r_wr.sg_list = NULL;
+	r_wr.num_sge = 0;
+	r_wr.wr_id = (uint64_t)(uintptr_t) m_qp;
+	errno = 0;
+
+	for (i=0;i<cnt;i++) {
+		err = ibv_post_recv(m_qp->qp2, &r_wr, &r_err);
+		if (err) {
+			dapl_log(DAPL_DBG_TYPE_ERR,"ERR: qp %p (QP2) qpn %x "
+				 "ibv_post_recv ret = %d %s\n",
+				 m_qp, m_qp->qp2 ? m_qp->qp2->qp_num:0,
+				 err, strerror(errno));
+			/* fix: return the nonzero ibv error, not errno —
+			 * errno was cleared above and ibv_post_recv is not
+			 * guaranteed to set it, so returning errno could
+			 * report success on failure */
+			return err;
+		}
+	}
+	dapl_log(DAPL_DBG_TYPE_EP, "mcm_post_rcv_wc: qp %p qpn 0x%x posted %d\n",
+		 m_qp, m_qp->qp2->qp_num, cnt);
+	return 0;
+}
+
+/* Proxy-in service - called from CM-RX thread, CQ2 is PI service
+ *
+ * <- Work completion in (RW_imm - WC idata), local initiated RW
+ *
+ * Drains m_cq completely: acks any pending CQ event, polls in batches of 5,
+ * and only returns after an empty poll that follows re-arming the CQ
+ * (ibv_req_notify_cq then one more poll, closing the arm/poll race).
+ * RECV_RDMA_WITH_IMM completions carry wrc_idata from the peer PI and are
+ * dispatched to mcm_rcv_wc(); our own RW_imm send completions are ignored.
+ */
+void mcm_rcv_pi_event(struct dcm_ib_cq *m_cq)
+{
+	struct ibv_wc wc[5];
+	struct ibv_cq *ib_cq;
+	void *cq_ctx = NULL;
+	int i, wc_cnt, ret, err, notify;
+
+	dapl_log(DAPL_DBG_TYPE_THREAD," PI event: enter\n");
+
+	/* channel fd is non-blocking; ret != 0 simply means no event pending */
+	ret = ibv_get_cq_event(m_cq->cq->channel, &ib_cq, (void *)&cq_ctx);
+	if (ret == 0)
+		ibv_ack_cq_events(ib_cq, 1);
+
+	wc_cnt = err = notify = 0;
+retry:
+	ret = ibv_poll_cq(m_cq->cq, 5, wc);
+	if (ret <= 0) {
+		if (!ret && !notify) {
+			/* re-arm, then poll once more to avoid missing a
+			 * completion that raced the arm */
+			ibv_req_notify_cq(m_cq->cq, 0);
+			notify = 1;
+			goto retry;
+		}
+		dapl_log(DAPL_DBG_TYPE_THREAD," PI event: empty, return\n");
+		return;
+	} else
+		notify = 0;
+
+	wc_cnt += ret;
+	for (i=0; i<ret; i++) {
+		/* wr_id was set to the owning QP in mcm_send_pi/mcm_post_rcv_wc */
+		struct dcm_ib_qp *m_qp = (struct dcm_ib_qp *)wc[i].wr_id;
+		wrc_idata_t wrc;
+#if 1
+		dapl_log(DAPL_DBG_TYPE_EP,
+			 " PI event: ib_wc[%d-%d]: st %d, vn %x imm %x op %x wr_id %Lx ctx %p\n",
+			 i+1, ret, wc[i].status, wc[i].vendor_err, ntohl(wc[i].imm_data),
+			 wc[i].opcode, wc[i].wr_id, cq_ctx);
+#endif
+		if (wc[i].status != IBV_WC_SUCCESS) {
+			if (wc[i].status != IBV_WC_WR_FLUSH_ERR)
+				dapl_log(DAPL_DBG_TYPE_ERR,
+					 " PI event: ERR DTO st %d, vn %x idata %x m_cq %p m_qp %p\n",
+					 wc[i].status, wc[i].vendor_err,
+					 ntohl(wc[i].imm_data), m_cq, m_qp);
+			continue;
+		}
+
+		if (wc[i].opcode == (uint32_t)IBV_WR_RDMA_WRITE_WITH_IMM) {
+			dapl_log(DAPL_DBG_TYPE_THREAD," PI event: TX RW_imm -> WR\n");
+			continue; /* post_send -> RW_imm to peer PI */
+		}
+
+		if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 " PI event: ERR QPr WC op %d != RECV_RDMA_IMM, m_qp %p\n",
+				 wc[i].opcode, m_qp);
+			continue;
+		}
+		dapl_log(DAPL_DBG_TYPE_THREAD," PI event: RX RW_imm <- WC\n");
+		/* unpack the peer's wrc_idata from the immediate data */
+		wrc.id = WRC_ID_DATA(ntohl(wc[i].imm_data));
+		wrc.type = WRC_TYPE_DATA(ntohl(wc[i].imm_data));
+		wrc.flags = WRC_FLAGS_DATA(ntohl(wc[i].imm_data));
+
+		if (wrc.type == M_WC_TYPE)
+			mcm_rcv_wc(m_cq, m_qp, &wrc);
+		else
+			dapl_log(DAPL_DBG_TYPE_ERR,
+				 "PI event: ERR RX_imm: WC type ?= 0x%x \n", wrc.type);
+
+		/* replenish the 0-byte receive consumed by this RW_imm;
+		 * NOTE(review): err is recorded but never acted on — a failed
+		 * re-post silently shrinks the receive queue; confirm intent */
+		err = mcm_post_rcv_wc(m_qp, 1);
+	}
+	goto retry;
+}
+
+/*
+ * mcm_destroy_wc_q - tear down the local PI work-completion ring.
+ *
+ * Deregisters the WC pool MR and frees the posix_memalign'd buffer created
+ * by mcm_create_wc_q().  Safe to call when either was never created (both
+ * checks guard NULL/0); idempotent since both fields are cleared.
+ */
+void mcm_destroy_wc_q(struct dcm_ib_qp *m_qp)
+{
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 "mcm_destroy_wc_q: QP %p PI WC_q %p\n",
+		 m_qp, m_qp->wrc.wc_addr);
+
+	if (m_qp->wc_mr) {
+		ibv_dereg_mr(m_qp->wc_mr);
+		m_qp->wc_mr = NULL;
+	}
+	if (m_qp->wrc.wc_addr) {
+		free((void*)m_qp->wrc.wc_addr);
+		m_qp->wrc.wc_addr = 0;
+	}
+}
+
+/*
+ * mcm_create_wc_q - build the local PI work-completion ring for a QP.
+ *
+ * Allocates a page-aligned pool of `entries` mcm_wc_rx slots (each slot
+ * 64-byte aligned), registers it with IB for REMOTE_WRITE so the peer PI
+ * can RDMA completions into it, and publishes addr/rkey via m_qp->wrc.
+ * Finally both of the QP's CQs are placed on the device cqlist so the CM
+ * thread services PI events, and the thread is signaled.
+ *
+ * entries must be a power of 2 (wc_end is used as an index mask).
+ * Returns 0 on success, -1 on allocation/registration failure.
+ */
+int mcm_create_wc_q(struct dcm_ib_qp *m_qp, int entries)
+{
+	struct ibv_pd *ib_pd = ((DAPL_PZ *)m_qp->ep->param.pz_handle)->pd_handle;
+
+	dapl_log(DAPL_DBG_TYPE_EP,
+		 "mcm_create_wc_q: QP %p entries %d\n", m_qp, entries);
+
+	/* RDMA proxy WC pool, register with SCIF and IB, set pool and segm size with parameters */
+	m_qp->wrc.wc_sz = ALIGN_64(sizeof(struct mcm_wc_rx));
+	m_qp->wrc.wc_len = m_qp->wrc.wc_sz * entries; /* 64 byte aligned for signal_fence */
+	m_qp->wrc.wc_end = entries - 1;
+
+	if (posix_memalign((void **)&m_qp->wrc.wc_addr, 4096, ALIGN_PAGE(m_qp->wrc.wc_len))) {
+		dapl_log(DAPL_DBG_TYPE_EP, "failed to allocate wc_rbuf,"
+			 " m_qp=%p, wc_len=%d, entries=%d\n",
+			 m_qp, m_qp->wrc.wc_len, entries);
+		return -1;
+	}
+	memset((void*)m_qp->wrc.wc_addr, 0, ALIGN_PAGE(m_qp->wrc.wc_len));
+
+	dapl_log(DAPL_DBG_TYPE_EP, " WC rbuf pool %p, LEN req=%d, act=%d\n",
+		 m_qp->wrc.wc_addr, m_qp->wrc.wc_len, ALIGN_PAGE(m_qp->wrc.wc_len));
+
+	m_qp->wc_mr = ibv_reg_mr(ib_pd, (void*)m_qp->wrc.wc_addr, m_qp->wrc.wc_len,
+				 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
+	if (!m_qp->wc_mr) {
+		dapl_log(DAPL_DBG_TYPE_ERR, " IB_register addr=%p,%d failed %s\n",
+			 m_qp->wrc.wc_addr, ALIGN_PAGE(m_qp->wrc.wc_len), strerror(errno));
+		/* fix: free the WC pool on registration failure (was leaked) */
+		free((void *)m_qp->wrc.wc_addr);
+		m_qp->wrc.wc_addr = 0;
+		return -1;
+	}
+	m_qp->wrc.wc_addr = (uint64_t)(uintptr_t)m_qp->wc_mr->addr;
+	m_qp->wrc.wc_rkey = m_qp->wc_mr->rkey;
+
+	dapl_log(DAPL_DBG_TYPE_EP, " IB_mr for wc_buf addr %p, mr 0x%llx, len %d, entries %d rkey %x lkey %x\n",
+		 m_qp->wrc.wc_addr, m_qp->wc_mr->addr, ALIGN_PAGE(m_qp->wrc.wc_len),
+		 entries, m_qp->wc_mr->rkey, m_qp->wc_mr->lkey);
+
+	/* Put QP's req and rcv CQ on device PI cqlist, mark CQ for indirect signaling */
+	dapl_os_lock(&m_qp->tp->cqlock);
+	m_qp->req_cq->flags |= DCM_CQ_TX_INDIRECT;
+	dapl_llist_add_tail(&m_qp->tp->cqlist, &m_qp->req_cq->entry, m_qp->req_cq);
+	dapl_llist_add_tail(&m_qp->tp->cqlist, &m_qp->rcv_cq->entry, m_qp->rcv_cq);
+	dapl_os_unlock(&m_qp->tp->cqlock);
+	dapls_thread_signal(&m_qp->tp->signal); /* CM thread will process PI */
+
+	return 0;
+}
+
+/*
+ * mcm_destroy_pi_cq - tear down the PI receive CQ built by mcm_create_pi_cq.
+ *
+ * Unlinks rcv_cq from the device cqlist (under cqlock, so the CM thread
+ * stops servicing it), destroys the verbs CQ and its completion channel,
+ * and frees the wrapper.  No-op if the CQ was never created; idempotent
+ * since rcv_cq is cleared.
+ */
+void mcm_destroy_pi_cq(struct dcm_ib_qp *m_qp)
+{
+	if (!m_qp->rcv_cq)
+		return;
+
+	dapl_log(DAPL_DBG_TYPE_EP, "mcm_destroy_pi_cq: QP %p CQ %p\n",
+		 m_qp, m_qp->rcv_cq);
+
+	/* remove from device PI processing list */
+	dapl_os_lock(&m_qp->tp->cqlock);
+	if (m_qp->rcv_cq->entry.list_head)
+		dapl_llist_remove_entry(&m_qp->tp->cqlist,
+					&m_qp->rcv_cq->entry);
+	dapl_os_unlock(&m_qp->tp->cqlock);
+
+	if (m_qp->rcv_cq->cq) {
+		struct ibv_comp_channel *channel = m_qp->rcv_cq->cq->channel;
+
+		/* destroy CQ before its channel, per verbs ordering rules */
+		ibv_destroy_cq(m_qp->rcv_cq->cq);
+		m_qp->rcv_cq->cq = NULL;
+		if (channel)
+			ibv_destroy_comp_channel(channel);
+	}
+	dapl_os_free(m_qp->rcv_cq, sizeof(struct dcm_ib_cq));
+	m_qp->rcv_cq = NULL;
+}
+
+/*
+ * mcm_create_pi_cq - create the receive-side CQ for the PI service.
+ *
+ * Allocates a dcm_ib_cq wrapper, a dedicated completion channel (fd set
+ * non-blocking so the CM thread can poll it), and a verbs CQ of `len`
+ * entries armed for notification.  m_qp is stored as the CQ context.
+ *
+ * Returns 0 on success or a DAT error via dapl_convert_errno; on failure
+ * all partially-created resources are released.
+ */
+int mcm_create_pi_cq(struct dcm_ib_qp *m_qp, int len)
+{
+	struct ibv_comp_channel *channel = NULL;
+	int cqlen = len;
+	int opts, ret = ENOMEM;
+
+	dapl_dbg_log(DAPL_DBG_TYPE_EP,
+		     "mcm_create_pi_cq: qp = %p cqlen=%d \n", m_qp, cqlen);
+
+	/* create CQ object */
+	m_qp->rcv_cq = dapl_os_alloc(sizeof(struct dcm_ib_cq));
+	if (!m_qp->rcv_cq)
+		goto err;
+
+	dapl_os_memzero(m_qp->rcv_cq, sizeof(struct dcm_ib_cq));
+	m_qp->rcv_cq->tp = m_qp->tp;
+	dapl_llist_init_entry(&m_qp->rcv_cq->entry);
+
+	errno = 0;
+	channel = ibv_create_comp_channel(m_qp->tp->hca->ib_hca_handle);
+	if (!channel)
+		goto err;
+
+	/* move channel FD to non-blocking */
+	opts = fcntl(channel->fd, F_GETFL);
+	if (opts < 0 || fcntl(channel->fd, F_SETFL, opts | O_NONBLOCK) < 0) {
+		dapl_log(DAPL_DBG_TYPE_ERR,
+			 " dapls_config_fd: fcntl on channel->fd %d ERR %d %s\n",
+			 channel->fd, opts, strerror(errno));
+		goto err;
+	}
+	m_qp->rcv_cq->cq = ibv_create_cq(m_qp->tp->hca->ib_hca_handle,
+					 cqlen, m_qp, channel, 0);
+	if (!m_qp->rcv_cq->cq)
+		goto err;
+
+	/* arm cq for events */
+	ibv_req_notify_cq(m_qp->rcv_cq->cq, 0);
+
+	dapl_dbg_log(DAPL_DBG_TYPE_EP,
+		     "mcm_create_pi_cq: new_cq %p cqlen=%d \n",
+		     m_qp->rcv_cq, cqlen);
+
+	/* fix: format had 5 conversions ("(%d)" plus 4) but only 4 args —
+	 * undefined behavior in the varargs call; drop the stray "(%d)" */
+	dapl_log(DAPL_DBG_TYPE_EVD,
+		 "mcm_create_pi_cq: new_cq %p ib_cq %p cqlen %d,%d\n",
+		 m_qp->rcv_cq, m_qp->rcv_cq->cq, len, cqlen);
+
+	return 0;
+
+err:
+	dapl_log(DAPL_DBG_TYPE_ERR,
+		 "mcm_create_pi_cq: ERR new_cq %p cqlen %d,%d ret %s\n",
+		 m_qp->rcv_cq, len, cqlen, strerror(errno));
+
+	if (m_qp->rcv_cq) {
+		dapl_os_free(m_qp->rcv_cq, sizeof(struct dcm_ib_cq));
+		m_qp->rcv_cq = NULL;
+	}
+	if (channel)
+		ibv_destroy_comp_channel(channel);
+
+	return dapl_convert_errno(ret, "create_pi_cq" );
+}
+
+
+
+
if (hca_ptr->ib_trans.ib_cq_empty) {
struct ibv_comp_channel *channel;
- channel = hca_ptr->ib_trans.ib_cq_empty->ib_cq->channel;
- ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->ib_cq);
+ channel = hca_ptr->ib_trans.ib_cq_empty->cq->channel;
+ ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->cq);
ibv_destroy_comp_channel(channel);
}
union dcm_addr *ucm_ia = (union dcm_addr *) r_addr;
dapl_log(DAPL_DBG_TYPE_CM, " UCM connect -> AF %d LID 0x%x QPN 0x%x GID"
- " 0x" F64x ":" F64x " port %d ep_map %s sl %d qt %d\n",
+ " 0x" F64x ":" F64x " port %d sl %d qt %d\n",
ucm_ia->ib.family, ntohl(ucm_ia->ib.qpn), ntohs(ucm_ia->ib.lid),
(unsigned long long)ntohll(*(uint64_t*)&ucm_ia->ib.gid[0]),
(unsigned long long)ntohll(*(uint64_t*)&ucm_ia->ib.gid[8]),
if (hca_ptr->ib_trans.ib_cq_empty) {
struct ibv_comp_channel *channel;
- channel = hca_ptr->ib_trans.ib_cq_empty->ib_cq->channel;
- ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->ib_cq);
+ channel = hca_ptr->ib_trans.ib_cq_empty->cq->channel;
+ ibv_destroy_cq(hca_ptr->ib_trans.ib_cq_empty->cq);
ibv_destroy_comp_channel(channel);
}
return 0;
}
-
static void mcm_process_recv(mcm_ib_dev_t *md, dat_mcm_msg_t *msg, mcm_cm_t *cm, int len)
{
mlog(2, " cm %p cm_id %d state %s \n",
int mcm_rr_signal = 10;
int mcm_rr_max = 50;
int mcm_wrc_max = 5;
-int mcm_tx_entries = 1024;
-int mcm_rx_entries = 1024;
-int mcm_rx_cq_size = 1024;
-int mcm_tx_cq_size = 1024;
-int mcm_buf_wc_size = 1024;
+int mcm_tx_entries = MCM_WRC_QLEN; /* power of 2, default = 1024 */
+int mcm_rx_entries = MCM_WRC_QLEN;
+int mcm_rx_cq_size = MCM_WRC_QLEN;
+int mcm_tx_cq_size = MCM_WRC_QLEN;
+int mcm_buf_wc_size = MCM_WRC_QLEN;
+extern int mix_buffer_sg_po2;
extern uint64_t system_guid;
extern int mcm_profile;
extern int log_level;
memcpy(&m_cm->msg.daddr1, &pmsg->msg.daddr1, sizeof(dat_mcm_addr_t));
memcpy(&m_cm->msg.daddr2, &pmsg->msg.daddr1, sizeof(dat_mcm_addr_t));
-
- mcm_init_wrc(m_cm); /* send any proxy-in WR/WC raddr,rkey info */
+ mcm_hton_wrc((mcm_wrc_info_t *)m_cm->msg.p_proxy, &m_qp->wrc); /* PI WR/WC raddr,rkey info */
+ m_cm->msg.seg_sz = mix_buffer_sg_po2;
mlog(2," QPt 0x%x QPr 0x%x %Lx -> dport 0x%x, dqpn 0x%x dlid 0x%x psz %d %s\n",
m_cm->msg.saddr2.qpn, m_cm->msg.saddr1.qpn,
memcpy(m_cm->msg.p_proxy, pkt->p_proxy, DAT_MCM_PROXY_DATA);
memcpy(&m_cm->msg.daddr1, &pkt->saddr1, sizeof(dat_mcm_addr_t));
memcpy(&m_cm->msg.daddr2, &pkt->saddr2, sizeof(dat_mcm_addr_t));
- mcm_save_wrc(m_cm); /* save any peer proxy-in WRC info */
+ mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)m_cm->msg.p_proxy); /* peer RI WRC info */
/* MXS <- MSS or HOST, fabric: TX: QP2->QP1 direct, RX: QP1<-QP2 proxy */
if ((MXS_EP(&m_cm->md->addr) && !MXS_EP(&m_cm->msg.daddr1)) &&
system_guid != m_cm->msg.sys_guid) {
- mlog(2, " MXS <- MSS remote \n");
+ mlog(2, " MXS <- %s remote \n", mcm_map_str(m_cm->msg.daddr1.ep_map));
if (m_pi_prep_rcv_q(m_cm->m_qp))
goto err;
mlog(2, " MSS <- %s remote \n", mcm_map_str(m_cm->msg.daddr1.ep_map));
if (MXS_EP(&m_cm->msg.daddr1) && m_pi_prep_rcv_q(m_cm->m_qp))
- goto err;
+ goto err;
if (!MXS_EP(&m_cm->msg.daddr1))
m_pi_destroy_wc_q(m_cm->m_qp); /* created if ep_map was unknown */
htons(pkt->sport), ntohll(pkt->sys_guid),
mcm_map_str(pkt->saddr2.ep_map));
+ /* MXS_EP <- HST_EP, host sends WC on RTU, save WRC info */
+ if (MXS_EP(&pkt->daddr1) && HST_EP(&pkt->saddr2))
+ mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)pkt->p_proxy);
+
/* Forward, as is, conn_reply message to MIC client, with remote QP info */
msg.hdr.ver = DAT_MIX_VER;
msg.hdr.flags = MIX_OP_REQ;
}
m_cm->ref_cnt++; /* Passive: QP ref */
m_cm->m_qp->cm = m_cm;
- mcm_save_wrc(m_cm); /* save remote proxy-in WRC QP info */
+ mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)m_cm->msg.p_proxy); /* save peer PI WRC info */
mlog(8, " loc_guid %Lx, rem_guid %Lx\n",
ntohll(system_guid), ntohll(m_cm->msg.sys_guid));
/* MXS -> MSS or HOST, remote: need QPr1, saddr1 on mpxyd */
if ((MXS_EP(&m_cm->md->addr) && !MXS_EP(&m_cm->msg.daddr1)) &&
(system_guid != m_cm->msg.sys_guid) ) {
- mlog(2, " MXS -> MSS remote \n");
+ mlog(2, " MXS -> %s remote \n", mcm_map_str(m_cm->msg.daddr1.ep_map));
if (m_qp_create_pi(smd, m_cm->m_qp))
goto err;
m_cm->msg.saddr1.ep_map = MIC_XSOCK_DEV;
m_cm->msg.saddr2.ep_map = MIC_XSOCK_DEV;
- /* MSS to MSS,MXS,HOST - fabric, TX: QP2->QP1 on mpxyd and RX: QP1->QP2 on MIC */
+ /* MSS -> MSS,MXS,HOST - fabric, TX: QP2->QP1 on mpxyd and RX: QP1->QP2 on MIC */
} else {
mlog(2, " MSS -> %s remote \n", mcm_map_str(m_cm->msg.daddr1.ep_map));
dqpn = m_cm->msg.daddr1.qpn;
dlid = m_cm->msg.daddr1.lid;
- if (m_pi_create_wc_q(m_cm->m_qp, mcm_rx_entries))
- goto err;
+ if (MXS_EP(&m_cm->msg.daddr1)) {
+ if (m_pi_create_wc_q(m_cm->m_qp, mcm_rx_entries))
+ goto err;
- if (m_pi_prep_rcv_q(m_cm->m_qp))
- goto err;
+ if (m_pi_prep_rcv_q(m_cm->m_qp))
+ goto err;
+ }
}
- mcm_init_wrc(m_cm); /* send back proxy-in WR/WC raddr,rkey info */
+ mcm_hton_wrc((mcm_wrc_info_t *)m_cm->msg.p_proxy, &m_cm->m_qp->wrc); /* send PI WRC info */
+ m_cm->msg.seg_sz = mix_buffer_sg_po2;
mcm_pr_addrs(2, &m_cm->msg, m_cm->state, 0);
/* return sys_guid */
ret = scif_recv(smd->scif_op_ep, ((char*)pmsg + sizeof(dat_mix_hdr_t)), len, SCIF_RECV_BLOCK);
if (ret != len) {
mlog(0, " ERR: scif_recv WR, ret %d, exp %d\n", ret, len);
- return -1;
+ return POLLERR;
}
/* get QP by ID */
m_qp = mix_get_qp(smd, pmsg->qp_id);
- if (!m_qp) {
- struct dat_mix_wc wc;
-
+ if (!m_qp || !m_qp->ib_qp2) {
mlog(0, " ERR: mix_get_qp, id %d, not found\n", pmsg->qp_id);
if ((pmsg->hdr.flags & MIX_OP_INLINE) && pmsg->len) { /* purge data, send event */
char dbuf[DAT_MIX_INLINE_MAX];
return -1;
}
}
- wc.wr_id = pmsg->wr.wr_id;
- wc.byte_len = 0;
- wc.status = IBV_WC_GENERAL_ERR;
- wc.opcode = pmsg->wr.opcode == IBV_WR_SEND ? IBV_WC_SEND:IBV_WC_RDMA_WRITE;
- wc.vendor_err = EINVAL;
- mix_dto_event(m_qp->ib_qp2->send_cq->cq_context, &wc, 1);
- return 0;
+ return POLLERR; /* device async err, cannot report event with no QP */
}
mlog(4, " q_id %d, q_num %x data %d pkt %d wr_id %p, sge %d,"
m_qp->post_cnt, m_qp->post_sig_cnt, m_qp->comp_cnt,
pmsg->hdr.flags & MIX_OP_INLINE ? 1:0,
pmsg->wr.opcode == IBV_WR_SEND ? "SND":"WR",
- m_qp->wr_len_rem ? "PROXY_OUT_IN":"PROXY_OUT");
+ m_qp->wrc.wr_len ? "PROXY_OUT_IN":"PROXY_OUT");
return (mix_proxy_out(smd, pmsg, m_qp));
}
--- /dev/null
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under the OpenIB.org BSD license
+ * below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * MIC Proxy Data Service Definitions - used by MCM provider and MPXYD service
+ *
+ * Communication Protocol between MCM Proxy-out and Proxy-in service agents
+ * - WR and WC management via IB RDMA write_imm and RDMA reads
+ * - WR and WC written directly from remote proxy peer agent,
+ * - Proxy-in buffer management on receive side, IB RR
+ * - Proxy-out buffer management on send side
+ * IB RW directly to user buffer if peer is MIC same socket
+ * IB RW_imm to PI WR, PI RR, scif_writeto if MIC is remote socket
+ *
+ */
#ifndef _MPXY_H_	/* NOTE(review): leading-underscore + capital names are reserved to the implementation */
#define _MPXY_H_

/*
 * 64-bit host/network byte-order helpers (no-ops on big-endian hosts).
 * NOTE(review): __BYTE_ORDER and bswap_64 come from <endian.h> and
 * <byteswap.h>, which this header does not include itself — confirm
 * every including translation unit pulls them in first, otherwise no
 * branch is taken / bswap_64 is undeclared.
 */
#if __BYTE_ORDER == __BIG_ENDIAN
#define htonll(x) (x)
#define ntohll(x) (x)
#elif __BYTE_ORDER == __LITTLE_ENDIAN
#define htonll(x) bswap_64(x)
#define ntohll(x) bswap_64(x)
#endif
+
/* WRC (work request/completion) imm_data definition, qdepth limit of 16 bits */
#define WRC_MAX_QLEN (1 << 16)	/* fixed: original "1 << 16;" had a stray ';' and no parens, breaking expression use */
#define MCM_WRC_QLEN 1024	/* default WR/WC ring depth, power of 2 */

/* data types, WR or WC */
#define M_WR_TYPE 1
#define M_WC_TYPE 2

/* WR flags */
#define M_WR_FS 1	/* first segment */
#define M_WR_LS 2	/* last segment */

/* unpack wrc_idata fields from the 32-bit imm_data word: bits 0-15 id, 16-23 type, 24-31 flags */
#define WRC_ID_DATA(x) ((x) & 0x0000ffff)
#define WRC_TYPE_DATA(x) (((x) >> 16) & 0x000000ff)
#define WRC_FLAGS_DATA(x) (((x) >> 24) & 0x000000ff)

/* wr aligned on 64 bytes, use 4 lower bits for type id */
#define WRID_TX_RW 0x1		/* proxy out, m_wr type, RW */
#define WRID_TX_RW_IMM 0x2	/* proxy out, m_wr type, RW_imm op */
#define WRID_RX_RR 0x3		/* proxy in, m_wr_rx type, RR op */
#define WRID_RX_RW_IMM 0x4	/* proxy in, m_wr_rx type, RW_immed op */
#define WRID_MASK 0xfffffffffffffff0
#define WRID_SET(x,y) (((uint64_t)(x) | (uint64_t)(y)))
#define WRID_TYPE(x) (((x) & ~WRID_MASK))	/* args parenthesized */
#define WRID_ADDR(x) (((x) & WRID_MASK))
+
/*
 * Decoded layout of the 32-bit WRC imm_data word exchanged between
 * proxy agents; matches the WRC_ID_DATA/WRC_TYPE_DATA/WRC_FLAGS_DATA
 * extraction macros (bits 0-15 id, 16-23 type, 24-31 flags).
 */
typedef struct wrc_idata {

	uint16_t id; /* work request or completion slot */
	uint8_t type; /* data types, WR, WC, etc (M_WR_TYPE/M_WC_TYPE) */
	uint8_t flags; /* flags */

} __attribute__((packed)) wrc_idata_t;
+
/*
 * State/signal bits carried in the proxy WR flags word.
 * M_SEND_* apply to the proxy-out m_wr; M_READ_*/M_RECV_* apply to the
 * proxy-in m_wr_rx (flags travel with the WR across the fabric).
 */
enum mcm_wr_flags {
	M_SEND_POSTED = 1 << 0, /* m_wr already posted */
	M_SEND_CN_SIG = 1 << 1, /* m_wr consumer signaled, IB completion */
	M_SEND_CN_EAGER_SIG = 1 << 2, /* m_wr consumer eager signaled, SCIF read completion */
	M_SEND_MP_SIG = 1 << 3, /* m_wr mpxyd signaled, segmentation, manage proxy buf/wr resources */

	M_SEND_FS = 1 << 4, /* m_wr - first segment */
	M_SEND_LS = 1 << 5, /* m_wr - last segment */
	M_SEND_PI = 1 << 6, /* m_wr - forwarded to proxy in service */
	M_SEND_INLINE = 1 << 7, /* m_wr - data in cmd msg, no scif_readfrom */

	M_READ_PAUSED = 1 << 8, /* m_wr_rx waiting for proxy buffer */
	M_RECV_PAUSED = 1 << 9, /* m_wr_rx waiting for posted rcv message */
	M_READ_POSTED = 1 << 10, /* m_wr_rx ibv posted */
	M_READ_DONE = 1 << 11, /* m_wr_rx ibv completed */

	M_READ_WRITE_TO = 1 << 12, /* m_wr_rx read data forwarded to MIC scif_writeto */
	M_READ_WRITE_TO_DONE = 1 << 13, /* m_wr_rx read data forwarded to MIC scif_writeto */
	M_READ_CN_SIG = 1 << 14, /* m_wr_rx consumer signaled, IB completion needed */
	M_READ_MP_SIG = 1 << 15, /* m_wr_rx mpxyd signaled, segmentation, manage proxy buf/wr resources */

	M_READ_FROM_DONE = 1 << 16, /* m_wr mpxyd read_from_done, ready for posting */
	M_SEND_DIRECT = 1 << 17, /* m_wr SEND direct from host memory, no proxy out buffer */
};
+
/* 80 bytes (NOTE(review): depends on DAT_MIX_SGE_MAX * sizeof(dat_mix_sge) — verify) */
/* posted-receive bookkeeping for a consumer post_recv on the MIC side */
typedef struct mcm_sr {
	uint64_t wr_id; /* from consumer post_recv */
	uint32_t len; /* total len */
	uint32_t num_sge; /* number of sglist entries, max 4 */
	uint32_t m_idx; /* proxy buffer, src */
	uint32_t w_idx; /* wr_rx WR idx, data xfer in process */
	uint32_t s_idx; /* my idx, sr_tl update */
	struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; /* consumer buffer on MIC, off_t */
} mcm_sr_t;

/* 128 bytes (NOTE(review): likely stale — ibv_send_wr alone is ~128 bytes on x86-64; verify) */
/* proxy-out work request wrapper around the ibv send WR */
typedef struct mcm_wr {
	struct ibv_send_wr wr;			/* ibv send work request */
	struct ibv_sge sg[DAT_MIX_SGE_MAX];	/* local sge list */
	uint64_t org_id;			/* originating request id (carries this mcm_wr's address on the wire, see mcm_hton_wr_rx) */
	uint64_t context;			/* owner context */
	uint32_t m_idx;				/* proxy buffer index */
	uint32_t w_idx;				/* WR ring slot index */
	uint32_t flags;				/* enum mcm_wr_flags */
} mcm_wr_t;
+
/* DAT_MCM_PROXY_DATA private data max (40 bytes), Proxy-in WR and WC info exchange.
 * Describes one side's proxy-in WR and WC ring buffers (IB-registered,
 * see m_pi_create_wr_q/m_pi_create_wc_q) so the peer can RDMA write
 * into them.  Exchanged in CM private data via mcm_hton_wrc/mcm_ntoh_wrc. */
typedef struct mcm_wrc_info {
	uint64_t wr_addr;	/* WR ring base (mr->addr) */
	uint32_t wr_rkey;	/* WR ring rkey */
	uint32_t wr_len;	/* WR ring total bytes (wr_sz * entries) */
	uint16_t wr_sz;		/* per-entry size, ALIGN_64(sizeof(mcm_wr_rx)) */
	uint16_t wr_end;	/* entries - 1, index mask */
	uint64_t wc_addr;	/* WC ring base (mr->addr) */
	uint32_t wc_rkey;	/* WC ring rkey */
	uint32_t wc_len;	/* WC ring total bytes */
	uint16_t wc_sz;		/* per-entry size, ALIGN_64(sizeof(mcm_wc_rx)) */
	uint16_t wc_end;	/* entries - 1, index mask */
} __attribute__((packed)) mcm_wrc_info_t;
+
/* WR: 160 bytes, direct RDMA write from remote Proxy-in service */
typedef struct mcm_wr_rx {
	struct dat_mix_wr wr;			/* wire WR (converted by mcm_hton_wr_rx/mcm_ntoh_wr_rx) */
	struct dat_mix_sge sg[DAT_MIX_SGE_MAX];	/* [0] remote proxy-out buffer; see mcm_ntoh_wr_rx for [1]-[3] roles */
	uint64_t org_id;			/* remote proxy-out mcm_wr address, opaque here, echoed in WC */
	uint64_t context;			/* local QP context, set on receive */
	uint32_t m_idx;				/* proxy buffer index */
	uint32_t w_idx;				/* WR ring slot index (carries peer WC tail on the wire) */
	uint32_t s_idx;				/* sr slot index — TODO confirm */
	uint32_t flags;				/* enum mcm_wr_flags */
	uint32_t time;				/* profiling timestamp — unverified */
	uint32_t qcnt;				/* queue count — unverified */
} __attribute__((packed)) mcm_wr_rx_t;

/* WC: 80 bytes, direct RDMA write from remote Proxy-in service */
typedef struct mcm_wc_rx {
	struct dat_mix_wc wc;	/* completion; wc.wr_id carries the originating org_id */
	uint64_t org_id;
	uint64_t context;
	uint32_t wr_idx; /* proxy-out, proxy-in WR idx */
	uint32_t wr_tl; /* proxy-in WR tl update */
	uint32_t flags;
	uint8_t rsv[6];	/* reserved/padding */
} __attribute__((packed)) mcm_wc_rx_t;
+
+/* put WRC info to msg->p_proxy, network order, during outbound CM request or reply */
+static inline void mcm_hton_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src)
+{
+ if (src->wr_addr) {
+ dst->wr_addr = htonll(src->wr_addr);
+ dst->wr_rkey = htonl(src->wr_rkey);
+ dst->wr_len = htons(src->wr_len);
+ dst->wr_sz = htons(src->wr_sz);
+ dst->wr_end = htons(src->wr_end);
+ }
+ if (src->wc_addr) {
+ dst->wc_addr = htonll(src->wc_addr);
+ dst->wc_rkey = htonl(src->wc_rkey);
+ dst->wc_len = htons(src->wc_len);
+ dst->wc_sz = htons(src->wc_sz);
+ dst->wc_end = htons(src->wc_end);
+ }
+}
+
+/* get WRC info from msg->p_proxy, network order, during inbound CM request or reply */
+static inline void mcm_ntoh_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src)
+{
+ dst->wr_addr = ntohll(src->wr_addr);
+ dst->wr_rkey = ntohl(src->wr_rkey);
+ dst->wr_len = ntohs(src->wr_len);
+ dst->wr_sz = ntohs(src->wr_sz);
+ dst->wr_end = ntohs(src->wr_end);
+
+ dst->wc_addr = ntohll(src->wc_addr);
+ dst->wc_rkey = ntohl(src->wc_rkey);
+ dst->wc_len = ntohs(src->wc_len);
+ dst->wc_sz = ntohs(src->wc_sz);
+ dst->wc_end = ntohs(src->wc_end);
+}
+
/*
 * Construct a rx_wr in network order to send to remote proxy-in service.
 * Builds a wire-format mcm_wr_rx from the local proxy-out mcm_wr; the
 * result is RDMA-written into the remote proxy-in WR ring.  wc_tl
 * piggybacks the local WC tail in w_idx so the peer can retire WC slots;
 * org_id carries the local mcm_wr address for the returning completion.
 * NOTE(review): the printf() calls look like leftover debug tracing in a
 * header inline — consider the mlog/dapl_dbg facility instead.
 */
static inline void mcm_hton_wr_rx(struct mcm_wr_rx *m_wr_rx, struct mcm_wr *m_wr, int wc_tl)
{
	int i;
	memset((void*)m_wr_rx, 0, sizeof(*m_wr_rx));
	m_wr_rx->org_id = (uint64_t) htonll((uint64_t)m_wr); /* proxy_out WR */
	m_wr_rx->flags = htonl(m_wr->flags);
	m_wr_rx->w_idx = htonl(wc_tl); /* snd back wc tail */
	m_wr_rx->wr.num_sge = htonl(m_wr->wr.num_sge);
	m_wr_rx->wr.opcode = htonl(m_wr->wr.opcode);
	m_wr_rx->wr.send_flags = htonl(m_wr->wr.send_flags);
	m_wr_rx->wr.imm_data = htonl(m_wr->wr.imm_data);
	m_wr_rx->wr.wr.rdma.remote_addr = htonll(m_wr->wr.wr.rdma.remote_addr); /* final dst on MIC */
	m_wr_rx->wr.wr.rdma.rkey = htonl(m_wr->wr.wr.rdma.rkey);
	printf(" hton_wr_rx: op %x num_sge %d, raddr %Lx rkey %x\n",
		m_wr->wr.opcode, m_wr->wr.num_sge,
		(long long unsigned int)m_wr->wr.wr.rdma.remote_addr,
		m_wr->wr.wr.rdma.rkey);
	/* sges converted from the host-order source, so the loop bound is safe */
	for (i=0;i<m_wr->wr.num_sge;i++) {
		m_wr_rx->sg[i].addr = htonll(m_wr->sg[i].addr); /* proxy-out buffer */
		m_wr_rx->sg[i].lkey = htonl(m_wr->sg[i].lkey);
		m_wr_rx->sg[i].length = htonl(m_wr->sg[i].length);
		printf(" hton_wr_rx: [%d] addr %Lx key %x len %d\n",
			i, (long long unsigned int)m_wr->sg[i].addr,
			m_wr->sg[i].lkey, m_wr->sg[i].length);
	}
}
+
/*
 * Convert rx wr, arrived across fabric from remote proxy-out service in
 * network order.  In-place conversion; wr.num_sge is converted before it
 * bounds the sg[] loop, so the bound is host-order.
 * NOTE(review): printf() debug tracing in a header inline — see
 * mcm_hton_wr_rx note.
 */
static inline void mcm_ntoh_wr_rx(struct mcm_wr_rx *m_wr_rx)
{
	int i;
	m_wr_rx->org_id = ntohll(m_wr_rx->org_id); /* proxy_out WR */
	m_wr_rx->flags = ntohl(m_wr_rx->flags);
	m_wr_rx->w_idx = ntohl(m_wr_rx->w_idx); /* WC tail update from proxy_out */
	m_wr_rx->wr.num_sge = ntohl(m_wr_rx->wr.num_sge);
	m_wr_rx->wr.opcode = ntohl(m_wr_rx->wr.opcode);
	m_wr_rx->wr.send_flags = ntohl(m_wr_rx->wr.send_flags);
	m_wr_rx->wr.imm_data = ntohl(m_wr_rx->wr.imm_data);
	m_wr_rx->wr.wr.rdma.remote_addr = ntohll(m_wr_rx->wr.wr.rdma.remote_addr); /* final dest on MIC */
	m_wr_rx->wr.wr.rdma.rkey = ntohl(m_wr_rx->wr.wr.rdma.rkey);
	printf(" ntoh_wr_rx: op %x num_sge %d, raddr %Lx rkey %x\n",
		m_wr_rx->wr.opcode, m_wr_rx->wr.num_sge,
		(long long unsigned int)m_wr_rx->wr.wr.rdma.remote_addr,
		m_wr_rx->wr.wr.rdma.rkey);
	for (i=0;i<m_wr_rx->wr.num_sge;i++) {
		m_wr_rx->sg[i].addr = ntohll(m_wr_rx->sg[i].addr); /* proxy-out buffer segment, ibv */
		m_wr_rx->sg[i].lkey = ntohl(m_wr_rx->sg[i].lkey);
		m_wr_rx->sg[i].length = ntohl(m_wr_rx->sg[i].length);
		printf(" ntoh_wr_rx: [%d] addr %Lx key %x len %d\n",
			i, (long long unsigned int)m_wr_rx->sg[i].addr,
			m_wr_rx->sg[i].lkey, m_wr_rx->sg[i].length);
	}
	/* For HST->MXS sg[0-3] can be direct SRC segments for RR, all others will be 1 seg */
	/* sg[1] == proxy-in buffer segment, ibv */
	/* sg[2] == proxy-in scif sendto src segment, scif offset */
	/* sg[3] == proxy-in scif sendto dst segment, scif offset */
}
+
+/* construct a rx_wc in network order to send to remote proxy-in service */
+static inline void mcm_hton_wc_rx(struct mcm_wc_rx *m_wc_rx, struct mcm_wr_rx *m_wr_rx, int wr_tl, int status)
+{
+ memset((void*)m_wc_rx, 0, sizeof(*m_wc_rx));
+ m_wc_rx->wr_idx = htonl(m_wr_rx->w_idx); /* proxy-in WR idx == proxy-out WR idx */
+ m_wc_rx->wr_tl = htonl(wr_tl); /* proxy-in WR tail update, moves slower than proxy-out */
+ m_wc_rx->flags = htonl(m_wr_rx->flags);
+ m_wc_rx->wc.wr_id = htonll(m_wr_rx->org_id);
+ m_wc_rx->wc.status = htonl(status);
+ m_wc_rx->wc.byte_len = htonl(m_wr_rx->sg[0].length);
+ if (m_wr_rx->wr.send_flags & IBV_WR_RDMA_WRITE)
+ m_wc_rx->wc.opcode = htonl(IBV_WC_RDMA_WRITE);
+ else
+ m_wc_rx->wc.opcode = htonl(IBV_WC_SEND);
+}
+
/*
 * Convert rx wc, arrived across fabric from remote proxy-in service in
 * network order.  In-place conversion of the fields mcm_hton_wc_rx
 * populated; used by the proxy-out completion path (m_po_wc_event).
 */
static inline void mcm_ntoh_wc_rx(struct mcm_wc_rx *m_wc_rx)
{
	m_wc_rx->wr_idx = ntohl(m_wc_rx->wr_idx);
	m_wc_rx->wr_tl = ntohl(m_wc_rx->wr_tl);
	m_wc_rx->flags = ntohl(m_wc_rx->flags);
	m_wc_rx->wc.wr_id = ntohll(m_wc_rx->wc.wr_id);
	m_wc_rx->wc.status = ntohl(m_wc_rx->wc.status);
	m_wc_rx->wc.byte_len = ntohl(m_wc_rx->wc.byte_len);
	m_wc_rx->wc.opcode = ntohl(m_wc_rx->wc.opcode);
}
+
/*
 * Initialize a dat_mix_wr (MIX channel WR) from an ibv_send_wr:
 * zero the whole struct, then copy the common scalar fields and the
 * rdma remote_addr/rkey.  The sg list is not copied here — presumably
 * handled separately by the caller; TODO confirm.
 */
static inline void mcm_const_mix_wr(struct dat_mix_wr *mwr, struct ibv_send_wr *iwr)
{
	memset((void*)mwr, 0, sizeof(*mwr));
	mwr->wr_id = iwr->wr_id;
	mwr->num_sge = iwr->num_sge;
	mwr->opcode = iwr->opcode;
	mwr->send_flags = iwr->send_flags;
	mwr->imm_data = iwr->imm_data;
	mwr->wr.rdma.remote_addr = iwr->wr.rdma.remote_addr;
	mwr->wr.rdma.rkey = iwr->wr.rdma.rkey;
}
+
+static inline void mcm_const_ib_wc(struct ibv_wc *iwc, struct dat_mix_wc *mwc, int entries)
+{
+ int i;
+
+ for (i=0;i<entries;i++) {
+ memset((void*)&iwc[i].wr_id, 0, sizeof(*iwc));
+ iwc[i].wr_id = mwc[i].wr_id;
+ iwc[i].status = mwc[i].status;
+ iwc[i].opcode = mwc[i].opcode;
+ iwc[i].vendor_err = mwc[i].vendor_err;
+ iwc[i].byte_len = mwc[i].byte_len;
+ iwc[i].imm_data = mwc[i].imm_data;
+ iwc[i].qp_num = mwc[i].qp_num;
+ iwc[i].src_qp = mwc[i].src_qp;
+ iwc[i].wc_flags = mwc[i].wc_flags;
+ iwc[i].pkey_index = mwc[i].pkey_index;
+ iwc[i].slid = mwc[i].slid;
+ iwc[i].sl = mwc[i].sl;
+ iwc[i].dlid_path_bits = mwc[i].dlid_path_bits;
+ }
+}
+
+#endif /* _MPXY_H_ */
void m_pi_destroy_wc_q(struct mcm_qp *m_qp)
{
- mlog(2, " Destroying QP %p PI WC_q %p\n", m_qp, m_qp->wc_rbuf);
+ mlog(2, " Destroying QP %p PI WC_q %p\n", m_qp, m_qp->wrc.wc_addr);
if (m_qp->wc_rbuf_mr) {
ibv_dereg_mr(m_qp->wc_rbuf_mr);
m_qp->wc_rbuf_mr = NULL;
}
- if (m_qp->wc_rbuf) {
- free(m_qp->wc_rbuf);
- m_qp->wc_rbuf = 0;
+ if (m_qp->wrc.wc_addr) {
+ free((void*)m_qp->wrc.wc_addr);
+ m_qp->wrc.wc_addr = 0;
}
}
ibv_dereg_mr(m_qp->wr_rbuf_mr);
m_qp->wr_rbuf_mr = NULL;
}
- if (m_qp->wr_rbuf) {
- free(m_qp->wr_rbuf);
- m_qp->wr_rbuf = 0;
+ if (m_qp->wrc.wr_addr) {
+ free((void*)m_qp->wrc.wr_addr);
+ m_qp->wrc.wr_addr = 0;
}
m_pi_destroy_wc_q(m_qp);
int m_pi_create_wr_q(struct mcm_qp *m_qp, int entries)
{
/* RDMA proxy WR pool, register with SCIF and IB, set pool and segm size with parameters */
- m_qp->wr_rbuf_sz = ALIGN_64(sizeof(struct mcm_wr_rx));
- m_qp->wr_rbuf_len = m_qp->wr_rbuf_sz * entries; /* 64 byte aligned for signal_fence */
- m_qp->wr_rbuf_end = entries - 1;
+ m_qp->wrc.wr_sz = ALIGN_64(sizeof(struct mcm_wr_rx));
+ m_qp->wrc.wr_len = m_qp->wrc.wr_sz * entries; /* 64 byte aligned for signal_fence */
+ m_qp->wrc.wr_end = entries - 1;
m_qp->wr_hd_r = 0;
m_qp->wr_tl_r = 0;
m_qp->wr_tl_r_wt = 1; /* start at tl+1 */
- if (posix_memalign((void **)&m_qp->wr_rbuf, 4096, ALIGN_PAGE(m_qp->wr_rbuf_len))) {
+ if (posix_memalign((void **)&m_qp->wrc.wr_addr, 4096, ALIGN_PAGE(m_qp->wrc.wr_len))) {
mlog(0, "failed to allocate wr_rbuf, m_qp=%p, wr_len=%d, entries=%d\n",
- m_qp, m_qp->wr_rbuf_len, entries);
+ m_qp, m_qp->wrc.wr_len, entries);
return -1;
}
- memset(m_qp->wr_rbuf, 0, ALIGN_PAGE(m_qp->wr_rbuf_len));
+ memset((void*)m_qp->wrc.wr_addr, 0, ALIGN_PAGE(m_qp->wrc.wr_len));
mlog(4, " WR rbuf pool %p, LEN req=%d, act=%d\n",
- m_qp->wr_rbuf, m_qp->wr_rbuf_len, ALIGN_PAGE(m_qp->wr_rbuf_len) );
+ m_qp->wrc.wr_addr, m_qp->wrc.wr_len, ALIGN_PAGE(m_qp->wrc.wr_len) );
- m_qp->wr_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, m_qp->wr_rbuf, m_qp->wr_rbuf_len,
+ m_qp->wr_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, (void*)m_qp->wrc.wr_addr, m_qp->wrc.wr_len,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
if (!m_qp->wr_rbuf_mr) {
mlog(0, " IB_register addr=%p,%d failed %s\n",
- m_qp->wr_rbuf, ALIGN_PAGE(m_qp->wr_rbuf_len), strerror(errno));
+ m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len), strerror(errno));
return -1;;
}
+ m_qp->wrc.wr_addr = (uint64_t)(uintptr_t)m_qp->wr_rbuf_mr->addr;
+ m_qp->wrc.wr_rkey = m_qp->wr_rbuf_mr->rkey;
+
mlog(4, " IB_mr for wr_buf addr %p, off 0x%llx, len %d, entries %d, rkey %x lkey %x\n",
- m_qp->wr_rbuf, m_qp->wr_rbuf_mr->addr, ALIGN_PAGE(m_qp->wr_rbuf_len),
+ m_qp->wrc.wr_addr, m_qp->wr_rbuf_mr->addr, ALIGN_PAGE(m_qp->wrc.wr_len),
entries, m_qp->wr_rbuf_mr->rkey, m_qp->wr_rbuf_mr->rkey);
- m_qp->wr_off_r = scif_register(m_qp->smd->scif_tx_ep, m_qp->wr_rbuf, ALIGN_PAGE(m_qp->wr_rbuf_len),
+ m_qp->wr_off_r = scif_register(m_qp->smd->scif_tx_ep, (void*)m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len),
(off_t)0, SCIF_PROT_READ | SCIF_PROT_WRITE, 0);
if (m_qp->wr_off_r == (off_t)(-1)) {
mlog(0, " SCIF_register addr=%p,%d failed %s\n",
- m_qp->wr_rbuf, ALIGN_PAGE(m_qp->wr_rbuf_len), strerror(errno));
+ m_qp->wrc.wr_addr, ALIGN_PAGE(m_qp->wrc.wr_len), strerror(errno));
return -1;
}
- mlog(4, " WR rbuf pool %p, LEN req=%d, act=%d\n", m_qp->wr_buf, m_qp->wr_len, ALIGN_PAGE(m_qp->wr_len));
+ mlog(4, " WR rbuf pool %p, LEN req=%d, act=%d\n", m_qp->wr_buf, m_qp->wr_len, ALIGN_PAGE(m_qp->wrc.wr_len));
mlog(4, " SCIF_mr for wr_rbuf addr %p, off 0x%llx, len %d, entries %d\n",
- m_qp->wr_rbuf, m_qp->wr_off_r, ALIGN_PAGE(m_qp->wr_rbuf_len), entries);
+ m_qp->wrc.wr_addr, m_qp->wr_off_r, ALIGN_PAGE(m_qp->wrc.wr_len), entries);
return 0;
}
int m_pi_create_wc_q(struct mcm_qp *m_qp, int entries)
{
/* RDMA proxy WC pool, register with SCIF and IB, set pool and segm size with parameters */
- m_qp->wc_rbuf_sz = ALIGN_64(sizeof(struct mcm_wc_rx));
- m_qp->wc_rbuf_len = m_qp->wc_rbuf_sz * entries; /* 64 byte aligned for signal_fence */
- m_qp->wc_rbuf_end = entries - 1;
+ m_qp->wrc.wc_sz = ALIGN_64(sizeof(struct mcm_wc_rx));
+ m_qp->wrc.wc_len = m_qp->wrc.wc_sz * entries; /* 64 byte aligned for signal_fence */
+ m_qp->wrc.wc_end = entries - 1;
m_qp->wc_hd_rem = 0;
m_qp->wc_tl_rem = 0;
- if (posix_memalign((void **)&m_qp->wc_rbuf, 4096, ALIGN_PAGE(m_qp->wc_rbuf_len))) {
+ if (posix_memalign((void **)&m_qp->wrc.wc_addr, 4096, ALIGN_PAGE(m_qp->wrc.wc_len))) {
mlog(0, "failed to allocate wc_rbuf, m_qp=%p, wc_len=%d, entries=%d\n",
- m_qp, m_qp->wc_rbuf_len, entries);
+ m_qp, m_qp->wrc.wc_len, entries);
return -1;
}
- memset(m_qp->wc_rbuf, 0, ALIGN_PAGE(m_qp->wc_rbuf_len));
+ memset((void*)m_qp->wrc.wc_addr, 0, ALIGN_PAGE(m_qp->wrc.wc_len));
mlog(4, " WC rbuf pool %p, LEN req=%d, act=%d\n",
- m_qp->wc_rbuf, m_qp->wc_rbuf_len, ALIGN_PAGE(m_qp->wc_rbuf_len) );
+ m_qp->wrc.wc_addr, m_qp->wrc.wc_len, ALIGN_PAGE(m_qp->wrc.wc_len));
- m_qp->wc_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, m_qp->wc_rbuf, m_qp->wc_rbuf_len,
+ m_qp->wc_rbuf_mr = ibv_reg_mr(m_qp->smd->md->pd, (void*)m_qp->wrc.wc_addr, m_qp->wrc.wc_len,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
if (!m_qp->wc_rbuf_mr) {
mlog(0, " IB_register addr=%p,%d failed %s\n",
- m_qp->wc_rbuf, ALIGN_PAGE(m_qp->wc_rbuf_len), strerror(errno));
+ m_qp->wrc.wc_addr, ALIGN_PAGE(m_qp->wrc.wc_len), strerror(errno));
return -1;
}
+ m_qp->wrc.wc_addr = (uint64_t)(uintptr_t)m_qp->wc_rbuf_mr->addr;
+ m_qp->wrc.wc_rkey = m_qp->wc_rbuf_mr->rkey;
- mlog(4, " IB_mr for wc_buf addr %p, off 0x%llx, len %d, entries %d rkey %x lkey %x\n",
- m_qp->wc_rbuf, m_qp->wc_rbuf_mr->addr, ALIGN_PAGE(m_qp->wc_rbuf_len),
+ mlog(4, " IB_mr for wc_buf addr %p, mr 0x%llx, len %d, entries %d rkey %x lkey %x\n",
+ m_qp->wrc.wc_addr, m_qp->wc_rbuf_mr->addr, ALIGN_PAGE(m_qp->wrc.wc_len),
entries, m_qp->wc_rbuf_mr->rkey, m_qp->wc_rbuf_mr->lkey);
return 0;
idx = (idx + 1) & smd->m_buf_end_r;
}
for (idx = m_qp->wr_tl_r;;) {
- wr_rx = (struct mcm_wr_rx *)(m_qp->wr_rbuf + (m_qp->wr_rbuf_sz * idx));
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * idx));
mlog(8, " RX_wr[%d] %p RR(%d,%d,%d): WT(%d,%d) flgs %x op %x"
" tl %d tl_wt %d hd %d m_idx %x-%x ln %d\n",
wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
m_qp->wr_tl_r_wt, m_qp->wr_hd_r, wr_rx->m_idx - wr_rx->sg[0].length,
wr_rx->m_idx, wr_rx->sg[0].length);
- idx = (idx + 1) & m_qp->wr_rbuf_end;
+ idx = (idx + 1) & m_qp->wrc.wr_end;
if (idx == m_qp->wr_hd_r)
break;
}
struct ibv_qp *ib_qp;
int i;
- /* MXS to non-MXS, PI service will be on QP1 */
- if (MXS_EP(&m_qp->smd->md->addr) && !MXS_EP(&m_qp->cm->msg.daddr1))
- ib_qp = m_qp->ib_qp1;
+ /* MXS -> MSS or HST, PI service will be on QP1 */
+ if (MXS_EP(&m_qp->smd->md->addr) &&
+ (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
+ ib_qp = m_qp->ib_qp1;
else
ib_qp = m_qp->ib_qp2;
struct ibv_qp *ib_qp;
int wc_idx, ret;
- mlog(0x10," WC_rem: wr_rx[%d] %p, wc_idx %d flgs %x, WR_r tl %d-%d wt %d hd %d\n",
+ mlog(0x10," WC_rem: wr_rx[%d] %p, wc_idx %d flgs 0x%x, WR_r tl %d-%d wt %d hd %d\n",
wr_rx->w_idx, wr_rx, m_qp->wc_hd_rem, wr_rx->flags,
m_qp->wr_tl_r, wr_rx->w_idx, m_qp->wr_tl_r_wt, m_qp->wr_hd_r);
/* local WR and remote WR are serialized, should never reach tail of remote WR */
- if (((m_qp->wc_hd_rem + 1) & m_qp->wc_end_rem) == m_qp->wc_tl_rem) {
+ if (((m_qp->wc_hd_rem + 1) & m_qp->wrc.wc_end) == m_qp->wc_tl_rem) {
mlog(0, " ERR: m_qp %p stalled, peer proxy-out WC queue full hd %d == tl %d\n",
m_qp, m_qp->wc_hd_rem, m_qp->wc_tl_rem);
return -1;
}
- m_qp->wc_hd_rem = (m_qp->wc_hd_rem + 1) & m_qp->wc_end_rem; /* move remote wc_hd */
+ m_qp->wc_hd_rem = (m_qp->wc_hd_rem + 1) & m_qp->wrc.wc_end; /* move remote wc_hd */
m_qp->wr_tl_r = wr_rx->w_idx; /* move wr_rx tail */
wc_idx = m_qp->wc_hd_rem;
wrc.id = (uint16_t)wc_idx; /* imm_data for proxy_out rcv engine */
wrc.type = M_WC_TYPE;
wrc.flags = 0;
- const_rx_wc(&wc_rx, wr_rx, m_qp->wr_tl_r, status);
+ mcm_hton_wc_rx(&wc_rx, wr_rx, m_qp->wr_tl_r, status);
/* P2P on same system, keep it local */
if (htonll(m_qp->cm->msg.sys_guid) == system_guid)
wr.num_sge = 1;
wr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE; /* mcm_wc_rx, 160 bytes */
wr.wr_id = WRID_SET(wr_rx, WRID_RX_RW_IMM);
- wr.wr.rdma.rkey = m_qp->wc_rkey_rem;
+ wr.wr.rdma.rkey = m_qp->wrc_rem.wc_rkey;
wr.wr.rdma.remote_addr = (uint64_t)(uintptr_t)((struct mcm_wc_rx *)
- (m_qp->wc_addr_rem + (m_qp->wc_sz_rem * wc_idx)));
+ (m_qp->wrc_rem.wc_addr + (m_qp->wrc_rem.wc_sz * wc_idx)));
wr.sg_list = &sge;
sge.addr = (uint64_t)(uintptr_t) &wc_rx;
sge.length = (uint32_t) sizeof(struct mcm_wc_rx);
wr.send_flags, ntohl(wr.imm_data), wr.wr.rdma.remote_addr,
wr.wr.rdma.rkey, sge.length);
- /* xsocket to same socket, PI on QP1 and PO on QP2 */
- if (MXS_EP(&m_qp->smd->md->addr) && MSS_EP(&m_qp->cm->msg.daddr1))
- ib_qp = m_qp->ib_qp1;
+ /* MXS -> MSS or HST, PI service will be on QP1 */
+ if (MXS_EP(&m_qp->smd->md->addr) &&
+ (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
+ ib_qp = m_qp->ib_qp1;
else
ib_qp = m_qp->ib_qp2;
}
m_qp->pi_rw_cnt++;
- mlog(4," WC_rem hd %d tl %d\n", m_qp->wc_hd_rem, m_qp->wc_tl_rem );
+ mlog(4," WC_rem hd %d tl %d, qpn 0x%x\n", m_qp->wc_hd_rem, m_qp->wc_tl_rem, ib_qp->qp_num);
return 0;
}
wr_idx = m_qp->wr_tl_r_wt; /* from WT tail, process RR's posted until reaching wr_last */
while (m_qp->pi_rr_cnt) { /* RR's pending */
- wr_rx = (struct mcm_wr_rx *)(m_qp->wr_rbuf + (m_qp->wr_rbuf_sz * wr_idx));
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
if (!(wr_rx->flags & M_READ_POSTED)) {
if (!m_qp->wt_err) { /* first error dump list */
wr_idx = m_qp->wr_tl_r;
while (m_qp->pi_rr_cnt) {
- wr_rx = (struct mcm_wr_rx *)(m_qp->wr_rbuf + (m_qp->wr_rbuf_sz * wr_idx));
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
mlog(0, " wr[%d] %p RR(%d,%d,%d) flg %x tl %d tl_wt %d hd %d\n",
wr_rx->w_idx, wr_rx, m_qp->post_cnt_rr, m_qp->stall_cnt_rr,
m_qp->pi_rr_cnt, wr_rx->flags, m_qp->wr_tl_r, m_qp->wr_tl_r_wt,
if (wr_idx == m_qp->wr_hd_r)
break;
- wr_idx = (wr_idx + 1 ) & m_qp->wr_rbuf_end; /* next */
+ wr_idx = (wr_idx + 1 ) & m_qp->wrc.wr_end; /* next */
}
m_qp->wt_err = 1;
}
if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
break;
- wr_idx = (wr_idx + 1) & m_qp->wr_rbuf_end; /* next WR */
+ wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
continue;
}
wr_cnt++;
/* sg[2] entry == proxy-in buffer src for scif_sendto */
/* wr.rdma.remote_addr, wr.rdma.rkey, dst for scif_sento - TPT to sci_off */
wr_rx->wr.wr_id = 0;
- l_off_wr = (uint64_t) (m_qp->wr_off_r + (wr_rx->w_idx * m_qp->wr_rbuf_sz));
+ l_off_wr = (uint64_t) (m_qp->wr_off_r + (wr_rx->w_idx * m_qp->wrc.wr_sz));
l_off = wr_rx->sg[2].addr;
l_len = wr_rx->sg[2].length;
l_start = l_off - (uint64_t)smd->m_offset_r;
if (wr_idx == wr_sig->w_idx || wr_idx == m_qp->wr_hd_r)
break;
- wr_idx = (wr_idx + 1) & m_qp->wr_rbuf_end; /* next WR */
+ wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next WR */
}
#if MCM_PROFILE_DBG
m_qp->wt_last_sig = wr_sig->w_idx;
wr_rx, sbuf, sbuf[0], &sbuf[wr_rx->sg[1].length],
sbuf[wr_rx->sg[1].length], wr_rx->sg[1].length, wr_rx->sg[1].lkey);
mlog(0, " WR ERR: wr_id %Lx sglist %p sge %d op %d flgs"
- " %d idata 0x%x raddr %p rkey %x \n",
+ " %d idata 0x%x raddr %p rkey %x saddr %p key %x ln %d\n",
wr_rx->org_id, wr_rx->sg, wr_rx->wr.num_sge,
wr_rx->wr.opcode, wr_rx->wr.send_flags, wr_rx->wr.imm_data,
- wr_rx->wr.wr.rdma.remote_addr, wr_rx->wr.wr.rdma.rkey);
+ wr_rx->wr.wr.rdma.remote_addr, wr_rx->wr.wr.rdma.rkey,
+ wr_rx->sg[0].addr, wr_rx->sg[0]. lkey,wr_rx->sg[0].length);
/* send WC with ERR to RW initiator, hold rxlock */
if (m_pi_send_wc(m_qp, wr_rx, wc->status))
smd->m_hd_r = l_end;
mpxy_unlock(&smd->rblock);
- /* xsocket to same socket, PI on QP1 and PO on QP2 */
- if (MXS_EP(&m_qp->smd->md->addr) && MSS_EP(&m_qp->cm->msg.daddr1))
- ib_qp = m_qp->ib_qp1;
+ /* MXS -> MSS or HST, PI service will be on QP1 */
+ if (MXS_EP(&m_qp->smd->md->addr) &&
+ (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
+ ib_qp = m_qp->ib_qp1;
else
ib_qp = m_qp->ib_qp2;
if (wrc->type == M_WR_TYPE) {
struct mcm_wr_rx *wr_rx;
- if (wrc->id > m_qp->wr_rbuf_end) {
+ if (wrc->id > m_qp->wrc.wr_end) {
mlog(0," RX imm_data: WR id out of range %x > %x \n",
- wrc->id, m_qp->wr_rbuf_end);
+ wrc->id, m_qp->wrc.wr_end);
return;
}
- wr_rx = (struct mcm_wr_rx *)(m_qp->wr_rbuf + (m_qp->wr_rbuf_sz * wrc->id));
- ntoh_rx_wr(wr_rx); /* received in network order, convert */
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wrc->id));
+ mcm_ntoh_wr_rx(wr_rx); /* received in network order, convert */
wr_rx->context = (uint64_t)(uintptr_t)m_qp; /* local side QP context */
wr_rx->m_idx = 0;
struct mcm_wc_rx *m_wc;
/* work completion of rdma_write sent to remote proxy-in */
- if (wrc->id > m_qp->wc_rbuf_end) {
+ if (wrc->id > m_qp->wrc.wc_end) {
mlog(0," RX imm_data: WC id out of range %x > %x \n",
- wrc->id, m_qp->wc_rbuf_end);
+ wrc->id, m_qp->wrc.wc_end);
return;
}
- m_wc = (struct mcm_wc_rx *)(m_qp->wc_rbuf + (m_qp->wc_rbuf_sz * wrc->id));
- ntoh_rx_wc(m_wc); /* convert received WC contents */
+ m_wc = (struct mcm_wc_rx *)(m_qp->wrc.wc_addr + (m_qp->wrc.wc_sz * wrc->id));
+ mcm_ntoh_wc_rx(m_wc); /* convert received WC contents */
/* work completion for proxy_out service */
m_po_wc_event(m_qp, m_wc, wrc->id);
r_wr.num_sge = 0;
r_wr.wr_id = (uint64_t)(uintptr_t) m_qp;
- /* xsocket to same socket, PI on QP1 and PO on QP2 */
- if (MXS_EP(&m_qp->smd->md->addr) && MSS_EP(&m_qp->cm->msg.daddr1))
- ib_qp = m_qp->ib_qp1;
+ /* MXS -> MSS or HST, PI service will be on QP1 */
+ if (MXS_EP(&m_qp->smd->md->addr) &&
+ (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
+ ib_qp = m_qp->ib_qp1;
else
ib_qp = m_qp->ib_qp2;
int wr_idx, wr_max, wr_cnt;
if (m_qp->cm && m_qp->cm->state != MCM_CONNECTED) {
- mlog(8," !CONN: qp %p cm %p %s tl_r %d wt_tl_r %d hd_r %d pp %d st %d data %d\n",
- m_qp, m_qp->cm, m_qp->cm ? mcm_state_str(m_qp->cm->state):"",
- m_qp->wr_tl_r, m_qp->wr_tl_r_wt,
- m_qp->wr_hd_r, m_qp->post_cnt_wt,
- m_qp->stall_cnt_rr, *data);
+ if (m_qp->post_cnt_wt) {
+ mlog(8," !CONN: qp %p cm %p %s tl_r %d wt_tl_r %d hd_r %d pp %d st %d data %d\n",
+ m_qp, m_qp->cm, m_qp->cm ? mcm_state_str(m_qp->cm->state):"",
+ m_qp->wr_tl_r, m_qp->wr_tl_r_wt,
+ m_qp->wr_hd_r, m_qp->post_cnt_wt,
+ m_qp->stall_cnt_rr, *data);
+ }
return;
}
while (--wr_max && (m_qp->post_cnt_wt || m_qp->stall_cnt_rr)) {
- wr_rx = (struct mcm_wr_rx *)(m_qp->wr_rbuf + (m_qp->wr_rbuf_sz * wr_idx));
+ wr_rx = (struct mcm_wr_rx *)(m_qp->wrc.wr_addr + (m_qp->wrc.wr_sz * wr_idx));
if (wr_rx->flags & M_READ_WRITE_TO_DONE) {
if (wr_idx == m_qp->wr_hd_r)
goto done;
- wr_idx = (wr_idx + 1) & m_qp->wr_rbuf_end; /* next */
+ wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next */
continue;
}
wr_cnt++;
m_pi_free_sr(m_qp, m_sr);
}
- /* Last Segment or peer PO wants signaled */
- if ((wr_rx->flags & M_SEND_LS) || (wr_rx->flags & M_SEND_MP_SIG)) {
+ /* Last Segment and !DIRECT (no segments) or peer PO wants signaled */
+ if ((wr_rx->flags & M_SEND_MP_SIG) ||
+ ((wr_rx->flags & M_SEND_LS) && !(wr_rx->flags & M_SEND_DIRECT))) {
mlog(4, "WR_rx[%d] wr %p LastSeg: send WC! tl %d hd %d\n",
wr_rx->w_idx, wr_rx, m_qp->wr_tl_r, m_qp->wr_hd_r);
if (wr_idx == m_qp->wr_hd_r) /* reached head */
goto done;
- wr_idx = (wr_idx + 1) & m_qp->wr_rbuf_end; /* next */
+ wr_idx = (wr_idx + 1) & m_qp->wrc.wr_end; /* next */
if (smd->destroy) {
mlog(0, " SMD destroy - QP %p hd %d tl %d pst %d,%d cmp %d, pp %d, data %d\n",
/* proxy m_wr over to remote m_wr_rem slot, remote will initiate RR and send back WC */
m_wr->flags |= M_SEND_PI;
- const_rx_wr(&wr_rx, m_wr, m_qp); /* build rx_wr for wire transfer, send it */
+ mcm_hton_wr_rx(&wr_rx, m_wr, m_qp->wc_tl); /* build rx_wr for wire transfer, send it */
wrc.id = (uint16_t)wr_idx; /* setup imm_data for proxy_in rcv engine */
wrc.type = M_WR_TYPE;
wr.send_flags = m_wr->wr.send_flags | IBV_SEND_INLINE | IBV_SEND_SIGNALED; /* m_wr_rx, 148 bytes */
wr.imm_data = htonl(*(uint32_t *)&wrc);
- wr.wr.rdma.rkey = m_qp->wr_rkey_rem;
+ wr.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
wr.wr.rdma.remote_addr =
(uint64_t)(uintptr_t)
- ((struct mcm_wr_rx *) (m_qp->wr_addr_rem + (m_qp->wr_sz_rem * wr_idx) ));
+ ((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
sge.addr = (uint64_t)(uintptr_t) &wr_rx;
sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
sge.lkey = 0; /* inline doesn't need registered */
- /* MXS to same socket, PI on QP1 and PO on QP2 */
- if (MSS_EP(&m_qp->cm->msg.daddr1))
- ib_qp = m_qp->ib_qp1;
+ /* MXS -> MSS or HST, PI service will be on QP1 */
+ if (MXS_EP(&m_qp->smd->md->addr) &&
+ (MSS_EP(&m_qp->cm->msg.daddr1) || HST_EP(&m_qp->cm->msg.daddr1)))
+ ib_qp = m_qp->ib_qp1;
else
ib_qp = m_qp->ib_qp2;
wr_max = 40;
wr_idx = m_qp->wr_tl_rf;
- while (wr_max) {
+ while (wr_max && m_qp->wr_pp) {
cn_signal = 0; posted = 0; poll_cnt = 100;
m_wr = (struct mcm_wr *)(m_qp->wr_buf + (m_qp->wr_sz * wr_idx));
seg_len, retries, m_qp->post_cnt,
m_qp->post_sig_cnt, m_qp->comp_cnt, m_qp->wr_pp);
}
- m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
mpxy_unlock(&m_qp->txlock);
write(smd->md->mc->tx_pipe[1], "w", sizeof("w"));
mpxy_lock(&m_qp->txlock);
+ if (m_wr->flags & M_SEND_LS)
+ goto bail;
+
/* prepare the next WR */
+ m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wr_end; /* move hd */
m_wr = (struct mcm_wr *)(m_qp->wr_buf + (m_qp->wr_sz * m_qp->wr_hd));
m_sge = m_wr->sg;
m_wr->org_id = pmsg->wr.wr_id;
mpxy_unlock(&m_qp->smd->tblock);
}
- if (m_qp->wr_addr_rem) /* remote MXS: sync PO WR tail with remote PI WR tail */
+ if (m_qp->wrc_rem.wr_addr) /* remote MXS: sync PO WR tail with remote PI WR tail */
m_qp->wr_tl = wc_rx->wr_tl;
m_qp->wc_tl = wc_idx; /* move local wc_tl, for wc_tl_rem on peer PI service */
*/
void m_req_event(struct mcm_cq *m_cq)
{
- struct ibv_cq *ib_cq;
+ struct ibv_cq *ib_cq = NULL;
struct mcm_qp *m_qp;
struct mcm_wr *m_wr;
struct mcm_wr_rx *m_wr_rx;
struct ibv_wc wc[DAT_MIX_WC_MAX];
struct dat_mix_wc wc_ev[DAT_MIX_WC_MAX];
- ret = ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx);
- if (ret == 0)
- ibv_ack_cq_events(m_cq->ib_cq, 1);
+ ibv_get_cq_event(m_cq->ib_ch, &ib_cq, (void *)&cq_ctx);
+ if (ib_cq && (ib_cq != m_cq->ib_cq))
+ mlog(1," WARNING: ib_cq %p != m_cq->ib_cq %p\n", ib_cq, m_cq->ib_cq);
retry:
ret = ibv_poll_cq(m_cq->ib_cq, DAT_MIX_WC_MAX, wc);
notify = 1;
goto retry;
}
+ if (ib_cq)
+ ibv_ack_cq_events(ib_cq, 1);
return;
} else
notify = 0;
int scif_listen_qlen = 64;
int mix_buffer_mb = 64;
int mix_buffer_sg = 262144;
+int mix_buffer_sg_po2 = 18; /* 256KB */
int mcm_set_priority = 0; /* set to SCHED_FIFO */
int mcm_affinity = 1;
int mcm_affinity_base_mic = 1;
#include <infiniband/verbs.h>
#include "dat2/udat.h"
#include "dat2/dat_mic_extensions.h"
+#include "mpxy.h"
#define min(a, b) ((a < b) ? (a) : (b))
#define max(a, b) ((a > b) ? (a) : (b))
-#if __BYTE_ORDER == __BIG_ENDIAN
-#define htonll(x) (x)
-#define ntohll(x) (x)
-#elif __BYTE_ORDER == __LITTLE_ENDIAN
-#define htonll(x) bswap_64(x)
-#define ntohll(x) bswap_64(x)
-#endif
-
#define MCM_IB_INLINE 160
#define MIX_MAX_MSG_SIZE (8*1024*1024)
} mcm_ib_dev_t;
-/* WRC (work request/completion) imm_data definition, QP ref is in IB wc if sharing CQ */
-#define WRC_MAX_QLEN 1 << 16;
-
-/* types */
-#define M_WR_TYPE 1
-#define M_WC_TYPE 2
-
-/* WR flags */
-#define M_WR_FS 1
-#define M_WR_LS 2
-
-#define WRC_ID_DATA(x) ((x) & 0x0000ffff)
-#define WRC_TYPE_DATA(x) (((x) >> 16) & 0x000000ff)
-#define WRC_FLAGS_DATA(x) (((x) >> 24) & 0x000000ff)
-
-/* wr aligned on 64 bytes, use 4 lower bits for type id */
-#define WRID_TX_RW 0x1 /* proxy out, m_wr type, RW */
-#define WRID_TX_RW_IMM 0x2 /* proxy out, m_wr type, RW_imm op */
-#define WRID_RX_RR 0x3 /* proxy in, m_wr_rx type, RR op */
-#define WRID_RX_RW_IMM 0x4 /* proxy in, m_wr_rx type, RW_immed op */
-#define WRID_MASK 0xfffffffffffffff0
-#define WRID_SET(x,y) (((uint64_t)(x) | (uint64_t)(y)))
-#define WRID_TYPE(x) ((x & ~WRID_MASK))
-#define WRID_ADDR(x) ((x & WRID_MASK))
-
-typedef struct wrc_idata {
-
- uint16_t id; /* work request or completion slot */
- uint8_t type; /* data types, WR, WC, etc */
- uint8_t flags; /* flags */
-
-} __attribute__((packed)) wrc_idata_t;
-
-enum mcm_wr_flags {
- M_SEND_POSTED = 1 << 0, /* m_wr already posted */
- M_SEND_CN_SIG = 1 << 1, /* m_wr consumer signaled, IB completion */
- M_SEND_CN_EAGER_SIG = 1 << 2, /* m_wr consumer eager signaled, SCIF read completion */
- M_SEND_MP_SIG = 1 << 3, /* m_wr mpxyd signaled, segmentation, manage proxy buf/wr resources */
-
- M_SEND_FS = 1 << 4, /* m_wr - first segment */
- M_SEND_LS = 1 << 5, /* m_wr - last segment */
- M_SEND_PI = 1 << 6, /* m_wr - forwarded to proxy in service */
- M_SEND_INLINE = 1 << 7, /* m_wr - data in cmd msg, no scif_readfrom */
-
- M_READ_PAUSED = 1 << 8, /* m_wr_rx waiting for proxy buffer */
- M_RECV_PAUSED = 1 << 9, /* m_wr_rx waiting for posted rcv message */
- M_READ_POSTED = 1 << 10, /* m_wr_rx ibv posted */
- M_READ_DONE = 1 << 11, /* m_wr_rx ibv completed */
-
- M_READ_WRITE_TO = 1 << 12, /* m_wr_rx read data forwarded to MIC scif_writeto */
- M_READ_WRITE_TO_DONE = 1 << 13, /* m_wr_rx read data forwarded to MIC scif_writeto */
- M_READ_CN_SIG = 1 << 14, /* m_wr_rx consumer signaled, IB completion needed */
- M_READ_MP_SIG = 1 << 15, /* m_wr_rx mpxyd signaled, segmentation, manage proxy buf/wr resources */
-
- M_READ_FROM_DONE = 1 << 16, /* m_wr mpxyd read_from_done, ready for posting */
-};
-
/*
* MPXYD shared proxy buffer management, work completion
* Required for out of order completions across multiple QP's
uint32_t done;
} mcm_buf_wc_t;
-/* 80 bytes */
-typedef struct mcm_sr {
- uint64_t wr_id; /* from consumer post_recv */
- uint32_t len; /* total len */
- uint32_t num_sge; /* number of sglist entries, max 4 */
- uint32_t m_idx; /* proxy buffer, src */
- uint32_t w_idx; /* wr_rx WR idx, data xfer in process */
- uint32_t s_idx; /* my idx, sr_tl update */
- struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; /* consumer buffer on MIC, off_t */
-} mcm_sr_t;
-
-/* 128 bytes */
-typedef struct mcm_wr {
- struct ibv_send_wr wr;
- struct ibv_sge sg[DAT_MIX_SGE_MAX];
- uint64_t org_id;
- uint64_t context;
- uint32_t m_idx;
- uint32_t w_idx;
- uint32_t flags;
-} mcm_wr_t;
-
-/* DAT_MCM_PROXY_DATA private data max (40 bytes), proxy-in WR and WC exchange */
-typedef struct mcm_wrc_pdata {
- uint64_t wr_addr;
- uint32_t wr_key;
- uint16_t wr_size;
- uint16_t wr_depth;
- uint64_t wc_addr;
- uint32_t wc_key;
- uint16_t wc_size;
- uint16_t wc_depth;
- uint8_t rsvd[8];
-} __attribute__((packed)) mcm_wrc_pdata_t;
-
-/* 160 bytes, direct RDMA write from remote Proxy-in service */
-typedef struct mcm_wr_rx {
- struct dat_mix_wr wr;
- struct dat_mix_sge sg[DAT_MIX_SGE_MAX];
- uint64_t org_id;
- uint64_t context;
- uint32_t m_idx;
- uint32_t w_idx;
- uint32_t s_idx;
- uint32_t flags;
- uint32_t time;
- uint32_t qcnt;
-} __attribute__((packed)) mcm_wr_rx_t;
-
-/* 80 bytes, direct RDMA write from remote Proxy-in service */
-typedef struct mcm_wc_rx {
- struct dat_mix_wc wc;
- uint64_t org_id;
- uint64_t context;
- uint32_t wr_idx; /* proxy-out, proxy-in WR idx */
- uint32_t wr_tl; /* proxy-in WR tl update */
- uint32_t flags;
- uint8_t rsv[6];
-} __attribute__((packed)) mcm_wc_rx_t;
-
/* performance profiling */
enum mcm_prof_type
{
int post_sig_cnt;
int comp_cnt;
/* Proxy-in: WR management, remote view from TX side */
- uint64_t wr_addr_rem; /* remote IB address, CM reply */
- uint32_t wr_rkey_rem; /* remote IB key, CM reply */
- int wr_end_rem; /* work request pool end */
- int wr_len_rem; /* work request pool size */
+ mcm_wrc_info_t wrc_rem; /* WR and WC buffers: remote, in CM req and reply */
int wr_pp_rem; /* work request pending */
int wr_sz_rem; /* work request entry size, 64 byte aligned */
int wc_tl; /* WC tail update, back to proxy_in via wr_rx writes */
/* Proxy-in: WC management, remote view from RX side */
- uint64_t wc_addr_rem; /* remote IB address, CM request */
- uint32_t wc_rkey_rem; /* remote IB key, CM request */
int wc_hd_rem; /* work completion pool head */
int wc_tl_rem; /* work completion pool tail */
- int wc_end_rem; /* work completion pool end */
- int wc_len_rem; /* work completion pool size */
int wc_sz_rem; /* work request entry size, 64 byte aligned */
/* Proxy-in: WR and WC buffer resources, local on RX side */
- int wr_rbuf_len; /* RX WR buffer pool size */
- int wr_rbuf_sz; /* RX WR entry size */
- int wr_rbuf_end; /* RX WR pool end */
- char *wr_rbuf; /* RX WR entries, proxy-in RR, scif_sento: written from proxy-out */
+ mcm_wrc_info_t wrc; /* WR and WC buffers: local, addr, key, len, end */
off_t wr_off_r; /* SCIF registered, for scif_fence_signal @ wr->wr_id */
struct ibv_mr *wr_rbuf_mr; /* IB WR - MR address and key */
int wr_hd_r; /* RX side, WR pool head */
int wr_tl_r; /* RX side, WR pool tail */
int wr_tl_r_wt; /* RX side, WR pool tail, writeto pending tail */
- int wc_rbuf_len; /* RX WC buffer pool size */
- int wc_rbuf_end; /* RX WC buffer pool end */
- int wc_rbuf_sz; /* RX WC entry size */
- char *wc_rbuf; /* RX WC entries, proxy-in RR completions: written from proxy-in */
struct ibv_mr *wc_rbuf_mr; /* RX WC - IB MR address and key */
int post_cnt_rr; /* RX WR - total RR posted count */
int pi_rw_cnt; /* Proxy-in pending, RW_imm for WC's */
p_port[port] = 0;
}
-static inline void mcm_init_wrc(struct mcm_cm *m_cm)
-{
- mcm_qp_t *m_qp = m_cm->m_qp;
- mcm_wrc_pdata_t wrc;
-
- if (!m_qp)
- return;
-
- if (m_qp->wr_rbuf_mr) {
- mlog(2, "WR: addr %p key %x sz %d cnt %d\n",
- m_qp->wr_rbuf_mr->addr, m_qp->wr_rbuf_mr->rkey,
- m_qp->wr_rbuf_sz, m_qp->wr_rbuf_end);
-
- wrc.wr_addr = htonll((uint64_t)(uintptr_t)m_qp->wr_rbuf_mr->addr);
- wrc.wr_key = htonl(m_qp->wr_rbuf_mr->rkey);
- wrc.wr_size = htons(m_qp->wr_rbuf_sz);
- wrc.wr_depth = htons(m_qp->wr_rbuf_end);
- }
- if (m_qp->wc_rbuf_mr) {
- mlog(2, "WC: addr %p key %x sz %d cnt %d\n",
- m_qp->wc_rbuf_mr->addr, m_qp->wc_rbuf_mr->rkey,
- m_qp->wc_rbuf_sz, m_qp->wc_rbuf_end);
-
- wrc.wc_addr = htonll((uint64_t)(uintptr_t)m_qp->wc_rbuf_mr->addr);
- wrc.wc_key = htonl(m_qp->wc_rbuf_mr->rkey);
- wrc.wc_size = htons(m_qp->wc_rbuf_sz);
- wrc.wc_depth = htons(m_qp->wc_rbuf_end);
- }
-
- memcpy(m_cm->msg.p_proxy, &wrc, sizeof(wrc));
-}
-
-static inline void mcm_save_wrc(struct mcm_cm *m_cm)
-{
- mcm_qp_t *m_qp = m_cm->m_qp;
- mcm_wrc_pdata_t *wrc = (mcm_wrc_pdata_t *)m_cm->msg.p_proxy;
-
- if (!m_qp)
- return;
-
- m_qp->wr_addr_rem = ntohll(wrc->wr_addr);
- m_qp->wr_rkey_rem = ntohl(wrc->wr_key);
- m_qp->wr_sz_rem = ntohs(wrc->wr_size);
- m_qp->wr_end_rem = ntohs(wrc->wr_depth);
-
- mlog(2, "WR: addr %p key %x sz %d cnt %d\n",
- m_qp->wr_addr_rem, m_qp->wr_rkey_rem,
- m_qp->wr_sz_rem, m_qp->wr_end_rem);
-
- m_qp->wc_addr_rem = ntohll(wrc->wc_addr);
- m_qp->wc_rkey_rem = ntohl(wrc->wc_key);
- m_qp->wc_sz_rem = ntohs(wrc->wc_size);
- m_qp->wc_end_rem = ntohs(wrc->wc_depth);
-
- mlog(2, "WC: addr %p key %x sz %d cnt %d\n",
- m_qp->wc_addr_rem, m_qp->wc_rkey_rem,
- m_qp->wc_sz_rem, m_qp->wc_end_rem);
-}
static inline int scif_send_msg(scif_epd_t ep, void *msg, int len)
{
iwr->wr.rdma.rkey = sg[0].lkey;
}
-/*
- * construct a rx_wr in network order to send to remote proxy-in service
- * NOTE: network order might be overkill, can we assume all x86_64 platforms?
- */
-static inline void const_rx_wr(struct mcm_wr_rx *m_wr_rx, struct mcm_wr *m_wr, struct mcm_qp *m_qp)
-{
- memset((void*)m_wr_rx, 0, sizeof(*m_wr_rx));
- m_wr_rx->org_id = (uint64_t) htonll((uint64_t)m_wr); /* proxy_out WR */
- m_wr_rx->flags = htonl(m_wr->flags);
- m_wr_rx->w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
- m_wr_rx->wr.opcode = ntohl(m_wr->wr.opcode);
- m_wr_rx->wr.send_flags = ntohl(m_wr->wr.send_flags);
- m_wr_rx->wr.imm_data = htonl(m_wr->wr.imm_data);
- m_wr_rx->wr.wr.rdma.remote_addr = htonll(m_wr->wr.wr.rdma.remote_addr); /* final dst on MIC */
- m_wr_rx->wr.wr.rdma.rkey = htonl(m_wr->wr.wr.rdma.rkey);
- m_wr_rx->sg[0].addr = htonll(m_wr->sg[0].addr); /* proxy-out buffer */
- m_wr_rx->sg[0].lkey = htonl(m_wr->sg[0].lkey);
- m_wr_rx->sg[0].length = htonl(m_wr->sg[0].length);
-}
-
-/* convert rx wr, arrived across fabric from remote proxy-out service in network order */
-static inline void ntoh_rx_wr(struct mcm_wr_rx *m_wr_rx)
-{
- m_wr_rx->org_id = ntohll(m_wr_rx->org_id); /* proxy_out WR */
- m_wr_rx->flags = ntohl(m_wr_rx->flags);
- m_wr_rx->w_idx = ntohl(m_wr_rx->w_idx); /* WC tail update from proxy_out */
- m_wr_rx->wr.opcode = ntohl(m_wr_rx->wr.opcode);
- m_wr_rx->wr.send_flags = ntohl(m_wr_rx->wr.send_flags);
- m_wr_rx->wr.imm_data = ntohl(m_wr_rx->wr.imm_data);
- m_wr_rx->wr.wr.rdma.remote_addr = ntohll(m_wr_rx->wr.wr.rdma.remote_addr); /* final dest on MIC */
- m_wr_rx->wr.wr.rdma.rkey = ntohl(m_wr_rx->wr.wr.rdma.rkey);
- m_wr_rx->sg[0].addr = ntohll(m_wr_rx->sg[0].addr); /* proxy-out buffer segment, ibv */
- m_wr_rx->sg[0].lkey = ntohl(m_wr_rx->sg[0].lkey);
- m_wr_rx->sg[0].length = ntohl(m_wr_rx->sg[0].length);
- /* sg[1] == proxy-in buffer segment, ibv */
- /* sg[2] == proxy-in scif sendto src segment, scif offset */
- /* sg[3] == proxy-in scif sendto dst segment, scif offset */
-}
-
-/*
- * Construct a rx_wc in network order to send to remote proxy-in service
- * NOTE: network order might be overkill, can we assume all x86_64 platforms?
- */
-static inline void const_rx_wc(struct mcm_wc_rx *m_wc_rx, struct mcm_wr_rx *m_wr_rx, int wr_tl, int status)
-{
- memset((void*)m_wc_rx, 0, sizeof(*m_wc_rx));
- m_wc_rx->wr_idx = htonl(m_wr_rx->w_idx); /* proxy-in WR idx == proxy-out WR idx */
- m_wc_rx->wr_tl = htonl(wr_tl); /* proxy-in WR tail update, moves slower than proxy-out */
- m_wc_rx->flags = htonl(m_wr_rx->flags);
- m_wc_rx->wc.wr_id = htonll(m_wr_rx->org_id);
- m_wc_rx->wc.status = htonl(status);
- m_wc_rx->wc.byte_len = htonl(m_wr_rx->sg[0].length);
- if (m_wr_rx->wr.send_flags & IBV_WR_RDMA_WRITE)
- m_wc_rx->wc.opcode = htonl(IBV_WC_RDMA_WRITE);
- else
- m_wc_rx->wc.opcode = htonl(IBV_WC_SEND);
-}
-
-/* convert rx wc, arrived across fabric from remote proxy-in service in network order */
-static inline void ntoh_rx_wc(struct mcm_wc_rx *m_wc_rx)
-{
- m_wc_rx->wr_idx = ntohl(m_wc_rx->wr_idx);
- m_wc_rx->wr_tl = ntohl(m_wc_rx->wr_tl);
- m_wc_rx->flags = ntohl(m_wc_rx->flags);
- m_wc_rx->wc.wr_id = ntohll(m_wc_rx->wc.wr_id);
- m_wc_rx->wc.status = ntohl(m_wc_rx->wc.status);
- m_wc_rx->wc.byte_len = ntohl(m_wc_rx->wc.byte_len);
- m_wc_rx->wc.opcode = ntohl(m_wc_rx->wc.opcode);
-}
static inline void mcm_pr_addrs(int lvl, struct dat_mcm_msg *msg, int state, int in)
{
extern int scif_listen_qlen;
extern int mix_buffer_mb;
extern int mix_buffer_sg;
+extern int mix_buffer_sg_po2;
extern int mcm_affinity;
extern int mcm_affinity_base_mic;
extern int mcm_affinity_base_hca;
return f;
}
-
void mpxy_set_options( int debug_mode )
{
FILE *f;
else if (!strcasecmp("buffer_pool_mb", opt))
mix_buffer_mb = atoi(value);
else if (!strcasecmp("buffer_segment_size", opt))
- mix_buffer_sg = atoi(value);
+ {
+ int i = 0, ssize = atoi(value); /* power of 2 */
+ mix_buffer_sg = 1;
+ while ((mix_buffer_sg < ssize) &&
+ (mix_buffer_sg < DAT_MIX_RDMA_MAX)) {
+ mix_buffer_sg <<= 1;
+ i++;
+ }
+ mix_buffer_sg_po2 = i;
+ }
else if (!strcasecmp("buffer_alignment", opt))
mix_align = atoi(value);
else if (!strcasecmp("buffer_inline_threshold", opt))
{
DAPL_EVD *evd_ptr;
DAT_RETURN dat_status;
- DAT_EVENT *local_event;
+ DAT_EVENT *local_event = NULL;
DAT_BOOLEAN notify_requested = DAT_FALSE;
DAT_BOOLEAN waitable;
DAPL_EVD_STATE evd_state;
} else {
dat_status = dapl_os_wait_object_wait(&evd_ptr->wait_object, time_out);
}
-
+ dapl_dbg_log(DAPL_DBG_TYPE_EVD, "dapl_evd_wait () wake\n");
dapl_os_lock(&evd_ptr->header.lock);
/*
*event = *local_event;
dapls_rbuf_add(&evd_ptr->free_event_queue, local_event);
}
+ dapl_dbg_log(DAPL_DBG_TYPE_EVD, "dapl_evd_wait() return EVENT %p=0x%x\n",
+ local_event, local_event ? local_event->event_number:0);
/*
* Valid if dat_status == DAT_SUCCESS || dat_status == DAT_TIMEOUT
}
/* ep_map: mappings hint: node type and locality to device */
-#define HOST_SSOCK_DEV 1 /* host core to HCA, same socket */
-#define HOST_XSOCK_DEV 2 /* host core to HCA, cross socket */
-#define MIC_SSOCK_DEV 3 /* MIC to HCA, same socket */
-#define MIC_XSOCK_DEV 4 /* MIC to HCA, cross socket */
+#define HOST_SOCK_DEV 1 /* host to HCA, any socket */
+#define MIC_SSOCK_DEV 2 /* MIC to HCA, same socket */
+#define MIC_XSOCK_DEV 3 /* MIC to HCA, cross socket */
#define UND_EP(x) ((x)->ep_map < 1 || (x)->ep_map > 4)
-#define HSS_EP(x) ((x)->ep_map == HOST_SSOCK_DEV)
-#define HXS_EP(x) ((x)->ep_map == HOST_XSOCK_DEV)
+#define HST_EP(x) ((x)->ep_map == HOST_SOCK_DEV)
#define MXS_EP(x) ((x)->ep_map == MIC_XSOCK_DEV)
#define MSS_EP(x) ((x)->ep_map == MIC_SSOCK_DEV)
{
static char *map[] = {
"",
- "HSS",
- "HXS",
+ "HST",
"MSS",
"MXS",
};
- return ((ep_map < 1 || ep_map > 4) ? "???" : map[ep_map]);
+ return ((ep_map < 1 || ep_map > 3) ? "???" : map[ep_map]);
}
/* MCM address, 28 bytes */
region,
4096,
h_pz,
- DAT_MEM_PRIV_LOCAL_WRITE_FLAG,
+ DAT_MEM_PRIV_ALL_FLAG,
DAT_VA_TYPE_VA,
&h_lmr_send_msg,
&lmr_context_send_msg,
region,
4096,
h_pz,
- DAT_MEM_PRIV_LOCAL_WRITE_FLAG,
+ DAT_MEM_PRIV_ALL_FLAG,
DAT_VA_TYPE_VA,
&h_lmr_recv_msg,
&lmr_context_recv_msg,