From 1a4488be3e29360f3316ffb949aff55e512aeac9 Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Wed, 2 Jul 2014 13:37:18 -0700 Subject: [PATCH] mcm/mpxyd: cleanup ahead of master branch merge combine mpxy.h and dat_mic_extensions.h into dapl_mic_common.h since the MIC message and cm protocols are internal only and are not an exposed extension. update copyright dates. Signed-off-by: Arlin Davis --- Makefile.am | 6 +- .../openib_common/dapl_mic_common.h | 338 +++++++++++++++++- dapl/openib_mcm/cm.c | 2 +- dapl/openib_mcm/dapl_ib_util.h | 3 +- dapl/openib_mcm/device.c | 2 +- dapl/openib_mcm/mix.c | 2 +- dapl/openib_mcm/proxy.c | 2 +- dapl/svc/mcm.c | 2 +- dapl/svc/mix.c | 2 +- dapl/svc/mpxy.h | 336 ----------------- dapl/svc/mpxy_in.c | 2 +- dapl/svc/mpxy_out.c | 2 +- dapl/svc/mpxyd.c | 2 +- dapl/svc/mpxyd.h | 5 +- dapl/svc/util.c | 2 +- 15 files changed, 338 insertions(+), 370 deletions(-) rename dat/include/dat2/dat_mic_extensions.h => dapl/openib_common/dapl_mic_common.h (57%) delete mode 100644 dapl/svc/mpxy.h diff --git a/Makefile.am b/Makefile.am index 31b0130..aee7971 100755 --- a/Makefile.am +++ b/Makefile.am @@ -21,7 +21,7 @@ if EXT_TYPE_IB XFLAGS = -DDAT_EXTENSIONS XPROGRAMS = dapl/openib_common/ib_extensions.c XHEADERS = -XDAT = dat/include/dat2/dat_ib_extensions.h dat/include/dat2/dat_mic_extensions.h +XDAT = dat/include/dat2/dat_ib_extensions.h XLIBS = if DEFINE_COUNTERS XFLAGS += -DDAPL_COUNTERS @@ -699,6 +699,7 @@ EXTRA_DIST = dat/common/dat_dictionary.h \ dapl/include/dapl_vendor.h \ dapl/openib_common/dapl_ib_dto.h \ dapl/openib_common/dapl_ib_common.h \ + dapl/openib_common/dapl_mic_common.h \ dapl/openib_common/collectives/ib_collectives.h \ dapl/openib_common/collectives/fca_provider.h \ dapl/openib_cma/dapl_ib_util.h \ @@ -709,7 +710,6 @@ EXTRA_DIST = dat/common/dat_dictionary.h \ dapl/openib_ucm/linux/openib_osd.h \ dapl/openib_mcm/dapl_ib_util.h \ dapl/openib_mcm/linux/openib_osd.h \ - dapl/svc/mpxy.h \ dapl/svc/mpxyd.h \ dat/udat/libdat2.map \ 
dapl/udapl/libdaplofa.map \ @@ -763,7 +763,7 @@ svc_mpxyd_SOURCES = dapl/svc/mpxyd.c \ dapl/svc/mcm.c \ dapl/svc/mpxy_out.c \ dapl/svc/mpxy_in.c -svc_mpxyd_CFLAGS = $(AM_CFLAGS) -lscif -libverbs -lpthread -Idat/include/ +svc_mpxyd_CFLAGS = $(AM_CFLAGS) -lscif -libverbs -lpthread -Idat/include -I$(srcdir)/dapl/openib_common sysconf_DATA += doc/mpxyd.conf install-exec-hook: diff --git a/dat/include/dat2/dat_mic_extensions.h b/dapl/openib_common/dapl_mic_common.h similarity index 57% rename from dat/include/dat2/dat_mic_extensions.h rename to dapl/openib_common/dapl_mic_common.h index 2451c91..dc61eb5 100755 --- a/dat/include/dat2/dat_mic_extensions.h +++ b/dapl/openib_common/dapl_mic_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012-2014 Intel Corporation. All rights reserved. * * This Software is licensed under one of the following licenses: * @@ -30,29 +30,28 @@ /********************************************************************** * - * HEADER: dat_mic_extensions.h + * HEADER: dapl_mic_common.h * - * PURPOSE: extensions to the DAT API for MIC Proxy RDMA services + * PURPOSE: Definitions for MIC Proxy RDMA services * + * MCM provider <-> MPXYD service * - * This extension/service enables MIC based DAPL providers to use a - * proxy service for sends and RDMA write operations. RDMA reads and - * receives are NOT supported. This service + * This service enables MIC based DAPL provider (MCM) to use + * proxy service (host CPU) for sends and RDMA write operations. + * proxy RDMA reads are not supported. This service * communicates within a server platform over PCI-E bus using SCIF * and a new MIX within messaging protocol. The MCM provider uses - * DAPL CM messaging protocols on the wire. MIX protocol is defined - * as part of the new MIC extensions. + * DAPL MCM messaging protocols on the wire. 
MIX protocol is defined + * as part of the communication protocol between MCM provider on MIC + * and the MPXYD service on the host CPU. * ***********************************************************************/ -#ifndef _DAT_MIC_EXTENSIONS_H_ -#define _DAT_MIC_EXTENSIONS_H_ +#ifndef _DAPL_MIC_COMMON_H_ +#define _DAPL_MIC_COMMON_H_ #include #include -#define DAT_MIC_EXTENSION_VERSION 1 -#define DAT_MIC_ATTR_MIC "DAT_MIC_SUPPORT" - /***** MIC Indirect CM (MCM) protocol over IB fabrics *****/ #define DAT_MCM_VER 1 #define DAT_MCM_UD_QKEY 0x78655322 @@ -67,7 +66,6 @@ #define ALIGN_UP_PPAGE(o) ((((uintptr_t)o) + 4096 - 1)& ~(4096-1)) #define ALIGN_DOWN_PPAGE(o) ((((uintptr_t)o)) & ~(4096-1)) - static inline char * mcm_qp_state_str(IN int st) { static char *qp_state[] = { @@ -274,7 +272,6 @@ typedef struct dat_mcm_msg_compat * v5 - CM services with proxy_in, private data */ #define DAT_MIX_VER 5 -#define DAT_MIX_MSG_MAX 256 #define DAT_MIX_INLINE_MAX 256 #define DAT_MIX_RDMA_MAX (8*1024*1024) #define DAT_MIX_WR_MAX 500 @@ -654,4 +651,313 @@ typedef struct dat_mix_sr } __attribute__((packed)) dat_mix_sr_t; -#endif /* _DAT_MIC_EXTENSIONS_H_ */ +typedef union dat_mix_msg +{ + dat_mix_open_t op; + dat_mix_dev_attr_t dev; + dat_mix_prov_attr_t prv; + dat_mix_mr_t mr; + dat_mix_listen_t ls; + dat_mix_qp_t qp; + dat_mix_cq_t cq; + dat_mix_cm_t cm; + dat_mix_cm_compat_t cm_comp; + dat_mix_wc_t wc; + dat_mix_wr_t wr; + dat_mix_dto_comp_t dto; + dat_mix_sr_t sr; + +} DAT_MIX_MSG; + +#define DAT_MIX_MSG_MAX sizeof(DAT_MIX_MSG) + +/* + * MCM to MPXYD: work request and completion definitions + * + * Messaging Protocol between MCM Proxy-out and Proxy-in service agents + * - WR and WC management vi IB RDMA write_imm and RDMA reads + * - WR and WC written directly from remote proxy peer agent, + * - Proxy-in buffer management on receive side, IB RR + * - Proxy-out buffer management on send side + * IB RW directly to user buffer if peer is MIC same socket + * IB RW_imm to PI WR, PI RR, 
scif_writeto if MIC is remote socket + * + */ +#if __BYTE_ORDER == __BIG_ENDIAN +#define htonll(x) (x) +#define ntohll(x) (x) +#elif __BYTE_ORDER == __LITTLE_ENDIAN +#define htonll(x) bswap_64(x) +#define ntohll(x) bswap_64(x) +#endif + +/* WRC (work request/completion) imm_data definition, qdepth limits of 16 bits */ +#define WRC_MAX_QLEN 1 << 16; +#define MCM_WRC_QLEN 512 + +/* data types, WR or WC */ +#define M_WR_TYPE 1 +#define M_WC_TYPE 2 + +/* WR flags */ +#define M_WR_FS 1 +#define M_WR_LS 2 + +#define WRC_ID_DATA(x) ((x) & 0x0000ffff) +#define WRC_TYPE_DATA(x) (((x) >> 16) & 0x000000ff) +#define WRC_FLAGS_DATA(x) (((x) >> 24) & 0x000000ff) + +/* wr aligned on 64 bytes, use 4 lower bits for type id */ +#define WRID_TX_RW 0x1 /* proxy out, m_wr type, RW */ +#define WRID_TX_RW_IMM 0x2 /* proxy out, m_wr type, RW_imm op */ +#define WRID_RX_RR 0x3 /* proxy in, m_wr_rx type, RR op */ +#define WRID_RX_RW_IMM 0x4 /* proxy in, m_wr_rx type, RW_immed op */ +#define WRID_MASK 0xfffffffffffffff0 +#define WRID_SET(x,y) (((uint64_t)(x) | (uint64_t)(y))) +#define WRID_TYPE(x) ((x & ~WRID_MASK)) +#define WRID_ADDR(x) ((x & WRID_MASK)) + +typedef struct wrc_idata { + + uint16_t id; /* work request or completion slot */ + uint8_t type; /* data types, WR, WC, etc */ + uint8_t flags; /* flags */ + +} __attribute__((packed)) wrc_idata_t; + +enum mcm_wr_flags { + M_SEND_POSTED = 1 << 0, /* m_wr already posted */ + M_SEND_CN_SIG = 1 << 1, /* m_wr consumer signaled, IB completion */ + M_SEND_CN_EAGER_SIG = 1 << 2, /* m_wr consumer eager signaled, SCIF read completion */ + M_SEND_MP_SIG = 1 << 3, /* m_wr mpxyd signaled, segmentation, manage proxy buf/wr resources */ + + M_SEND_FS = 1 << 4, /* m_wr - first segment */ + M_SEND_LS = 1 << 5, /* m_wr - last segment */ + M_SEND_PI = 1 << 6, /* m_wr - forwarded to proxy in service */ + M_SEND_INLINE = 1 << 7, /* m_wr - data in cmd msg, no scif_readfrom */ + + M_READ_PAUSED = 1 << 8, /* m_wr_rx waiting for proxy buffer */ + M_RECV_PAUSED 
= 1 << 9, /* m_wr_rx waiting for posted rcv message */ + M_READ_POSTED = 1 << 10, /* m_wr_rx ibv posted */ + M_READ_DONE = 1 << 11, /* m_wr_rx ibv completed */ + + M_READ_WRITE_TO = 1 << 12, /* m_wr_rx read data forwarded to MIC scif_writeto */ + M_READ_WRITE_TO_DONE = 1 << 13, /* m_wr_rx read data forwarded to MIC scif_writeto */ + M_READ_CN_SIG = 1 << 14, /* m_wr_rx consumer signaled, IB completion needed */ + M_READ_MP_SIG = 1 << 15, /* m_wr_rx mpxyd signaled, segmentation, manage proxy buf/wr resources */ + + M_READ_FROM_DONE = 1 << 16, /* m_wr mpxyd read_from_done, ready for posting */ + M_SEND_DIRECT = 1 << 17, /* m_wr SEND direct from host memory, no proxy out buffer */ +}; + +/* 80 bytes */ +typedef struct mcm_sr { + uint64_t wr_id; /* from consumer post_recv */ + uint32_t len; /* total len */ + uint32_t num_sge; /* number of sglist entries, max 4 */ + uint32_t m_idx; /* proxy buffer, src */ + uint32_t w_idx; /* wr_rx WR idx, data xfer in process */ + uint32_t s_idx; /* my idx, sr_tl update */ + struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; /* consumer buffer on MIC, off_t */ +} mcm_sr_t; + +/* 128 bytes */ +typedef struct mcm_wr { + struct ibv_send_wr wr; + struct ibv_sge sg[DAT_MIX_SGE_MAX]; + uint64_t org_id; + uint64_t context; + uint32_t m_idx; + uint32_t w_idx; + uint32_t flags; +} mcm_wr_t; + +/* DAT_MCM_PROXY_DATA private data max (40 bytes), Proxy-in WR and WC info exchange */ +typedef struct mcm_wrc_info { + uint64_t wr_addr; + uint32_t wr_rkey; + uint32_t wr_len; + uint16_t wr_sz; + uint16_t wr_end; + uint64_t wc_addr; + uint32_t wc_rkey; + uint32_t wc_len; + uint16_t wc_sz; + uint16_t wc_end; +} __attribute__((packed)) mcm_wrc_info_t; + +/* WR: 160 bytes, direct RDMA write from remote Proxy-in service */ +typedef struct mcm_wr_rx { + struct dat_mix_wr wr; + struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; + uint64_t org_id; + uint64_t context; + uint32_t m_idx; + uint32_t w_idx; + uint32_t s_idx; + uint32_t flags; + uint32_t time; + uint32_t qcnt; +} 
__attribute__((packed)) mcm_wr_rx_t; + +/* WC: 80 bytes, direct RDMA write from remote Proxy-in service */ +typedef struct mcm_wc_rx { + struct dat_mix_wc wc; + uint64_t org_id; + uint64_t context; + uint32_t wr_idx; /* proxy-out, proxy-in WR idx */ + uint32_t wr_tl; /* proxy-in WR tl update */ + uint32_t flags; + uint8_t rsv[6]; +} __attribute__((packed)) mcm_wc_rx_t; + +/* Helper functions */ + +/* construct WRC info to msg->p_proxy, network order, during outbound CM request or reply */ +static inline void mcm_hton_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src) +{ + if (src->wr_addr) { + dst->wr_addr = htonll(src->wr_addr); + dst->wr_rkey = htonl(src->wr_rkey); + dst->wr_len = htons(src->wr_len); + dst->wr_sz = htons(src->wr_sz); + dst->wr_end = htons(src->wr_end); + } + if (src->wc_addr) { + dst->wc_addr = htonll(src->wc_addr); + dst->wc_rkey = htonl(src->wc_rkey); + dst->wc_len = htons(src->wc_len); + dst->wc_sz = htons(src->wc_sz); + dst->wc_end = htons(src->wc_end); + } +} + +/* get WRC info from msg->p_proxy, network order, during inbound CM request or reply */ +static inline void mcm_ntoh_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src) +{ + dst->wr_addr = ntohll(src->wr_addr); + dst->wr_rkey = ntohl(src->wr_rkey); + dst->wr_len = ntohs(src->wr_len); + dst->wr_sz = ntohs(src->wr_sz); + dst->wr_end = ntohs(src->wr_end); + + dst->wc_addr = ntohll(src->wc_addr); + dst->wc_rkey = ntohl(src->wc_rkey); + dst->wc_len = ntohs(src->wc_len); + dst->wc_sz = ntohs(src->wc_sz); + dst->wc_end = ntohs(src->wc_end); +} + +/* construct rx_wr, network order, to send to remote proxy-in service */ +static inline void mcm_hton_wr_rx(struct mcm_wr_rx *m_wr_rx, struct mcm_wr *m_wr, int wc_tl) +{ + int i; + + memset((void*)m_wr_rx, 0, sizeof(*m_wr_rx)); + m_wr_rx->org_id = (uint64_t) htonll((uint64_t)m_wr); /* proxy_out WR */ + m_wr_rx->flags = htonl(m_wr->flags); + m_wr_rx->w_idx = htonl(wc_tl); /* snd back wc tail */ + m_wr_rx->wr.num_sge = htonl(m_wr->wr.num_sge); + 
m_wr_rx->wr.opcode = htonl(m_wr->wr.opcode); + m_wr_rx->wr.send_flags = htonl(m_wr->wr.send_flags); + m_wr_rx->wr.imm_data = htonl(m_wr->wr.imm_data); + m_wr_rx->wr.wr.rdma.remote_addr = htonll(m_wr->wr.wr.rdma.remote_addr); /* final dst on MIC */ + m_wr_rx->wr.wr.rdma.rkey = htonl(m_wr->wr.wr.rdma.rkey); + + for (i=0;iwr.num_sge;i++) { + m_wr_rx->sg[i].addr = htonll(m_wr->sg[i].addr); /* proxy-out buffer */ + m_wr_rx->sg[i].lkey = htonl(m_wr->sg[i].lkey); + m_wr_rx->sg[i].length = htonl(m_wr->sg[i].length); + } +} + +/* convert rx wr, arrived across fabric from remote proxy-out service in network order */ +static inline void mcm_ntoh_wr_rx(struct mcm_wr_rx *m_wr_rx) +{ + int i; + + m_wr_rx->org_id = ntohll(m_wr_rx->org_id); /* proxy_out WR */ + m_wr_rx->flags = ntohl(m_wr_rx->flags); + m_wr_rx->w_idx = ntohl(m_wr_rx->w_idx); /* WC tail update from proxy_out */ + m_wr_rx->wr.num_sge = ntohl(m_wr_rx->wr.num_sge); + m_wr_rx->wr.opcode = ntohl(m_wr_rx->wr.opcode); + m_wr_rx->wr.send_flags = ntohl(m_wr_rx->wr.send_flags); + m_wr_rx->wr.imm_data = ntohl(m_wr_rx->wr.imm_data); + m_wr_rx->wr.wr.rdma.remote_addr = ntohll(m_wr_rx->wr.wr.rdma.remote_addr); /* final dest on MIC */ + m_wr_rx->wr.wr.rdma.rkey = ntohl(m_wr_rx->wr.wr.rdma.rkey); + + for (i=0;iwr.num_sge;i++) { + m_wr_rx->sg[i].addr = ntohll(m_wr_rx->sg[i].addr); /* proxy-out buffer segment, ibv */ + m_wr_rx->sg[i].lkey = ntohl(m_wr_rx->sg[i].lkey); + m_wr_rx->sg[i].length = ntohl(m_wr_rx->sg[i].length); + } + + /* For HST->MXS sg[0-3] can be direct SRC segments for RR, all others will be 1 seg */ + /* sg[1] == proxy-in buffer segment, ibv */ + /* sg[2] == proxy-in scif sendto src segment, scif offset */ + /* sg[3] == proxy-in scif sendto dst segment, scif offset */ +} + +/* construct a rx_wc in network order to send to remote proxy-in service */ +static inline void mcm_hton_wc_rx(struct mcm_wc_rx *m_wc_rx, struct mcm_wr_rx *m_wr_rx, int wr_tl, int status) +{ + memset((void*)m_wc_rx, 0, sizeof(*m_wc_rx)); + 
m_wc_rx->wr_idx = htonl(m_wr_rx->w_idx); /* proxy-in WR idx == proxy-out WR idx */ + m_wc_rx->wr_tl = htonl(wr_tl); /* proxy-in WR tail update, moves slower than proxy-out */ + m_wc_rx->flags = htonl(m_wr_rx->flags); + m_wc_rx->wc.wr_id = htonll(m_wr_rx->org_id); + m_wc_rx->wc.status = htonl(status); + m_wc_rx->wc.byte_len = htonl(m_wr_rx->sg[0].length); + if (m_wr_rx->wr.send_flags & IBV_WR_RDMA_WRITE) + m_wc_rx->wc.opcode = htonl(IBV_WC_RDMA_WRITE); + else + m_wc_rx->wc.opcode = htonl(IBV_WC_SEND); +} + +/* convert rx wc, arrived across fabric from remote proxy-in service in network order */ +static inline void mcm_ntoh_wc_rx(struct mcm_wc_rx *m_wc_rx) +{ + m_wc_rx->wr_idx = ntohl(m_wc_rx->wr_idx); + m_wc_rx->wr_tl = ntohl(m_wc_rx->wr_tl); + m_wc_rx->flags = ntohl(m_wc_rx->flags); + m_wc_rx->wc.wr_id = ntohll(m_wc_rx->wc.wr_id); + m_wc_rx->wc.status = ntohl(m_wc_rx->wc.status); + m_wc_rx->wc.byte_len = ntohl(m_wc_rx->wc.byte_len); + m_wc_rx->wc.opcode = ntohl(m_wc_rx->wc.opcode); +} + +static inline void mcm_const_mix_wr(struct dat_mix_wr *mwr, struct ibv_send_wr *iwr) +{ + memset((void*)mwr, 0, sizeof(*mwr)); + mwr->wr_id = iwr->wr_id; + mwr->num_sge = iwr->num_sge; + mwr->opcode = iwr->opcode; + mwr->send_flags = iwr->send_flags; + mwr->imm_data = iwr->imm_data; + mwr->wr.rdma.remote_addr = iwr->wr.rdma.remote_addr; + mwr->wr.rdma.rkey = iwr->wr.rdma.rkey; +} + +static inline void mcm_const_ib_wc(struct ibv_wc *iwc, struct dat_mix_wc *mwc, int entries) +{ + int i; + + for (i=0;i -#include #include -#include "mpxy.h" #include "openib_osd.h" +#include "dapl_mic_common.h" #include "dapl_ib_common.h" #define MCM_RETRY_CNT 10 diff --git a/dapl/openib_mcm/device.c b/dapl/openib_mcm/device.c index 9531aba..efad029 100644 --- a/dapl/openib_mcm/device.c +++ b/dapl/openib_mcm/device.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Intel Corporation. All rights reserved. + * Copyright (c) 2009-2014 Intel Corporation. All rights reserved. 
* * This Software is licensed under one of the following licenses: * diff --git a/dapl/openib_mcm/mix.c b/dapl/openib_mcm/mix.c index d9051a4..df37b33 100644 --- a/dapl/openib_mcm/mix.c +++ b/dapl/openib_mcm/mix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Intel Corporation. All rights reserved. + * Copyright (c) 2009-2014 Intel Corporation. All rights reserved. * * This Software is licensed under one of the following licenses: * diff --git a/dapl/openib_mcm/proxy.c b/dapl/openib_mcm/proxy.c index 36c9b35..d20456c 100644 --- a/dapl/openib_mcm/proxy.c +++ b/dapl/openib_mcm/proxy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Intel Corporation. All rights reserved. + * Copyright (c) 2009-2014 Intel Corporation. All rights reserved. * * This Software is licensed under one of the following licenses: * diff --git a/dapl/svc/mcm.c b/dapl/svc/mcm.c index 14cb969..32ec0c9 100644 --- a/dapl/svc/mcm.c +++ b/dapl/svc/mcm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012-2014 Intel Corporation. All rights reserved. * * This software is available to you under the OpenIB.org BSD license * below: diff --git a/dapl/svc/mix.c b/dapl/svc/mix.c index d2043ab..46e036f 100644 --- a/dapl/svc/mix.c +++ b/dapl/svc/mix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012-2014 Intel Corporation. All rights reserved. * * This software is available to you under the OpenIB.org BSD license * below: diff --git a/dapl/svc/mpxy.h b/dapl/svc/mpxy.h deleted file mode 100644 index fcba7b6..0000000 --- a/dapl/svc/mpxy.h +++ /dev/null @@ -1,336 +0,0 @@ -/* - * Copyright (c) 2012 Intel Corporation. All rights reserved. 
- * - * This software is available to you under the OpenIB.org BSD license - * below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AWV - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -/* - * MIC Proxy Data Service Definitions - used by MCM provider and MPXYD service - * - * Communication Protocol between MCM Proxy-out and Proxy-in service agents - * - WR and WC management vi IB RDMA write_imm and RDMA reads - * - WR and WC written directly from remote proxy peer agent, - * - Proxy-in buffer management on receive side, IB RR - * - Proxy-out buffer management on send side - * IB RW directly to user buffer if peer is MIC same socket - * IB RW_imm to PI WR, PI RR, scif_writeto if MIC is remote socket - * - */ -#ifndef _MPXY_H_ -#define _MPXY_H_ - -#if __BYTE_ORDER == __BIG_ENDIAN -#define htonll(x) (x) -#define ntohll(x) (x) -#elif __BYTE_ORDER == __LITTLE_ENDIAN -#define htonll(x) bswap_64(x) -#define ntohll(x) bswap_64(x) -#endif - -/* WRC (work request/completion) imm_data definition, qdepth limits of 16 bits */ -#define WRC_MAX_QLEN 1 << 16; -#define MCM_WRC_QLEN 512 - -/* data types, WR or WC */ -#define M_WR_TYPE 1 -#define M_WC_TYPE 2 - -/* WR flags */ -#define M_WR_FS 1 -#define M_WR_LS 2 - -#define WRC_ID_DATA(x) ((x) & 0x0000ffff) -#define WRC_TYPE_DATA(x) (((x) >> 16) & 0x000000ff) -#define WRC_FLAGS_DATA(x) (((x) >> 24) & 0x000000ff) - -/* wr aligned on 64 bytes, use 4 lower bits for type id */ -#define WRID_TX_RW 0x1 /* proxy out, m_wr type, RW */ -#define WRID_TX_RW_IMM 0x2 /* proxy out, m_wr type, RW_imm op */ -#define WRID_RX_RR 0x3 /* proxy in, m_wr_rx type, RR op */ -#define WRID_RX_RW_IMM 0x4 /* proxy in, m_wr_rx type, RW_immed op */ -#define WRID_MASK 0xfffffffffffffff0 -#define WRID_SET(x,y) (((uint64_t)(x) | (uint64_t)(y))) -#define WRID_TYPE(x) ((x & ~WRID_MASK)) -#define WRID_ADDR(x) ((x & WRID_MASK)) - -typedef struct wrc_idata { - - uint16_t id; /* work request or completion slot */ - uint8_t type; /* data types, WR, WC, etc */ - uint8_t flags; /* flags */ - -} __attribute__((packed)) wrc_idata_t; - -enum mcm_wr_flags { - M_SEND_POSTED = 1 << 0, /* m_wr already posted */ - M_SEND_CN_SIG = 1 << 1, /* m_wr consumer 
signaled, IB completion */ - M_SEND_CN_EAGER_SIG = 1 << 2, /* m_wr consumer eager signaled, SCIF read completion */ - M_SEND_MP_SIG = 1 << 3, /* m_wr mpxyd signaled, segmentation, manage proxy buf/wr resources */ - - M_SEND_FS = 1 << 4, /* m_wr - first segment */ - M_SEND_LS = 1 << 5, /* m_wr - last segment */ - M_SEND_PI = 1 << 6, /* m_wr - forwarded to proxy in service */ - M_SEND_INLINE = 1 << 7, /* m_wr - data in cmd msg, no scif_readfrom */ - - M_READ_PAUSED = 1 << 8, /* m_wr_rx waiting for proxy buffer */ - M_RECV_PAUSED = 1 << 9, /* m_wr_rx waiting for posted rcv message */ - M_READ_POSTED = 1 << 10, /* m_wr_rx ibv posted */ - M_READ_DONE = 1 << 11, /* m_wr_rx ibv completed */ - - M_READ_WRITE_TO = 1 << 12, /* m_wr_rx read data forwarded to MIC scif_writeto */ - M_READ_WRITE_TO_DONE = 1 << 13, /* m_wr_rx read data forwarded to MIC scif_writeto */ - M_READ_CN_SIG = 1 << 14, /* m_wr_rx consumer signaled, IB completion needed */ - M_READ_MP_SIG = 1 << 15, /* m_wr_rx mpxyd signaled, segmentation, manage proxy buf/wr resources */ - - M_READ_FROM_DONE = 1 << 16, /* m_wr mpxyd read_from_done, ready for posting */ - M_SEND_DIRECT = 1 << 17, /* m_wr SEND direct from host memory, no proxy out buffer */ -}; - -/* 80 bytes */ -typedef struct mcm_sr { - uint64_t wr_id; /* from consumer post_recv */ - uint32_t len; /* total len */ - uint32_t num_sge; /* number of sglist entries, max 4 */ - uint32_t m_idx; /* proxy buffer, src */ - uint32_t w_idx; /* wr_rx WR idx, data xfer in process */ - uint32_t s_idx; /* my idx, sr_tl update */ - struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; /* consumer buffer on MIC, off_t */ -} mcm_sr_t; - -/* 128 bytes */ -typedef struct mcm_wr { - struct ibv_send_wr wr; - struct ibv_sge sg[DAT_MIX_SGE_MAX]; - uint64_t org_id; - uint64_t context; - uint32_t m_idx; - uint32_t w_idx; - uint32_t flags; -} mcm_wr_t; - -/* DAT_MCM_PROXY_DATA private data max (40 bytes), Proxy-in WR and WC info exchange */ -typedef struct mcm_wrc_info { - uint64_t wr_addr; - 
uint32_t wr_rkey; - uint32_t wr_len; - uint16_t wr_sz; - uint16_t wr_end; - uint64_t wc_addr; - uint32_t wc_rkey; - uint32_t wc_len; - uint16_t wc_sz; - uint16_t wc_end; -} __attribute__((packed)) mcm_wrc_info_t; - -/* WR: 160 bytes, direct RDMA write from remote Proxy-in service */ -typedef struct mcm_wr_rx { - struct dat_mix_wr wr; - struct dat_mix_sge sg[DAT_MIX_SGE_MAX]; - uint64_t org_id; - uint64_t context; - uint32_t m_idx; - uint32_t w_idx; - uint32_t s_idx; - uint32_t flags; - uint32_t time; - uint32_t qcnt; -} __attribute__((packed)) mcm_wr_rx_t; - -/* WC: 80 bytes, direct RDMA write from remote Proxy-in service */ -typedef struct mcm_wc_rx { - struct dat_mix_wc wc; - uint64_t org_id; - uint64_t context; - uint32_t wr_idx; /* proxy-out, proxy-in WR idx */ - uint32_t wr_tl; /* proxy-in WR tl update */ - uint32_t flags; - uint8_t rsv[6]; -} __attribute__((packed)) mcm_wc_rx_t; - -/* put WRC info to msg->p_proxy, network order, during outbound CM request or reply */ -static inline void mcm_hton_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src) -{ - if (src->wr_addr) { - dst->wr_addr = htonll(src->wr_addr); - dst->wr_rkey = htonl(src->wr_rkey); - dst->wr_len = htons(src->wr_len); - dst->wr_sz = htons(src->wr_sz); - dst->wr_end = htons(src->wr_end); - } - if (src->wc_addr) { - dst->wc_addr = htonll(src->wc_addr); - dst->wc_rkey = htonl(src->wc_rkey); - dst->wc_len = htons(src->wc_len); - dst->wc_sz = htons(src->wc_sz); - dst->wc_end = htons(src->wc_end); - } -} - -/* get WRC info from msg->p_proxy, network order, during inbound CM request or reply */ -static inline void mcm_ntoh_wrc(mcm_wrc_info_t *dst, mcm_wrc_info_t *src) -{ - dst->wr_addr = ntohll(src->wr_addr); - dst->wr_rkey = ntohl(src->wr_rkey); - dst->wr_len = ntohs(src->wr_len); - dst->wr_sz = ntohs(src->wr_sz); - dst->wr_end = ntohs(src->wr_end); - - dst->wc_addr = ntohll(src->wc_addr); - dst->wc_rkey = ntohl(src->wc_rkey); - dst->wc_len = ntohs(src->wc_len); - dst->wc_sz = ntohs(src->wc_sz); - 
dst->wc_end = ntohs(src->wc_end); -} - -/* construct a rx_wr in network order to send to remote proxy-in service */ -static inline void mcm_hton_wr_rx(struct mcm_wr_rx *m_wr_rx, struct mcm_wr *m_wr, int wc_tl) -{ - int i; - memset((void*)m_wr_rx, 0, sizeof(*m_wr_rx)); - m_wr_rx->org_id = (uint64_t) htonll((uint64_t)m_wr); /* proxy_out WR */ - m_wr_rx->flags = htonl(m_wr->flags); - m_wr_rx->w_idx = htonl(wc_tl); /* snd back wc tail */ - m_wr_rx->wr.num_sge = htonl(m_wr->wr.num_sge); - m_wr_rx->wr.opcode = htonl(m_wr->wr.opcode); - m_wr_rx->wr.send_flags = htonl(m_wr->wr.send_flags); - m_wr_rx->wr.imm_data = htonl(m_wr->wr.imm_data); - m_wr_rx->wr.wr.rdma.remote_addr = htonll(m_wr->wr.wr.rdma.remote_addr); /* final dst on MIC */ - m_wr_rx->wr.wr.rdma.rkey = htonl(m_wr->wr.wr.rdma.rkey); -#if 0 - printf(" hton_wr_rx: op %x num_sge %d, raddr %Lx rkey %x\n", - m_wr->wr.opcode, m_wr->wr.num_sge, - (long long unsigned int)m_wr->wr.wr.rdma.remote_addr, - m_wr->wr.wr.rdma.rkey); -#endif - for (i=0;iwr.num_sge;i++) { - m_wr_rx->sg[i].addr = htonll(m_wr->sg[i].addr); /* proxy-out buffer */ - m_wr_rx->sg[i].lkey = htonl(m_wr->sg[i].lkey); - m_wr_rx->sg[i].length = htonl(m_wr->sg[i].length); -#if 0 - printf(" hton_wr_rx: [%d] addr %Lx key %x len %d\n", - i, (long long unsigned int)m_wr->sg[i].addr, - m_wr->sg[i].lkey, m_wr->sg[i].length); -#endif - } -} - -/* convert rx wr, arrived across fabric from remote proxy-out service in network order */ -static inline void mcm_ntoh_wr_rx(struct mcm_wr_rx *m_wr_rx) -{ - int i; - m_wr_rx->org_id = ntohll(m_wr_rx->org_id); /* proxy_out WR */ - m_wr_rx->flags = ntohl(m_wr_rx->flags); - m_wr_rx->w_idx = ntohl(m_wr_rx->w_idx); /* WC tail update from proxy_out */ - m_wr_rx->wr.num_sge = ntohl(m_wr_rx->wr.num_sge); - m_wr_rx->wr.opcode = ntohl(m_wr_rx->wr.opcode); - m_wr_rx->wr.send_flags = ntohl(m_wr_rx->wr.send_flags); - m_wr_rx->wr.imm_data = ntohl(m_wr_rx->wr.imm_data); - m_wr_rx->wr.wr.rdma.remote_addr = 
ntohll(m_wr_rx->wr.wr.rdma.remote_addr); /* final dest on MIC */ - m_wr_rx->wr.wr.rdma.rkey = ntohl(m_wr_rx->wr.wr.rdma.rkey); -#if 0 - printf(" ntoh_wr_rx: op %x num_sge %d, raddr %Lx rkey %x\n", - m_wr_rx->wr.opcode, m_wr_rx->wr.num_sge, - (long long unsigned int)m_wr_rx->wr.wr.rdma.remote_addr, - m_wr_rx->wr.wr.rdma.rkey); -#endif - for (i=0;iwr.num_sge;i++) { - m_wr_rx->sg[i].addr = ntohll(m_wr_rx->sg[i].addr); /* proxy-out buffer segment, ibv */ - m_wr_rx->sg[i].lkey = ntohl(m_wr_rx->sg[i].lkey); - m_wr_rx->sg[i].length = ntohl(m_wr_rx->sg[i].length); -#if 0 - printf(" ntoh_wr_rx: [%d] addr %Lx key %x len %d\n", - i, (long long unsigned int)m_wr_rx->sg[i].addr, - m_wr_rx->sg[i].lkey, m_wr_rx->sg[i].length); -#endif - } - /* For HST->MXS sg[0-3] can be direct SRC segments for RR, all others will be 1 seg */ - /* sg[1] == proxy-in buffer segment, ibv */ - /* sg[2] == proxy-in scif sendto src segment, scif offset */ - /* sg[3] == proxy-in scif sendto dst segment, scif offset */ -} - -/* construct a rx_wc in network order to send to remote proxy-in service */ -static inline void mcm_hton_wc_rx(struct mcm_wc_rx *m_wc_rx, struct mcm_wr_rx *m_wr_rx, int wr_tl, int status) -{ - memset((void*)m_wc_rx, 0, sizeof(*m_wc_rx)); - m_wc_rx->wr_idx = htonl(m_wr_rx->w_idx); /* proxy-in WR idx == proxy-out WR idx */ - m_wc_rx->wr_tl = htonl(wr_tl); /* proxy-in WR tail update, moves slower than proxy-out */ - m_wc_rx->flags = htonl(m_wr_rx->flags); - m_wc_rx->wc.wr_id = htonll(m_wr_rx->org_id); - m_wc_rx->wc.status = htonl(status); - m_wc_rx->wc.byte_len = htonl(m_wr_rx->sg[0].length); - if (m_wr_rx->wr.send_flags & IBV_WR_RDMA_WRITE) - m_wc_rx->wc.opcode = htonl(IBV_WC_RDMA_WRITE); - else - m_wc_rx->wc.opcode = htonl(IBV_WC_SEND); -} - -/* convert rx wc, arrived across fabric from remote proxy-in service in network order */ -static inline void mcm_ntoh_wc_rx(struct mcm_wc_rx *m_wc_rx) -{ - m_wc_rx->wr_idx = ntohl(m_wc_rx->wr_idx); - m_wc_rx->wr_tl = ntohl(m_wc_rx->wr_tl); - 
m_wc_rx->flags = ntohl(m_wc_rx->flags); - m_wc_rx->wc.wr_id = ntohll(m_wc_rx->wc.wr_id); - m_wc_rx->wc.status = ntohl(m_wc_rx->wc.status); - m_wc_rx->wc.byte_len = ntohl(m_wc_rx->wc.byte_len); - m_wc_rx->wc.opcode = ntohl(m_wc_rx->wc.opcode); -} - -static inline void mcm_const_mix_wr(struct dat_mix_wr *mwr, struct ibv_send_wr *iwr) -{ - memset((void*)mwr, 0, sizeof(*mwr)); - mwr->wr_id = iwr->wr_id; - mwr->num_sge = iwr->num_sge; - mwr->opcode = iwr->opcode; - mwr->send_flags = iwr->send_flags; - mwr->imm_data = iwr->imm_data; - mwr->wr.rdma.remote_addr = iwr->wr.rdma.remote_addr; - mwr->wr.rdma.rkey = iwr->wr.rdma.rkey; -} - -static inline void mcm_const_ib_wc(struct ibv_wc *iwc, struct dat_mix_wc *mwc, int entries) -{ - int i; - - for (i=0;i #include #include "dat2/udat.h" -#include "dat2/dat_mic_extensions.h" -#include "mpxy.h" +#include "dapl_mic_common.h" #define min(a, b) ((a < b) ? (a) : (b)) #define max(a, b) ((a > b) ? (a) : (b)) diff --git a/dapl/svc/util.c b/dapl/svc/util.c index 5fd5d88..c2a3693 100644 --- a/dapl/svc/util.c +++ b/dapl/svc/util.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 Intel Corporation. All rights reserved. + * Copyright (c) 2012-2014 Intel Corporation. All rights reserved. * * This software is available to you under the OpenIB.org BSD license * below: -- 2.41.0