git.openfabrics.org - ~ardavis/dapl.git/commitdiff
mcm: add segmentation to HST->MXS mode for improved performance
author Arlin Davis <arlin.r.davis@intel.com>
Thu, 9 Oct 2014 22:23:24 +0000 (15:23 -0700)
committer Arlin Davis <arlin.r.davis@intel.com>
Thu, 9 Oct 2014 22:23:24 +0000 (15:23 -0700)
Signed-off-by: Arlin Davis <arlin.r.davis@intel.com>
dapl/openib_common/dapl_ib_common.h
dapl/openib_mcm/cm.c
dapl/openib_mcm/proxy.c
dapl/svc/mcm.c
dapl/svc/mix.c

index b0b346deee664fb11c01f4f08f76d8a7877b95a5..c1b9267b59609d4eaa64528171053459eb5ff31e 100644 (file)
@@ -63,6 +63,7 @@ struct dcm_ib_qp {
        struct mcm_wrc_info      wrc_rem;  /* remote WR info */
        DAPL_OS_LOCK             lock;     /* Proxy WR and WC queues */
        uint8_t                  ep_map;   /* Peer EP mapping, MXS, MSS, HST */
+       uint32_t                 seg_sz;   /* Peer MXS Proxy-in segment size */
 #endif
 };
 
index 71f73553cc32f8fce959e1adfcce2ce27195f65f..204954b241a7966f42583a14a1578f6bd63bba93 100644 (file)
@@ -749,6 +749,7 @@ dp_ib_cm_handle_t dapls_cm_create(DAPL_HCA *hca, DAPL_EP *ep)
        cm->msg.ver = htons(DAT_MCM_VER);
        cm->msg.s_id = htonl(dapl_os_getpid()); /* process id for src id */
        cm->msg.sys_guid = hca->ib_trans.sys_guid;
+       cm->msg.seg_sz = DAT_MCM_SEG_PO2;
        
        /* ACTIVE: init source address QP info from local EP */
        if (ep) {
@@ -1088,6 +1089,9 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
        }
 
        /* CM_REP: save remote address information to EP and CM */
+       if (msg->seg_sz) /* set po2 seg_sz, if provided */
+               cm->msg.seg_sz = msg->seg_sz;
+
        cm->msg.d_id = msg->s_id;
        dapl_os_memcpy(&ep->remote_ia_address, &msg->saddr2, sizeof(dat_mcm_addr_t));
        dapl_os_memcpy(&cm->msg.daddr2, &msg->saddr2, sizeof(dat_mcm_addr_t));
@@ -1185,16 +1189,18 @@ void mcm_connect_rtu(dp_ib_cm_handle_t cm, dat_mcm_msg_t *msg)
                        mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
                        mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
                        ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+                       ep->qp_handle->seg_sz = (1 << cm->msg.seg_sz);
 
                        /* post 0-byte rcv for inbound WC's via RW_imm */
                        if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
                                goto bail;
 
-                       dapl_log(DAPL_DBG_TYPE_CM, "CONN_RTU: WR_rem %p sz %d, WC %p sz %d\n",
+                       dapl_log(DAPL_DBG_TYPE_CM, "CONN_RTU: WR_rem %p sz %d, WC %p sz %d, sg %d\n",
                                 ep->qp_handle->wrc_rem.wr_addr,
                                 ep->qp_handle->wrc_rem.wr_end+1,
                                 ep->qp_handle->wrc.wc_addr,
-                                ep->qp_handle->wrc.wc_end+1);
+                                ep->qp_handle->wrc.wc_end+1,
+                                ep->qp_handle->seg_sz);
                }
        }
        dapl_os_unlock(&cm->ep->header.lock);
@@ -1265,6 +1271,8 @@ static void mcm_accept(ib_cm_srvc_handle_t cm, dat_mcm_msg_t *msg)
        acm->msg.p_size = msg->p_size;
        acm->msg.d_id = msg->s_id;
        acm->msg.rd_in = msg->rd_in;
+       if (msg->seg_sz) /* set po2 seg_sz, if provided */
+               acm->msg.seg_sz = msg->seg_sz;
 
        /* CR saddr1 is CM daddr1 info, need EP for local saddr1 */
        dapl_os_memcpy(&acm->msg.daddr1, &msg->saddr1, sizeof(dat_mcm_addr_t));
@@ -1505,19 +1513,21 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
                        mcm_create_wc_q(ep->qp_handle, ep->qp_handle->wrc_rem.wr_end + 1);
                        mcm_hton_wrc((mcm_wrc_info_t*)cm->msg.p_proxy, &ep->qp_handle->wrc);
                        ep->qp_handle->ep_map = cm->msg.daddr1.ep_map;
+                       ep->qp_handle->seg_sz = (1 << cm->msg.seg_sz);
 
                        /* post 0-byte rcv for inbound WC's via RW_imm */
                        if (mcm_post_rcv_wc(ep->qp_handle, MCM_WRC_QLEN))
                                        goto bail;
 
                        dapl_log(DAPL_DBG_TYPE_CM,
-                                "ACCEPT_USR: WR_rem %p rkey %x sz %d, WC %p rkey %x sz %d\n",
+                                "ACCEPT_USR: WR_rem %p rkey %x sz %d, WC %p rkey %x sz %d sg %d\n",
                                 ep->qp_handle->wrc_rem.wr_addr,
                                 ep->qp_handle->wrc_rem.wr_rkey,
                                 ep->qp_handle->wrc_rem.wr_end+1,
                                 ep->qp_handle->wrc.wc_addr,
                                 ep->qp_handle->wrc.wc_rkey,
-                                ep->qp_handle->wrc.wc_end+1);
+                                ep->qp_handle->wrc.wc_end+1,
+                                ep->qp_handle->seg_sz);
                }
        }
        dapl_os_unlock(&ep->header.lock);
index e81b6f1429df199c12b2e088e356d8f707dde2ce..256b42992ac9f9b006a501b76acc990f1da6081a 100644 (file)
  * non-MIC host to MIC cross socket EP needs to send WR to remote PI service
  * instead of direct IB send or write. Inbound traffic from remote MXS will still be
  * be direct so there is no need for PI service on this MCM providers host side.
- *
- * NOTE: Initial design with no segmentation, set frequent PI MP signal rate
- *      This will avoid creation and management of a local PO WR queue for segments
  */
 #define MCM_MP_SIG_RATE 5
 
-int mcm_send_pi(struct dcm_ib_qp *m_qp, int len, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
+int mcm_send_pi(struct dcm_ib_qp *m_qp,
+               int len,
+               struct ibv_send_wr *wr,
+               struct ibv_send_wr **bad_wr)
 {
        struct ibv_send_wr wr_imm;
        struct ibv_sge sge;
        struct mcm_wr_rx m_wr_rx;
-       int i, ret = 0, wr_idx;
+       int i, l_len, seg_len, ret = 0, wr_idx;
        struct wrc_idata wrc;
-       uint32_t wr_flags, offset=0;
+       uint32_t wr_flags, l_off, r_off = 0;
+       uint64_t l_addr;
 
        dapl_log(DAPL_DBG_TYPE_EP,
-                " mcm_send_pi: ep %p qpn %x ln %d WR: tl %d hd %d end %d wr_id %Lx\n",
-                m_qp->ep, m_qp->qp2->qp_num, len,  m_qp->wr_tl,
-                m_qp->wr_hd, m_qp->wrc_rem.wr_end, wr->wr_id);
+                " mcm_send_pi: ep %p qpn %x ln %d sge %d sg %d"
+                " WR: tl %d hd %d end %d wr_id %Lx\n",
+                m_qp->ep, m_qp->qp2->qp_num, len,  wr->num_sge,
+                m_qp->seg_sz, m_qp->wr_tl, m_qp->wr_hd,
+                m_qp->wrc_rem.wr_end, wr->wr_id);
 
        if (wr->num_sge > DAT_MIX_SGE_MAX) {
                ret = EINVAL;
                goto bail;
        }
+
        /* one WR per IB sge, no additional segmentation */
        for (i=0;i<wr->num_sge;i++) {
                wr_flags = M_SEND_DIRECT | M_SEND_PI;
                if (i==0) wr_flags |= M_SEND_FS;
+
+               l_len = wr->sg_list[i].length;
+               l_addr = wr->sg_list[i].addr;
+               l_off = 0;
+
                if (i==(wr->num_sge-1)) {
                        wr_flags |= M_SEND_LS;
                        if (wr->send_flags & IBV_SEND_SIGNALED)
                                wr_flags |= M_SEND_CN_SIG;
                }
-               dapl_os_lock(&m_qp->lock);
-               if (((m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end) == m_qp->wr_tl) { /* full */
-                       ret = ENOMEM;
+
+               while (l_len) {
+                       if (wr->opcode == IBV_WR_SEND)
+                               seg_len = l_len;
+                       else
+                               seg_len = (l_len > m_qp->seg_sz) ? m_qp->seg_sz : l_len;
+
+                       dapl_os_lock(&m_qp->lock);
+                       if (((m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end) == m_qp->wr_tl) { /* full */
+                               ret = ENOMEM;
+                               dapl_os_unlock(&m_qp->lock);
+                               goto bail;
+                       }
+                       m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end; /* move hd */
+                       wr_idx = m_qp->wr_hd;
+                       if (!(wr_idx % MCM_MP_SIG_RATE) || (wr_flags & M_SEND_CN_SIG))
+                               wr_flags |= M_SEND_MP_SIG;
                        dapl_os_unlock(&m_qp->lock);
-                       goto bail;
-               }
-               m_qp->wr_hd = (m_qp->wr_hd + 1) & m_qp->wrc_rem.wr_end; /* move hd */
-               wr_idx = m_qp->wr_hd;
-               if (!(wr_idx % MCM_MP_SIG_RATE) || (wr_flags & M_SEND_CN_SIG))
-                       wr_flags |= M_SEND_MP_SIG;
-               dapl_os_unlock(&m_qp->lock);
-
-               dapl_log(DAPL_DBG_TYPE_EVD,
-                        " mcm_send_pi[%d]: ln %d wr_idx %d, tl %d hd %d\n",
-                        i, wr->sg_list[i].length, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
-
-               /* build local m_wr_rx for remote PI */
-               memset((void*)&m_wr_rx, 0, sizeof(struct mcm_wr_rx));
-               m_wr_rx.org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
-               m_wr_rx.flags = htonl(wr_flags);
-               m_wr_rx.w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
-               m_wr_rx.wr.num_sge = htonl(wr->num_sge);
-               m_wr_rx.wr.opcode = htonl(wr->opcode);
-               m_wr_rx.wr.send_flags = htonl(wr->send_flags);
-               m_wr_rx.wr.imm_data = htonl(wr->imm_data);
-               m_wr_rx.sg[0].addr = htonll(wr->sg_list[i].addr);
-               m_wr_rx.sg[0].lkey = htonl(wr->sg_list[i].lkey);
-               m_wr_rx.sg[0].length = htonl(wr->sg_list[i].length);
-
-               if ((wr->opcode == IBV_WR_RDMA_WRITE) ||
-                   (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)) {
-                       m_wr_rx.wr.wr.rdma.remote_addr = htonll(wr->wr.rdma.remote_addr + offset);
-                       m_wr_rx.wr.wr.rdma.rkey = htonl(wr->wr.rdma.rkey);
-                       offset += wr->sg_list[i].length;
-               }
 
-               /* setup imm_data for PI rcv engine */
-               wrc.id = (uint16_t)wr_idx;
-               wrc.type = M_WR_TYPE;
-               wrc.flags = 0;
-
-               /* setup local WR for wr_rx transfer - RW_imm inline */
-               wr_imm.wr_id = wr->wr_id; /* MUST be original cookie, CQ processing */
-               wr_imm.next = 0;
-               wr_imm.sg_list = &sge;
-               wr_imm.num_sge = 1;
-               wr_imm.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-               wr_imm.send_flags = IBV_SEND_INLINE; /* m_wr_rx, 148 bytes */
-               if (wr_flags & M_SEND_MP_SIG)
-                       wr_imm.send_flags |= IBV_SEND_SIGNALED;
-               wr_imm.imm_data = htonl(*(uint32_t *)&wrc);
-               wr_imm.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
-               wr_imm.wr.rdma.remote_addr =
-                       (uint64_t)(uintptr_t)
-                       ((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
-
-               sge.addr = (uint64_t)(uintptr_t) &m_wr_rx;
-               sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
-               sge.lkey = 0; /* inline doesn't need registered */
-
-               dapl_log(DAPL_DBG_TYPE_EVD,
-                        " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
-                        " imm %x raddr %p rkey %x ln %d\n",
-                        i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
-                        wr_flags, ntohl(wr_imm.imm_data),
-                        wr_imm.wr.rdma.remote_addr, wr_imm.wr.rdma.rkey,
-                        sizeof(struct mcm_wr_rx));
-               dapl_log(DAPL_DBG_TYPE_EVD,
-                        " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
-                        " imm %x raddr %p rkey %x ln %d tl %d me %d hd %d\n",
-                        i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
-                        wr->send_flags, wr->imm_data, wr->wr.rdma.remote_addr,
-                        wr->wr.rdma.rkey, wr->sg_list[i].length,
-                        m_qp->wr_tl, wr_idx, m_qp->wr_hd);
-
-               ret = ibv_post_send(m_qp->qp2, &wr_imm, bad_wr);  /* QP2: QPtx - QPrx PI */
-               if (ret) {
-                       dapl_log(DAPL_DBG_TYPE_ERR,
-                               " mcm_send_pi ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
-                               " tl %d hd %d\n",
-                               m_wr_rx, wr_idx, wr->sg_list[0].addr,
-                               wr->sg_list[0].length, wr->sg_list[0].lkey,
-                               m_wr_rx.flags, m_qp->wr_tl, m_qp->wr_hd);
-                       dapl_log(DAPL_DBG_TYPE_ERR,
-                               " mcm_send_pi ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
-                               " idata 0x%x raddr %p rkey %x \n",
-                               m_wr_rx.wr.wr_id, wr->sg_list,
-                               m_wr_rx.wr.num_sge, m_wr_rx.wr.opcode,
-                               m_wr_rx.wr.send_flags, m_wr_rx.wr.imm_data,
-                               m_wr_rx.wr.wr.rdma.remote_addr,
-                               m_wr_rx.wr.wr.rdma.rkey);
-                       goto bail;
-               }
+                       dapl_log(DAPL_DBG_TYPE_EP,
+                                " mcm_send_pi[%d]: seg_ln %d wr_idx %d, tl %d hd %d\n",
+                                i, seg_len, wr_idx, m_qp->wr_tl, m_qp->wr_hd);
+
+                       /* build local m_wr_rx for remote PI */
+                       memset((void*)&m_wr_rx, 0, sizeof(struct mcm_wr_rx));
+                       m_wr_rx.org_id = (uint64_t) htonll((uint64_t)wr->wr_id);
+                       m_wr_rx.flags = htonl(wr_flags);
+                       m_wr_rx.w_idx = htonl(m_qp->wc_tl); /* snd back wc tail */
+                       m_wr_rx.wr.num_sge = htonl(wr->num_sge);
+                       m_wr_rx.wr.opcode = htonl(wr->opcode);
+                       m_wr_rx.wr.send_flags = htonl(wr->send_flags);
+                       m_wr_rx.wr.imm_data = htonl(wr->imm_data);
+                       m_wr_rx.sg[0].addr = htonll(l_addr + l_off);
+                       m_wr_rx.sg[0].lkey = htonl(wr->sg_list[i].lkey);
+                       m_wr_rx.sg[0].length = htonl(seg_len);
+
+                       if ((wr->opcode == IBV_WR_RDMA_WRITE) ||
+                           (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)) {
+                               m_wr_rx.wr.wr.rdma.remote_addr = htonll(wr->wr.rdma.remote_addr + r_off);
+                               m_wr_rx.wr.wr.rdma.rkey = htonl(wr->wr.rdma.rkey);
+                       }
+
+                       /* setup imm_data for PI rcv engine */
+                       wrc.id = (uint16_t)wr_idx;
+                       wrc.type = M_WR_TYPE;
+                       wrc.flags = 0;
+
+                       /* setup local WR for wr_rx transfer - RW_imm inline */
+                       wr_imm.wr_id = wr->wr_id; /* MUST be original cookie, CQ processing */
+                       wr_imm.next = 0;
+                       wr_imm.sg_list = &sge;
+                       wr_imm.num_sge = 1;
+                       wr_imm.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+                       wr_imm.send_flags = IBV_SEND_INLINE; /* m_wr_rx, 148 bytes */
+                       if (wr_flags & M_SEND_MP_SIG)
+                               wr_imm.send_flags |= IBV_SEND_SIGNALED;
+                       wr_imm.imm_data = htonl(*(uint32_t *)&wrc);
+                       wr_imm.wr.rdma.rkey = m_qp->wrc_rem.wr_rkey;
+                       wr_imm.wr.rdma.remote_addr =
+                               (uint64_t)(uintptr_t)
+                               ((struct mcm_wr_rx *) (m_qp->wrc_rem.wr_addr + (m_qp->wrc_rem.wr_sz * wr_idx)));
+
+                       sge.addr = (uint64_t)(uintptr_t) &m_wr_rx;
+                       sge.length = (uint32_t) sizeof(struct mcm_wr_rx); /* 160 byte WR */
+                       sge.lkey = 0; /* inline doesn't need registered */
+
+                       dapl_log(DAPL_DBG_TYPE_EP,
+                                " mcm_send_pi[%d]: WR_RX wr_id %Lx qn %x op %d flgs 0x%x"
+                                " imm %x laddr %p raddr %p rkey %x wr_ln %d ln %d\n",
+                                i, wr_imm.wr_id, m_qp->qp2->qp_num, wr_imm.opcode,
+                                wr_flags, ntohl(wr_imm.imm_data),
+                                l_addr + l_off, wr_imm.wr.rdma.remote_addr,
+                                wr_imm.wr.rdma.rkey, sizeof(struct mcm_wr_rx), l_len);
+                       dapl_log(DAPL_DBG_TYPE_EP,
+                                " mcm_send_pi[%d]: WR wr_id %Lx qn %x op %d flgs %x"
+                                " imm %x raddr %p rkey %x sg_ln %d tl %d me %d hd %d\n",
+                                i, wr->wr_id, m_qp->qp2->qp_num, wr->opcode,
+                                wr->send_flags, wr->imm_data,
+                                wr->wr.rdma.remote_addr + r_off,
+                                wr->wr.rdma.rkey, seg_len, m_qp->wr_tl, wr_idx, m_qp->wr_hd);
+
+                       ret = ibv_post_send(m_qp->qp2, &wr_imm, bad_wr);  /* QP2: QPtx - QPrx PI */
+                       if (ret) {
+                               dapl_log(DAPL_DBG_TYPE_ERR,
+                                       " mcm_send_pi ERR: m_wr %p idx %d laddr=%p ln=%d lkey=%x flgs %x"
+                                       " tl %d hd %d\n",
+                                       m_wr_rx, wr_idx, wr->sg_list[0].addr,
+                                       wr->sg_list[0].length, wr->sg_list[0].lkey,
+                                       m_wr_rx.flags, m_qp->wr_tl, m_qp->wr_hd);
+                               dapl_log(DAPL_DBG_TYPE_ERR,
+                                       " mcm_send_pi ERR: wr_id %Lx %p sglist %p sge %d op %d flgs %x"
+                                       " idata 0x%x raddr %p rkey %x \n",
+                                       m_wr_rx.wr.wr_id, wr->sg_list,
+                                       m_wr_rx.wr.num_sge, m_wr_rx.wr.opcode,
+                                       m_wr_rx.wr.send_flags, m_wr_rx.wr.imm_data,
+                                       m_wr_rx.wr.wr.rdma.remote_addr,
+                                       m_wr_rx.wr.wr.rdma.rkey);
+                               goto bail;
+                       }
+                       l_len -= seg_len;
+                       l_off += seg_len;
+                       r_off += seg_len;
+
+               } /* wire segmentation of each IOV segment */
        }
 bail:
        return ret;
index c36948a93d9e21b795592199fd163dcbc196acd8..31749deca7f873f7d82e3b5b6f72d55b735872ba 100644 (file)
@@ -50,6 +50,7 @@ int mcm_rtu_ms = 2000;
 int mcm_dreq_ms = 1000;
 int mcm_proxy_in = 1;
 
+extern int mix_buffer_sg_po2;
 extern int mcm_rx_entries;
 extern uint64_t system_guid;
 extern char gid_str[INET6_ADDRSTRLEN];
@@ -531,6 +532,7 @@ mcm_cm_t *m_cm_create(mcm_scif_dev_t *smd, mcm_qp_t *m_qp, dat_mcm_addr_t *r_add
        cm->md = smd->md;
        cm->msg.ver = htons(DAT_MCM_VER);
        cm->msg.sqpn = smd->md->addr.qpn; /* ucm, in network order */
+       cm->msg.seg_sz = mix_buffer_sg_po2;
 #ifdef MPXYD_LOCAL_SUPPORT
        cm->msg.sys_guid = system_guid; /* network order */
 #else
index 2ad93cd76938440f13096d4fdac4bb3879af7874..f9058d97e1cebd878e303debc43fbb1c046ba738 100644 (file)
@@ -1661,12 +1661,13 @@ static int mix_cm_rep_out(mcm_scif_dev_t *smd, dat_mix_cm_t *pmsg, scif_epd_t sc
        mcm_ntoh_wrc(&m_cm->m_qp->wrc_rem, (mcm_wrc_info_t *)m_cm->msg.p_proxy); /* save peer PI WRC info */
 
        mlog(2, " WRC: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
-               " WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",
+               " WC 0x%Lx rkey 0x%x ln %d, sz %d end %d sg_po2 %d\n",
             m_cm->m_qp, m_cm->m_qp->wrc.wr_addr, m_cm->m_qp->wrc.wr_rkey,
             m_cm->m_qp->wrc.wr_len, m_cm->m_qp->wrc.wr_sz,
             m_cm->m_qp->wrc.wr_end, m_cm->m_qp->wrc.wc_addr,
             m_cm->m_qp->wrc.wc_rkey, m_cm->m_qp->wrc.wc_len,
-            m_cm->m_qp->wrc.wc_sz, m_cm->m_qp->wrc.wc_end);
+            m_cm->m_qp->wrc.wc_sz, m_cm->m_qp->wrc.wc_end,
+            m_cm->msg.seg_sz);
 
        mlog(2, " WRC_rem: m_qp %p - WR 0x%Lx rkey 0x%x ln %d, sz %d end %d"
                " WC 0x%Lx rkey 0x%x ln %d, sz %d end %d\n",