From 5dc60f1aba17becf0788bcb519b17a47dc4be69c Mon Sep 17 00:00:00 2001 From: Arlin Davis Date: Sun, 7 Aug 2011 22:06:09 -0700 Subject: [PATCH] dat: add definitions for MPI offloaded collectives in IB transport extensions The collective extensions are designed to support MPI and general multicast operations over IB fabrics that support offloaded collectives. Where feasible, they come as close to MPI semantics as possible. Unless otherwise stated, all members participating in a data collective operation must call the associated collective routine for the data transfer operation to complete. Unless otherwise stated, the root collective member of a data operation will receive its own portion of the collective data. In most cases, the root member can prevent sending/receiving data when such operations would be redundant. When root data is already "in place" the root member may set the send and/or receive buffer pointer argument to NULL. Unlike standard DAPL movement operations that require registered memory and LMR objects, collective data movement operations employ pointers to user-virtual address space that do not require pre-registration by the application. From a resource usage point of view, the API user should consider that the provider implementation my perform memory registrations/deregistration on behalf of the application to accomplish a data transfer. Most collective calls are asynchronous. Upon completion, an event will be posted to the EVD specified when the collective was created. Signed-off-by: Arlin Davis --- dat/include/dat2/dat_ib_extensions.h | 685 ++++++++++++++++++++++++--- 1 file changed, 624 insertions(+), 61 deletions(-) diff --git a/dat/include/dat2/dat_ib_extensions.h b/dat/include/dat2/dat_ib_extensions.h index a32a4ed..ac69fed 100755 --- a/dat/include/dat2/dat_ib_extensions.h +++ b/dat/include/dat2/dat_ib_extensions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Intel Corporation. All rights reserved. 
+ * Copyright (c) 2007-2011 Intel Corporation. All rights reserved. * * This Software is licensed under one of the following licenses: * @@ -72,15 +72,36 @@ * * 2.0.4 - Add DAT_IB_UD_CONNECTION_REJECT_EVENT extended UD event * 2.0.5 - Add DAT_IB_UD extended UD connection error events + * 2.0.6 - Add MPI over IB collective extensions * */ -#define DAT_IB_EXTENSION_VERSION 205 /* 2.0.5 */ -#define DAT_ATTR_COUNTERS "DAT_COUNTERS" +#define DAT_IB_EXTENSION_VERSION 206 /* 2.0.6 */ +#define DAT_IB_ATTR_COUNTERS "DAT_COUNTERS" #define DAT_IB_ATTR_FETCH_AND_ADD "DAT_IB_FETCH_AND_ADD" #define DAT_IB_ATTR_CMP_AND_SWAP "DAT_IB_CMP_AND_SWAP" #define DAT_IB_ATTR_IMMED_DATA "DAT_IB_IMMED_DATA" #define DAT_IB_ATTR_UD "DAT_IB_UD" +#define DAT_IB_COLL_SET_CLOCK "DAT_COLL_SET_CLOCK" +#define DAT_IB_COLL_READ_CLOCK "DAT_COLL_READ_CLOCK" +#define DAT_IB_COLL_BROADCAST "DAT_COLL_BROADCAST" +#define DAT_IB_COLL_BARRIER "DAT_COLL_BARRIER" +#define DAT_IB_COLL_SCATTER "DAT_COLL_SCATTER" +#define DAT_IB_COLL_SCATTERV "DAT_COLL_SCATTERV" +#define DAT_IB_COLL_GATHER "DAT_COLL_GATHER" +#define DAT_IB_COLL_GATHERV "DAT_COLL_GATHERV" +#define DAT_IB_COLL_ALLGATHER "DAT_COLL_ALLGATHER" +#define DAT_IB_COLL_ALLGATHERV "DAT_COLL_ALLGATHERV" +#define DAT_IB_COLL_ALLTOALL "DAT_COLL_ALLTOALL" +#define DAT_IB_COLL_ALLTOALLV "DAT_COLL_ALLTOALLV" +#define DAT_IB_COLL_REDUCE "DAT_COLL_REDUCE" +#define DAT_IB_COLL_ALLREDUCE "DAT_COLL_ALLREDUCE" +#define DAT_IB_COLL_REDUCE_SCATTER "DAT_COLL_REDUCE_SCATTER" +#define DAT_IB_COLL_SCAN "DAT_COLL_SCAN" + +/* Collective handle */ +typedef DAT_HANDLE DAT_IB_COLLECTIVE_HANDLE; + /* * Definition for extended EVENT numbers, DAT_IB_EXTENSION_BASE_RANGE * is used by these extensions as a starting point for extended event numbers @@ -94,7 +115,8 @@ typedef enum dat_ib_event_number DAT_IB_UD_CONNECTION_REQUEST_EVENT, DAT_IB_UD_CONNECTION_EVENT_ESTABLISHED, DAT_IB_UD_CONNECTION_REJECT_EVENT, - DAT_IB_UD_CONNECTION_ERROR_EVENT + DAT_IB_UD_CONNECTION_ERROR_EVENT, + 
DAT_IB_COLLECTIVE_EVENT, } DAT_IB_EVENT_NUMBER; @@ -107,8 +129,28 @@ typedef enum dat_ib_op DAT_IB_CMP_AND_SWAP_OP, DAT_IB_RDMA_WRITE_IMMED_OP, DAT_IB_UD_SEND_OP, - DAT_QUERY_COUNTERS_OP, - DAT_PRINT_COUNTERS_OP + DAT_IB_QUERY_COUNTERS_OP, + DAT_IB_PRINT_COUNTERS_OP, + DAT_IB_COLLECTIVE_CREATE_MEMBER_OP, + DAT_IB_COLLECTIVE_FREE_MEMBER_OP, + DAT_IB_COLLECTIVE_CREATE_GROUP_OP, + DAT_IB_COLLECTIVE_FREE_GROUP_OP, + DAT_IB_COLLECTIVE_SET_CLOCK_OP, + DAT_IB_COLLECTIVE_READ_CLOCK_OP, + DAT_IB_COLLECTIVE_SCATTER_OP, + DAT_IB_COLLECTIVE_SCATTERV_OP, + DAT_IB_COLLECTIVE_GATHER_OP, + DAT_IB_COLLECTIVE_GATHERV_OP, + DAT_IB_COLLECTIVE_ALLGATHER_OP, + DAT_IB_COLLECTIVE_ALLGATHERV_OP, + DAT_IB_COLLECTIVE_ALLTOALL_OP, + DAT_IB_COLLECTIVE_ALLTOALLV_OP, + DAT_IB_COLLECTIVE_REDUCE_OP, + DAT_IB_COLLECTIVE_ALLREDUCE_OP, + DAT_IB_COLLECTIVE_REDUCE_SCATTER_OP, + DAT_IB_COLLECTIVE_SCAN_OP, + DAT_IB_COLLECTIVE_BROADCAST_OP, + DAT_IB_COLLECTIVE_BARRIER_OP, } DAT_IB_OP; @@ -135,6 +177,24 @@ typedef enum dat_ib_ext_type DAT_IB_UD_CONNECT_REJECT, // 10 DAT_IB_UD_CONNECT_ERROR, // 11 + DAT_IB_COLLECTIVE_CREATE_STATUS, // 12 + DAT_IB_COLLECTIVE_CREATE_DATA, // 13 + DAT_IB_COLLECTIVE_CLOCK_SET_STATUS, // 14 + DAT_IB_COLLECTIVE_SCATTER_STATUS, // 15 + DAT_IB_COLLECTIVE_SCATTERV_STATUS, // 16 + DAT_IB_COLLECTIVE_GATHER_STATUS, // 17 + DAT_IB_COLLECTIVE_GATHERV_STATUS, // 18 + DAT_IB_COLLECTIVE_ALLGATHER_STATUS, // 19 + DAT_IB_COLLECTIVE_ALLGATHERV_STATUS, // 20 + DAT_IB_COLLECTIVE_ALLTOALL_STATUS, // 21 + DAT_IB_COLLECTIVE_ALLTOALLV_STATUS, // 22 + DAT_IB_COLLECTIVE_REDUCE_STATUS, // 23 + DAT_IB_COLLECTIVE_ALLREDUCE_STATUS, // 24 + DAT_IB_COLLECTIVE_REDUCE_SCATTER_STATUS,// 25 + DAT_IB_COLLECTIVE_SCAN_STATUS, // 26 + DAT_IB_COLLECTIVE_BROADCAST_STATUS, // 27 + DAT_IB_COLLECTIVE_BARRIER_STATUS, // 28 + } DAT_IB_EXT_TYPE; /* @@ -144,10 +204,10 @@ typedef enum dat_ib_status { DAT_OP_SUCCESS = DAT_SUCCESS, DAT_IB_OP_ERR, + DAT_IB_COLL_COMP_ERR, } DAT_IB_STATUS; - /* * Definitions for additional 
extension type RETURN codes above * standard DAT types. Included with standard DAT_TYPE_STATUS @@ -156,6 +216,7 @@ typedef enum dat_ib_status typedef enum dat_ib_return { DAT_IB_ERR = DAT_EXTENSION_BASE, + DAT_IB_COLLECTIVE_ERR } DAT_IB_RETURN; @@ -173,7 +234,8 @@ typedef enum dat_ib_dtos DAT_IB_DTO_SEND_UD, DAT_IB_DTO_RECV_UD, DAT_IB_DTO_RECV_UD_IMMED, - + DAT_IB_DTO_COLLECTIVES, + } DAT_IB_DTOS; /* @@ -184,6 +246,7 @@ typedef enum dat_ib_dtos typedef enum dat_ib_handle_type { DAT_IB_HANDLE_TYPE_EXT = DAT_HANDLE_TYPE_EXTENSION_BASE, + DAT_IB_HANDLE_TYPE_COLLECTIVE } DAT_IB_HANDLE_TYPE; @@ -221,14 +284,8 @@ typedef struct dat_ib_addr_handle } DAT_IB_ADDR_HANDLE; -/* - * Definitions for extended event data: - * When dat_event->event_number >= DAT_IB_EXTENSION_BASE_RANGE - * then dat_event->extension_data == DAT_IB_EXT_EVENT_DATA type - * and ((DAT_IB_EXT_EVENT_DATA*)dat_event->extension_data)->type - * specifies extension data values. - * NOTE: DAT_IB_EXT_EVENT_DATA cannot exceed 64 bytes as defined by - * "DAT_UINT64 extension_data[8]" in DAT_EVENT (dat.h) +/* + * Definition for the value field of extended event that contains immediate data */ typedef struct dat_ib_immed_data { @@ -236,13 +293,21 @@ typedef struct dat_ib_immed_data } DAT_IB_IMMED_DATA; +/* definition for IB collective event data */ +typedef struct dat_ib_collective_event_data +{ + DAT_HANDLE handle; + DAT_CONTEXT context; + +} DAT_IB_COLLECTIVE_EVENT_DATA; + /* * Definitions for extended event data: * When dat_event->event_number >= DAT_IB_EXTENSION_BASE_RANGE - * then dat_event->extension_data == DAT_EXTENSION_EVENT_DATA type - * and ((DAT_EXTENSION_EVENT_DATA*)dat_event->extension_data)->type + * then dat_event->extension_data == DAT_IB_EXTENSION_EVENT_DATA type + * and ((DAT_IB_EXTENSION_EVENT_DATA*)dat_event->extension_data)->type * specifies extension data values. 
- * NOTE: DAT_EXTENSION_EVENT_DATA cannot exceed 64 bytes as defined by + * NOTE: DAT_IB_EXTENSION_EVENT_DATA cannot exceed 64 bytes as defined by * "DAT_UINT64 extension_data[8]" in DAT_EVENT (dat.h) * * Provide UD address handles via extended connect establishment. @@ -255,7 +320,10 @@ typedef struct dat_ib_extension_event_data union { DAT_IB_IMMED_DATA immed; } val; - DAT_IB_ADDR_HANDLE remote_ah; + union { + DAT_IB_ADDR_HANDLE remote_ah; + DAT_IB_COLLECTIVE_EVENT_DATA coll; + }; } DAT_IB_EXTENSION_EVENT_DATA; @@ -357,6 +425,71 @@ typedef enum dat_evd_counters } DAT_EVD_COUNTERS; +/* + * Data type for reduce operations + */ +typedef enum dat_ib_collective_data_type +{ + DAT_IB_COLLECTIVE_TYPE_INT8, + DAT_IB_COLLECTIVE_TYPE_UINT8, + DAT_IB_COLLECTIVE_TYPE_INT16, + DAT_IB_COLLECTIVE_TYPE_UINT16, + DAT_IB_COLLECTIVE_TYPE_INT32, + DAT_IB_COLLECTIVE_TYPE_UINT32, + DAT_IB_COLLECTIVE_TYPE_INT64, + DAT_IB_COLLECTIVE_TYPE_UINT64, + DAT_IB_COLLECTIVE_TYPE_FLOAT, + DAT_IB_COLLECTIVE_TYPE_DOUBLE, + DAT_IB_COLLECTIVE_TYPE_LONG_DOUBLE, + DAT_IB_COLLECTIVE_TYPE_SHORT_INT, + DAT_IB_COLLECTIVE_TYPE_2INT, + DAT_IB_COLLECTIVE_TYPE_FLOAT_INT, + DAT_IB_COLLECTIVE_TYPE_LONG_INT, + DAT_IB_COLLECTIVE_TYPE_DOUBLE_INT, + +} DAT_IB_COLLECTIVE_DATA_TYPE; + +/* + * Opcode for reduce operations + */ +typedef enum dat_ib_collective_reduce_data_op +{ + DAT_IB_COLLECTIVE_REDUCE_OP_MAX, + DAT_IB_COLLECTIVE_REDUCE_OP_MIN, + DAT_IB_COLLECTIVE_REDUCE_OP_SUM, + DAT_IB_COLLECTIVE_REDUCE_OP_PROD, + DAT_IB_COLLECTIVE_REDUCE_OP_LAND, + DAT_IB_COLLECTIVE_REDUCE_OP_BAND, + DAT_IB_COLLECTIVE_REDUCE_OP_LOR, + DAT_IB_COLLECTIVE_REDUCE_OP_BOR, + DAT_IB_COLLECTIVE_REDUCE_OP_LXOR, + DAT_IB_COLLECTIVE_REDUCE_OP_BXOR, + DAT_IB_COLLECTIVE_REDUCE_OP_MAXLOC, + DAT_IB_COLLECTIVE_REDUCE_OP_MINLOC + +} DAT_IB_COLLECTIVE_REDUCE_DATA_OP; + +/* + * For group creation + */ +typedef unsigned int DAT_IB_COLLECTIVE_RANK; +typedef unsigned int DAT_IB_COLLECTIVE_ID; +typedef void * DAT_IB_COLLECTIVE_MEMBER; + +typedef struct 
dat_ib_collective_group +{ + int local_size; /* # of processes on this node */ + int local_rank; /* my rank within the node */ + int *local_ranks; /* global rank for each local process */ + int external_size; /* # of nodes, each node has exactly one external process (local root) */ + int external_rank; /* my rank among all external processes if one of them, otherwise -1 */ + int *external_ranks; /* global rank for each external process */ + int *intranode_table; /* mapping from global rank to local rank. -1 if the process is on a different node */ + int *internode_table; /* mapping from global rank to external rank. -1 if the process is >not external */ + int is_comm_world; + +} DAT_IB_COLLECTIVE_GROUP; + /* Extended RETURN and EVENT STATUS string helper functions */ /* DAT_EXT_RETURN error to string */ @@ -397,6 +530,9 @@ dat_strerror_ext_status ( /* * Extended IB transport specific APIs * redirection via DAT extension function + * va_arg function: DAT_HANDLE and OP type MUST be first 2 parameters + * + * RETURN VALUE: DAT_RETURN */ /* @@ -406,13 +542,14 @@ dat_strerror_ext_status ( * and the result is stored in the local_iov. */ #define dat_ib_post_fetch_and_add(ep, add_val, lbuf, cookie, rbuf, flgs) \ - dat_extension_op( ep, \ - DAT_IB_FETCH_AND_ADD_OP, \ - (add_val), \ - (lbuf), \ - (cookie), \ - (rbuf), \ - (flgs)) + dat_extension_op(\ + IN (DAT_EP_HANDLE) (ep), \ + IN (DAT_IB_OP) DAT_IB_FETCH_AND_ADD_OP, \ + IN (DAT_UINT64) (add_val), \ + IN (DAT_LMR_TRIPLET *) (lbuf), \ + IN (cookie), \ + IN (DAT_RMR_TRIPLET *) (rbuf), \ + IN (DAT_COMPLETION_FLAGS) (flgs)) /* * This asynchronous call is modeled after the InfiniBand atomic @@ -423,14 +560,15 @@ dat_strerror_ext_status ( * value stored in the remote memory location is copied to the local_iov. 
*/ #define dat_ib_post_cmp_and_swap(ep, cmp_val, swap_val, lbuf, cookie, rbuf, flgs) \ - dat_extension_op( ep, \ - DAT_IB_CMP_AND_SWAP_OP, \ - (cmp_val), \ - (swap_val), \ - (lbuf), \ - (cookie), \ - (rbuf), \ - (flgs)) + dat_extension_op(\ + IN (DAT_EP_HANDLE) (ep), \ + IN (DAT_IB_OP) DAT_IB_CMP_AND_SWAP_OP, \ + IN (DAT_UINT64) (cmp_val), \ + IN (DAT_UINT64) (swap_val), \ + IN (DAT_LMR_TRIPLET *) (lbuf), \ + IN (cookie), \ + IN (DAT_RMR_TRIPLET *) (rbuf), \ + IN (DAT_COMPLETION_FLAGS) (flgs)) /* * RDMA Write with IMMEDIATE: @@ -449,14 +587,15 @@ dat_strerror_ext_status ( * n/a */ #define dat_ib_post_rdma_write_immed(ep, size, lbuf, cookie, rbuf, idata, flgs) \ - dat_extension_op( ep, \ - DAT_IB_RDMA_WRITE_IMMED_OP, \ - (size), \ - (lbuf), \ - (cookie), \ - (rbuf), \ - (idata), \ - (flgs)) + dat_extension_op(\ + IN (DAT_EP_HANDLE) (ep), \ + IN (DAT_IB_OP) DAT_IB_RDMA_WRITE_IMMED_OP, \ + IN (DAT_COUNT) (size), \ + IN (DAT_LMR_TRIPLET *) (lbuf), \ + IN (cookie), \ + IN (DAT_RMR_TRIPLET *) (rbuf), \ + IN (DAT_UINT32) (idata), \ + IN (DAT_COMPLETION_FLAGS) (flgs)) /* * Unreliable datagram: msg send @@ -471,14 +610,21 @@ dat_strerror_ext_status ( * n/a */ #define dat_ib_post_send_ud(ep, segments, lbuf, ah_ptr, cookie, flgs) \ - dat_extension_op( ep, \ - DAT_IB_UD_SEND_OP, \ - (segments), \ - (lbuf), \ - (ah_ptr), \ - (cookie), \ - (flgs)) + dat_extension_op(\ + IN (DAT_EP_HANDLE) (ep), \ + IN (DAT_IB_OP) DAT_IB_UD_SEND_OP, \ + IN (DAT_COUNT) (segments), \ + IN (DAT_LMR_TRIPLET *) (lbuf), \ + IN (DAT_IB_ADDR_HANDLE *) (ah_ptr), \ + IN (cookie), \ + IN (DAT_COMPLETION_FLAGS) (flgs)) +/* + * Unreliable datagram: msg recv + * + * Mapping to standard EP post call. 
+ */ +#define dat_ib_post_recv_ud dat_ep_post_recv /* * Query counter(s): @@ -487,12 +633,13 @@ dat_strerror_ext_status ( * * use _ALL_COUNTERS to query all */ -#define dat_query_counters(dat_handle, cntr, p_cntrs_out, reset) \ - dat_extension_op( dat_handle, \ - DAT_QUERY_COUNTERS_OP, \ - (cntr), \ - (p_cntrs_out), \ - (reset)) +#define dat_ib_query_counters(dat_handle, cntr, p_cntrs_out, reset) \ + dat_extension_op(\ + IN (DAT_HANDLE) dat_handle, \ + IN (DAT_IB_OP) DAT_QUERY_COUNTERS_OP, \ + IN (int) (cntr), \ + IN (DAT_UINT64 *) (p_cntrs_out), \ + IN (int) (reset)) /* * Print counter(s): * Provide IA, EP, or EVD and call will print appropriate counters @@ -500,11 +647,427 @@ dat_strerror_ext_status ( * * use _ALL_COUNTERS to print all */ -#define dat_print_counters(dat_handle, cntr, reset) \ - dat_extension_op( dat_handle, \ - DAT_PRINT_COUNTERS_OP, \ - (cntr), \ - (reset)) +#define dat_ib_print_counters(dat_handle, cntr, reset) \ + dat_extension_op(\ + IN (DAT_HANDLE) dat_handle, \ + IN (DAT_IB_OP) DAT_PRINT_COUNTERS_OP, \ + IN (int) (cntr), \ + IN (int) (reset)) + +/* + ************************ MPI IB Collective Functions *********************** + */ + +/* MPI collective member and group setup functions */ + +/* + * This synchronous call creates and returns local member + * address information for a collective device or provider + * for each rank. The size of the member address information + * is dependent on the collective device or provider. + * This address information, for each rank, must be exchanged + * and used for group creation on all ranks. 
+ */ +#define dat_ib_collective_create_member(ia_handle, progress_func, member, member_size) \ + dat_extension_op(\ + IN (DAT_IA_HANDLE) (ia_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_CREATE_MEMBER_OP, \ + IN (void *) (progress_func), \ + OUT (DAT_IB_COLLECTIVE_MEMBER *) (member), \ + OUT (DAT_UINT32 *) (member_size)) + +/* + * This synchronous call destroys a previously created member + * information associated with this device ia_handle argument. + */ +#define dat_ib_collective_free_member(ia_handle, member) \ + dat_extension_op(\ + IN (DAT_IA_HANDLE) (ia_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_FREE_MEMBER_OP, \ + IN (DAT_IB_COLLECTIVE_MEMBER) (member)) + +/* + * This asynchronous call initiates the process of creating a collective + * group and must be called by all group members. The collective_group + * argument points to an array of address/connection qualifier pairs that + * identify the members of the group in rank order. The group_size argument + * specifies the size of the group and therefore the size of the coll_group + * array. The self argument identifies the rank of the caller. + * The group_id argument specifies a network-unique identifier for this + * instance of the collective group. The group_info provides global and local + * rank and process information. All members of the group must specify + * the same group_id value for the same collective instance. The evd_handle + * argument specifies the EVD used for all asynchronous collective completions + * including this call. The user_context argument will be returned in the + * DAT_IB_COLLECTIVE_CREATE_DATA event. + * + * On a successful completion, each group member will receive a + * DAT_IB_COLLECTIVE_CREATE_DATA event on the EVD specified by evd_handle. + * The event contains the collective handle, the rank of the receiving + * Endpoint within the collective group, the size of the group, and the + * caller specified user_context. 
The returned collective handle can be used + * in network clock, Multicast, and other collective operations. + * + * RETURN VALUE: DAT_RETURN + */ +#define dat_ib_collective_create_group(members, group_size, self, group_id, group_info, evd, pd, user_context) \ + dat_extension_op(\ + IN (DAT_EVD_HANDLE) (evd), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_CREATE_GROUP_OP, \ + IN (DAT_IB_COLLECTIVE_MEMBER *) (members), \ + IN (DAT_COUNT) (group_size), \ + IN (DAT_IB_COLLECTIVE_RANK) (self), \ + IN (DAT_IB_COLLECTIVE_ID) (group_id), \ + IN (DAT_IB_COLLECTIVE_GROUP *) (group_info), \ + IN (DAT_PZ_HANDLE) (pd), \ + IN (DAT_CONTEXT) (user_context)) + +/* + * This synchronous call destroys a previously created collective group + * associated with the collective_handle argument. Any pending or + * in-process requests associated with the collective group will be + * terminated and be posted to the appropriate EVD. + * + * RETURN VALUE: DAT_RETURN + */ +#define dat_ib_collective_free_group(coll_handle) \ + dat_extension_op(\ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_FREE_GROUP_OP) + + +/* MPI collective data operations */ + +/* + * This call sets the network clock associated with + * collective_handle. A provider implementation may keep a single + * global clock for all collective handles. When this is the case, + * this call sets an adjustment for the given handle so that + * subsequent calls to read the clock will be relative to the value + * specified by clock_value. This is an asynchronous call that + * completes on the collective EVD. The network clock will not be + * synchronized until the request is completed. Any member of the + * collective can set the clock and only one member should make + * this call on behalf of the entire collective. 
+ */ +#define dat_ib_collective_set_clock(coll_handle, clock_value, user_context ) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_SET_CLOCK_OP, \ + IN (DAT_UINT64) (clock_value), \ + IN (DAT_CONTEXT) (user_context)) + +/* + * This synchronous call returns the current value of the network clock + * associated with the given collective handle. This is a light weight + * call to minimize skew + */ +#define dat_ib_collective_read_clock(coll_handle, clock_value ) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_READ_CLOCK_OP, \ + OUT (DAT_UINT64 *) (clock_value)) + +/* + * This call performs a scatter of the data specified by the + * send_buffer argument to the collective group specified by coll_handle. + * Data is received in the buffer specified by the recv_buffer argument. + * The recv_byte_count argument specifies the size of the receive buffer. + * Data from the root send_buffer will be divided by the number of members + * in the collective group to form equal and contiguous memory partitions. + * Each member of the collective group will receive its rank relative + * partition. An error is returned if the send_byte_count does not describe + * memory that can be evenly divided by the size of the collective group. + * An "in place" transfer for the root rank can be indicated by passing NULL + * as the recv_buffer argument. The send_buffer and send_byte_count + * arguments are ignored on non-root members. The operation is completed on + * the collective EVD unless completions are suppressed through the + * completion flags. 
+ */ +#define dat_ib_collective_scatter(coll_handle, sendbuf, sendsize, recvbuf, recvsize, root, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_SCATTER_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call performs a non-uniform scatter of the data + * specified by the send_buffers array argument to the collective group + * specified by coll_handle. The send_buffers array contains one buffer + * pointer for each member of the collective group, in rank order. + * The send_byte_counts array contains a byte count for each corresponding + * send buffer pointer. The recv_buffer and recev_byte_count arguments + * specify where received portions of the scatter are to be received. + * An "in place" transfer for the root rank can be indicated by passing + * NULL as the recv_buffer argument. The send_buffers and send_byte_counts + * arguments are ignored on non-root members. The operation is completed + * on the collective EVD unless completions are suppressed through the + * completion flags. + * + */ +#define dat_ib_collective_scatterv(coll_handle, sendbuf, sendsizes, displs, recvbuf, recvsize, root, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_SCATTERV_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT *) (sendsizes), \ + IN (DAT_COUNT *) (displs), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call performs a gather of the data sent by all + * members of the collective specified by the collective_handle argument. 
+ * The data to be sent is specified by the send_buffer and send_byte_count + * arguments. Data is received by the collective member specified by the + * root argument in the buffer specified by the recv_buffer and + * recv_byte_count arguments. Data is placed into the receive buffer in + * collective rank order. An "in place" transfer for the root rank can + * be indicated by passing NULL as the send_buffer argument. + * The recv_buffer and recv_byte_count arguments are ignored on non-root + * members. The operation is completed on the collective EVD unless + * completions are suppressed through the completion flags. + */ +#define dat_ib_collective_gather(coll_handle, sendbuf, sendsize, recvbuf, recvsize, root, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_GATHER_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS)(flags)) + +/* + * This call performs a non-uniform gather of the data sent by + * all members of the collective specified by the collective_handle argument. + * The data to be sent is specified by the send_buffer and send_byte_count + * arguments. Data is received by the collective member specified by the + * root argument into the buffers specified by the recv_buffers and + * recv_byte_counts array arguments. Data is placed into the receive buffer + * associated with the rank that sent it. An "in place" transfer for the root + * rank can be indicated by passing NULL as the send_buffer argument. + * The recv_buffers and recv_byte_counts arguments are ignored on non-root + * members. The operation is completed on the collective EVD unless + * completions are suppressed through the completion flags. 
+ */ +#define dat_ib_collective_gatherv(coll_handle, sendbuf, sendsize, recvbufs, recvsizes, displs, root, user_context, flags) \ + dat_extension_op( \ + (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_GATHERV_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbufs), \ + IN (DAT_COUNT *) (recvsizes), \ + IN (DAT_COUNT *) (displs), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call is equivalent to having all members of a collective + * group perform a dat_collective_gather() as the root. This results in all + * members of the collective having identical contents in their receive buffer + */ +#define dat_ib_collective_allgather(coll_handle, sendbuf, sendsize, recvbuf, recvsize, user_context, flags) \ + dat_extension_op( \ + (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_ALLGATHER_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call performs a non-uniform dat_collective_allgather() + * operation. It is equivalent to having all members of a collective group + * perform a dat_collective_gatherv() as the root. This results in all + * members of the collective having identical contents in their receive + * buffer. 
+ */ +#define dat_ib_collective_allgatherv(coll_handle, sendbuf, sendsize, recvbuf, recvsizes, displs, user_context, flags) \ + dat_extension_op( \ + (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_ALLGATHERV_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT *) (recvsizes), \ + IN (DAT_COUNT *) (displs), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call is an extension of dat_collective_allgather() + * to the case where each member sends distinct data specified by send_buffer + * to each of the other members. The jth block sent from rank i is received + * by rank j and is placed in the ith block of recv_buffer. + */ +#define dat_ib_collective_alltoall(coll_handle, sendbuf, sendsize, recvbuf, recvsize, user_context, flags) \ + dat_extension_op( \ + (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_ALLTOALL_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call performs a non-uniform dat_collective_alltoall() operation + */ +#define dat_ib_collective_alltoallv(coll_handle, sendbuf, sendsizes, senddispls, recvbuf, recvsizes, recvdispls, user_context, flags) \ + dat_extension_op( \ + (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_ALLTOALLV_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT *) (sendsizes), \ + IN (DAT_COUNT *) (senddispls), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT *) (recvsizes), \ + IN (DAT_COUNT *) (recvdispls), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call combines the elements of the data type specified + * by data_type from the buffer specified by send_buffer of all members of + * the collective by performing the operation specified 
by reduce_operation + * and placing the result into the buffer of the root member specified by + * recv_buffer. It is an error to specify a floating point type with + * any of the logical reduction operators.When using the REDUCE_OP_MINLOC + * and REDUCE_OP _MAXLOC operations, it is assumed that the input and output + * buffers contain pair values where the first member of the pair is of the + * type specified by data_type followed by a COLLECTIVE_TYPE_UINT32 type. + * When the reduction is complete, the receive buffer will contain the + * MIN/MAX value in the first member of the pair with the first member rank + * that contained it in the second member of the pair. The tables below + * show the result of a REDUCE_OP_SUM reduce operation. + */ +#define dat_ib_collective_reduce(coll_handle, sendbuf, sendsize, recvbuf, recvsize, op, type, root, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE)(coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_REDUCE_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_REDUCE_DATA_OP) (op), \ + IN (DAT_IB_COLLECTIVE_DATA_TYPE) (type), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call is identical to the dat_collective_reduce() + * call with the exception that the recv_buffer and recv_byte_count arguments + * are valid for all members of the collective and all members of will + * receive the reduction results. 
+ */ +#define dat_ib_collective_allreduce(coll_handle, sendbuf, sendsize, recvbuf, recvsize, op, type, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_ALLREDUCE_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_REDUCE_DATA_OP) (op), \ + IN (DAT_IB_COLLECTIVE_DATA_TYPE) (type), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) +/* + * This call is identical to rank 0 of the collective calling + * this dat_collective_reduce() followed by dat_collective_scatterv(). + * The number of bytes received in the scatter for each rank is determined + * by rank offset into the recv_byte_counts array. + */ +#define dat_ib_collective_reduce_scatter(coll_handle, sendbuf, sendsize, recvbuf, recvsizes, op, type, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_REDUCE_SCATTER_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT *) (recvsizes), \ + IN (DAT_IB_COLLECTIVE_REDUCE_DATA_OP) (op), \ + IN (DAT_IB_COLLECTIVE_DATA_TYPE) (type), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call is used to perform a prefix reduction on data + * distributed across the group. The operation returns, in recv_buffer of + * the member with rank i, the reduction of the values in send_buffer of + * members with ranks 0,...,i (inclusive). The tables below show the + * result of a REDUCE_OP_SUM scan operation. 
+ */ +#define dat_ib_collective_scan(coll_handle, sendbuf, sendsize, recvbuf, recvsize, op, type, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_SCAN_OP, \ + IN (DAT_PVOID) (sendbuf), \ + IN (DAT_COUNT) (sendsize), \ + IN (DAT_PVOID) (recvbuf), \ + IN (DAT_COUNT) (recvsize), \ + IN (DAT_IB_COLLECTIVE_REDUCE_DATA_OP) (op), \ + IN (DAT_IB_COLLECTIVE_DATA_TYPE) (type), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call performs a broadcast send operation that transfers + * data specified by the buffer argument of the root into the buffer argument + * of all other Endpoints in the collective group specified by coll_handle. + * The operation is completed on the collective EVD unless completions are + * suppressed through the completion flags. All broadcasts are considered + * "in place" transfers. The tables below show the result of a broadcast + * operation. + */ +#define dat_ib_collective_broadcast(coll_handle, buf, size, root, user_context, flags) \ + dat_extension_op(\ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_BROADCAST_OP, \ + IN (DAT_PVOID) (buf), \ + IN (DAT_COUNT) (size), \ + IN (DAT_IB_COLLECTIVE_RANK) (root), \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + +/* + * This call will synchronize all endpoints of the collective + * group specified by coll_handle. This is an asynchronous call that + * will post a completion to the collective EVD when all endpoints + * have synchronized. 
+ */ +#define dat_ib_collective_barrier(coll_handle, user_context, flags) \ + dat_extension_op( \ + IN (DAT_IB_COLLECTIVE_HANDLE) (coll_handle), \ + IN (DAT_IB_OP) DAT_IB_COLLECTIVE_BARRIER_OP, \ + IN (DAT_CONTEXT) (user_context), \ + IN (DAT_COMPLETION_FLAGS) (flags)) + + +/* Backward compatibility */ +#define DAT_ATTR_COUNTERS DAT_IB_ATTR_COUNTERS +#define dat_query_counters dat_ib_query_counters +#define dat_print_counters dat_ib_print_counters +#define DAT_QUERY_COUNTERS_OP DAT_IB_QUERY_COUNTERS_OP +#define DAT_PRINT_COUNTERS_OP DAT_IB_PRINT_COUNTERS_OP #endif /* _DAT_IB_EXTENSIONS_H_ */ -- 2.46.0