From: James Lentini Date: Thu, 13 Oct 2005 20:45:22 +0000 (+0000) Subject: r3774: Fix the async error handling and callback mappings. X-Git-Tag: libdapl-1.2.1~70 X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=42a64ec2ec1d8ec71492bfebba077b006684ce97;p=~ardavis%2Fdapl.git r3774: Fix the async error handling and callback mappings. Updated TODO list. Signed-off by: Arlin Davis Signed-off by: James Lentini --- diff --git a/dapl/openib/TODO b/dapl/openib/TODO index ef775e3..1505861 100644 --- a/dapl/openib/TODO +++ b/dapl/openib/TODO @@ -1,12 +1,10 @@ IB Verbs: - CQ resize -- mulitple CQ event support - memory window support DAPL: - reinit EP needs a QP timewait completion notification -- direct cq_wait_object when multi-CQ verbs event support arrives - shared receive queue support Under discussion: diff --git a/dapl/openib/dapl_ib_util.c b/dapl/openib/dapl_ib_util.c index 6561830..f7ed6ed 100644 --- a/dapl/openib/dapl_ib_util.c +++ b/dapl/openib/dapl_ib_util.c @@ -214,8 +214,11 @@ DAT_RETURN dapls_ib_open_hca ( /* Get list of all IB devices, find match, open */ dev_list = ibv_get_devices(); dlist_start(dev_list); - dlist_for_each_data(dev_list,hca_ptr->ib_trans.ib_dev,struct ibv_device) { - if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev),hca_name)) + dlist_for_each_data(dev_list, + hca_ptr->ib_trans.ib_dev, + struct ibv_device) { + if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev), + hca_name)) break; } @@ -226,20 +229,22 @@ DAT_RETURN dapls_ib_open_hca ( return DAT_INTERNAL_ERROR; } - dapl_dbg_log (DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n", - ibv_get_device_name(hca_ptr->ib_trans.ib_dev), - (unsigned long long)bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev))); + dapl_dbg_log ( + DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n", + ibv_get_device_name(hca_ptr->ib_trans.ib_dev), + (unsigned long long) + bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev))); hca_ptr->ib_hca_handle = ibv_open_device(hca_ptr->ib_trans.ib_dev); if (!hca_ptr->ib_hca_handle) { dapl_dbg_log (DAPL_DBG_TYPE_ERR, " open_hca: IB dev open failed for %s\n", - ibv_get_device_name(hca_ptr->ib_trans.ib_dev) ); + ibv_get_device_name(hca_ptr->ib_trans.ib_dev)); return DAT_INTERNAL_ERROR; } hca_ptr->ib_trans.ib_ctx = hca_ptr->ib_hca_handle; - /* set inline max with enviromment or default, get local lid and gid 0 */ + /* set inline max with env or default, get local lid and gid 0 */ hca_ptr->ib_trans.max_inline_send = dapl_os_get_env_val("DAPL_MAX_INLINE", INLINE_SEND_DEFAULT); @@ -253,15 +258,17 @@ DAT_RETURN dapls_ib_open_hca ( } dapl_dbg_log(DAPL_DBG_TYPE_UTIL, - " open_hca: GID subnet %016llx id %016llx\n", - (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix), - (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) ); + " open_hca: GID subnet %016llx id %016llx\n", + (unsigned long long) + bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix), + (unsigned long long) + bswap_64(hca_ptr->ib_trans.gid.global.interface_id)); /* get the IP address of the device using GID */ if (dapli_get_hca_addr(hca_ptr)) { dapl_dbg_log (DAPL_DBG_TYPE_ERR, " open_hca: ERR ib_at_ips_by_gid for %s\n", - ibv_get_device_name(hca_ptr->ib_trans.ib_dev) ); + ibv_get_device_name(hca_ptr->ib_trans.ib_dev)); goto bail; } @@ -310,15 +317,23 @@ DAT_RETURN dapls_ib_open_hca ( write(g_ib_pipe[1], "w", sizeof "w"); dapl_os_unlock(&g_hca_lock); - dapl_dbg_log (DAPL_DBG_TYPE_UTIL, - " open_hca: %s, port %d, %s %d.%d.%d.%d INLINE_MAX=%d\n", - ibv_get_device_name(hca_ptr->ib_trans.ib_dev), hca_ptr->port_num, - ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_family == AF_INET ? "AF_INET":"AF_INET6", - ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff, - ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff, - ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff, - ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff, - hca_ptr->ib_trans.max_inline_send ); + dapl_dbg_log ( + DAPL_DBG_TYPE_UTIL, + " open_hca: %s, port %d, %s %d.%d.%d.%d INLINE_MAX=%d\n", + ibv_get_device_name(hca_ptr->ib_trans.ib_dev), + hca_ptr->port_num, + ((struct sockaddr_in *) + &hca_ptr->hca_address)->sin_family == AF_INET ? + "AF_INET":"AF_INET6", + ((struct sockaddr_in *) + &hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff, + ((struct sockaddr_in *) + &hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff, + ((struct sockaddr_in *) + &hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff, + ((struct sockaddr_in *) + &hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff, + hca_ptr->ib_trans.max_inline_send ); hca_ptr->ib_trans.d_hca = hca_ptr; return DAT_SUCCESS; @@ -370,7 +385,7 @@ DAT_RETURN dapls_ib_close_hca ( IN DAPL_HCA *hca_ptr ) sleep.tv_sec = 0; sleep.tv_nsec = 10000000; /* 10 ms */ dapl_dbg_log(DAPL_DBG_TYPE_UTIL, - " ib_thread_destroy: waiting on hca %p destroy\n"); + " ib_thread_destroy: wait on hca %p destroy\n"); nanosleep (&sleep, &remain); } return (DAT_SUCCESS); @@ -425,19 +440,26 @@ DAT_RETURN dapls_ib_query_hca ( if (ia_attr != NULL) { ia_attr->adapter_name[DAT_NAME_MAX_LENGTH - 1] = '\0'; ia_attr->vendor_name[DAT_NAME_MAX_LENGTH - 1] = '\0'; - ia_attr->ia_address_ptr = (DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address; + ia_attr->ia_address_ptr = + (DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address; dapl_dbg_log (DAPL_DBG_TYPE_UTIL, " query_hca: %s %s %d.%d.%d.%d\n", ibv_get_device_name(hca_ptr->ib_trans.ib_dev), - ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_family == AF_INET ? "AF_INET":"AF_INET6", - ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff, - ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff, - ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff, - ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff ); + ((struct sockaddr_in *) + ia_attr->ia_address_ptr)->sin_family == AF_INET ? + "AF_INET":"AF_INET6", + ((struct sockaddr_in *) + ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff, + ((struct sockaddr_in *) + ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff, + ((struct sockaddr_in *) + ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff, + ((struct sockaddr_in *) + ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff); ia_attr->hardware_version_major = dev_attr.hw_ver; - ia_attr->hardware_version_minor = dev_attr.fw_ver; + /* ia_attr->hardware_version_minor = dev_attr.fw_ver; */ ia_attr->max_eps = dev_attr.max_qp; ia_attr->max_dto_per_ep = dev_attr.max_qp_wr; ia_attr->max_rdma_read_per_ep = dev_attr.max_qp_rd_atom; @@ -468,7 +490,6 @@ DAT_RETURN dapls_ib_query_hca ( ia_attr->max_mtu_size, ia_attr->max_rdma_size, ia_attr->max_iov_segments_per_dto, ia_attr->max_lmrs, ia_attr->max_rmrs ); - } if (ep_attr != NULL) { @@ -522,27 +543,28 @@ DAT_RETURN dapls_ib_setup_async_callback ( ib_hca_transport_t *hca_ptr; dapl_dbg_log (DAPL_DBG_TYPE_UTIL, - " setup_async_cb: ia %p type %d handle %p cb %p ctx %p\n", + " setup_async_cb: ia %p type %d hdl %p cb %p ctx %p\n", ia_ptr, handler_type, evd_ptr, callback, context); hca_ptr = &ia_ptr->hca_ptr->ib_trans; switch(handler_type) { case DAPL_ASYNC_UNAFILIATED: - hca_ptr->async_unafiliated = callback; + hca_ptr->async_unafiliated = + (ib_async_handler_t)callback; hca_ptr->async_un_ctx = context; break; case DAPL_ASYNC_CQ_ERROR: - hca_ptr->async_cq_error = callback; - hca_ptr->async_cq_ctx = context; + hca_ptr->async_cq_error = + (ib_async_cq_handler_t)callback; break; case DAPL_ASYNC_CQ_COMPLETION: - hca_ptr->async_cq = callback; - hca_ptr->async_ctx = context; + hca_ptr->async_cq = + (ib_async_dto_handler_t)callback; break; case DAPL_ASYNC_QP_ERROR: - hca_ptr->async_qp_error = callback; - hca_ptr->async_qp_ctx = context; + hca_ptr->async_qp_error = + (ib_async_qp_handler_t)callback; break; default: break; @@ -573,7 +595,6 @@ void dapli_ib_thread_destroy(void) int retries = 10; dapl_dbg_log(DAPL_DBG_TYPE_UTIL, " ib_thread_destroy(%d)\n", getpid()); - /* * wait for async thread to terminate. * pthread_join would be the correct method @@ -623,34 +644,42 @@ void dapli_async_event_cb(struct _ib_hca_transport *hca) case IBV_EVENT_CQ_ERR: { - dapl_dbg_log(DAPL_DBG_TYPE_WARN, - " dapli_async_event CQ ERR %d\n", - event.event_type); + struct dapl_ep *evd_ptr = + event.element.cq->cq_context; + + dapl_dbg_log( + DAPL_DBG_TYPE_WARN, + " dapli_async_event CQ (%p) ERR %d\n", + evd_ptr, event.event_type); /* report up if async callback still setup */ if (hca->async_cq_error) hca->async_cq_error(hca->ib_ctx, + event.element.cq, &event, - hca->async_cq_ctx); + (void*)evd_ptr); break; } case IBV_EVENT_COMM_EST: { - /* Received messages on connected QP before RTU */ - struct dapl_ep *ep_ptr = event.element.qp->qp_context; + /* Received msgs on connected QP before RTU */ + struct dapl_ep *ep_ptr = + event.element.qp->qp_context; /* TODO: cannot process COMM_EST until ibv * guarantees valid QP context for events. * Race conditions exist with QP destroy call. * For now, assume the RTU will arrive. */ - dapl_dbg_log(DAPL_DBG_TYPE_UTIL, - " dapli_async_event COMM_EST (qp=%p)\n", - event.element.qp); + dapl_dbg_log( + DAPL_DBG_TYPE_UTIL, + " dapli_async_event COMM_EST(qp=%p)\n", + event.element.qp); if (!DAPL_BAD_HANDLE(ep_ptr, DAPL_MAGIC_EP) && ep_ptr->cm_handle != IB_INVALID_HANDLE) - ib_cm_establish(ep_ptr->cm_handle->cm_id); + ib_cm_establish( + ep_ptr->cm_handle->cm_id); break; } @@ -662,15 +691,20 @@ void dapli_async_event_cb(struct _ib_hca_transport *hca) case IBV_EVENT_SRQ_LIMIT_REACHED: case IBV_EVENT_SQ_DRAINED: { - dapl_dbg_log(DAPL_DBG_TYPE_WARN, - " dapli_async_event QP ERR %d\n", - event.event_type); + struct dapl_ep *ep_ptr = + event.element.qp->qp_context; + + dapl_dbg_log( + DAPL_DBG_TYPE_WARN, + " dapli_async_event QP (%p) ERR %d\n", + ep_ptr, event.event_type); /* report up if async callback still setup */ if (hca->async_qp_error) hca->async_qp_error(hca->ib_ctx, + event.element.qp, &event, - hca->async_qp_ctx); + (void*)ep_ptr); break; } case IBV_EVENT_PATH_MIG: diff --git a/dapl/openib/dapl_ib_util.h b/dapl/openib/dapl_ib_util.h index a692bb0..20f4968 100644 --- a/dapl/openib/dapl_ib_util.h +++ b/dapl/openib/dapl_ib_util.h @@ -141,7 +141,7 @@ typedef enum ibv_send_flags ib_send_op_type_t; typedef struct ibv_sge ib_data_segment_t; typedef enum ibv_qp_state ib_qp_state_t; typedef enum ibv_event_type ib_async_event_type; -typedef struct ibv_async_event ib_error_record_t; +typedef struct ibv_async_event ib_error_record_t; /* CQ notifications */ typedef enum @@ -222,12 +222,30 @@ typedef struct ibv_comp_channel *ib_wait_obj_handle_t; * ibv_post_recv - Return 0, -1 & bad_wr */ -/* async handler for CQ, QP, and unafiliated */ +/* async handler for DTO, CQ, QP, and unafiliated */ +typedef void (*ib_async_dto_handler_t)( + IN ib_hca_handle_t ib_hca_handle, + IN ib_error_record_t *err_code, + IN void *context); + +typedef void (*ib_async_cq_handler_t)( + IN ib_hca_handle_t ib_hca_handle, + IN ib_cq_handle_t ib_cq_handle, + IN ib_error_record_t *err_code, + IN void *context); + +typedef void (*ib_async_qp_handler_t)( + IN ib_hca_handle_t ib_hca_handle, + IN ib_qp_handle_t ib_qp_handle, + IN ib_error_record_t *err_code, + IN void *context); + typedef void (*ib_async_handler_t)( IN ib_hca_handle_t ib_hca_handle, IN ib_error_record_t *err_code, IN void *context); + /* ib_hca_transport_t, specific to this implementation */ typedef struct _ib_hca_transport { @@ -244,12 +262,9 @@ typedef struct _ib_hca_transport union ibv_gid gid; ib_async_handler_t async_unafiliated; void *async_un_ctx; - ib_async_handler_t async_cq_error; - void *async_ctx; - ib_async_handler_t async_cq; - void *async_cq_ctx; - ib_async_handler_t async_qp_error; - void *async_qp_ctx; + ib_async_cq_handler_t async_cq_error; + ib_async_dto_handler_t async_cq; + ib_async_qp_handler_t async_qp_error; } ib_hca_transport_t;