From c1014564205fb4a1be9c9483df09155e311be769 Mon Sep 17 00:00:00 2001
From: Sean Hefty
Date: Fri, 1 Feb 2013 17:17:34 -0800
Subject: [PATCH] librdmacm: Work-around kernel bug returning uid = 0

Older kernels have a bug where it can report an event with the uid
set to 0.  The librdmacm crashes when casting the uid to an rdma_cm_id
and dereferencing the NULL pointer.  There are a limited number of
events where this can occur and in most cases it's safe to simply
discard the event.  (This is what the kernel does anyway.)

However, it's possible for us to process an RDMA_CM_EVENT_ESTABLISHED
event with the uid set to 0.  (See kernel commit
418edaaba96e58112b15c82b4907084e2a9caf42.)  Although it's rare for
this to occur, it does in fact happen in practice.

To work-around the kernel bug, when the uid of an established event is
set to 0, we first try to locate the correct user space id based on
related data before discarding the event.

Signed-off-by: Sean Hefty
---
 src/cma.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/src/cma.c b/src/cma.c
index ff9b426c..eb59ea87 100755
--- a/src/cma.c
+++ b/src/cma.c
@@ -50,6 +50,7 @@
 #include
 
 #include "cma.h"
+#include "indexer.h"
 #include
 #include
 #include
@@ -123,6 +124,7 @@ static int cma_dev_cnt;
 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
 int af_ib_support;
+static struct index_map id_idm;
 
 static void ucma_cleanup(void)
 {
@@ -376,8 +378,37 @@ static void ucma_put_device(struct cma_device *cma_dev)
 	pthread_mutex_unlock(&mut);
 }
 
+/*
+ * Record id_priv under its kernel handle so an event that arrives with
+ * uid 0 can still be matched to it.  Handles above IDX_MAX_INDEX are
+ * left untracked.
+ */
+static void ucma_insert_id(struct cma_id_private *id_priv)
+{
+	if (id_priv->handle > IDX_MAX_INDEX)
+		return;
+
+	pthread_mutex_lock(&mut);
+	idm_set(&id_idm, id_priv->handle, id_priv);
+	pthread_mutex_unlock(&mut);
+}
+
+/* Clear the map entry for id_priv's handle when it is in indexable range. */
+static void ucma_remove_id(struct cma_id_private *id_priv)
+{
+	if (id_priv->handle <= IDX_MAX_INDEX)
+		idm_clear(&id_idm, id_priv->handle);
+}
+
+/* Look up the id recorded for a kernel handle; callers check for NULL. */
+static struct cma_id_private *ucma_lookup_id(int handle)
+{
+	return idm_lookup(&id_idm, handle);
+}
+
 static void ucma_free_id(struct cma_id_private *id_priv)
 {
+	ucma_remove_id(id_priv);
 	if (id_priv->cma_dev)
 		ucma_put_device(id_priv->cma_dev);
 	pthread_cond_destroy(&id_priv->cond);
@@ -406,6 +437,7 @@ static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
 	id_priv->id.context = context;
 	id_priv->id.ps = ps;
 	id_priv->id.qp_type = qp_type;
+	id_priv->handle = 0xFFFFFFFF;
 
 	if (!channel) {
 		id_priv->id.channel = rdma_create_event_channel();
@@ -455,6 +487,7 @@ static int rdma_create_id2(struct rdma_event_channel *channel,
 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
 
 	id_priv->handle = resp.id;
+	ucma_insert_id(id_priv);
 	*id = &id_priv->id;
 	return 0;
 
@@ -1785,6 +1818,7 @@ static int ucma_process_conn_req(struct cma_event *evt,
 	evt->event.listen_id = &evt->id_priv->id;
 	evt->event.id = &id_priv->id;
 	id_priv->handle = handle;
+	ucma_insert_id(id_priv);
 	id_priv->initiator_depth = evt->event.param.conn.initiator_depth;
 	id_priv->responder_resources = evt->event.param.conn.responder_resources;
 
@@ -1916,7 +1950,28 @@ retry:
 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
 
 	evt->event.event = resp.event;
-	evt->id_priv = (void *) (uintptr_t) resp.uid;
+	/*
+	 * We should have a non-zero uid, except for connection requests.
+	 * But a bug in older kernels can report a uid 0.  Work-around this
+	 * issue by looking up the cma_id based on the kernel's id when the
+	 * uid is 0 and we're processing a connection established event.
+	 * In all other cases, if the uid is 0, we discard the event, like
+	 * the kernel should have done.
+	 */
+	if (resp.uid) {
+		evt->id_priv = (void *) (uintptr_t) resp.uid;
+	} else {
+		evt->id_priv = ucma_lookup_id(resp.id);
+		if (!evt->id_priv) {
+			fprintf(stderr, PFX "Warning: discarding unmatched "
+				"event - rdma_destroy_id may hang.\n");
+			goto retry;
+		}
+		if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
+			ucma_complete_event(evt->id_priv);
+			goto retry;
+		}
+	}
 	evt->event.id = &evt->id_priv->id;
 	evt->event.status = resp.status;
 
-- 
2.41.0