From 0ad340a2b1c61205a7f498fe936560b4a41f1bc9 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Fri, 17 Aug 2012 22:30:06 -0700 Subject: [PATCH] There is a race condition between dapl_rbuf_add() dapl_rbuf_remove() when the ring buffer is empty or full. This could lead to incorrect item be dequeued and correct item be discarded. A temporary workaround is provided to ensure the pending event queue work correctly. Without this fix dapl_evd_dequeue() could return the wrong event and causes various application errors (hanging, assertion failures. etc). Ultimately the ring buffer mechanism needs to the fixed. --- dapl/common/dapl_evd_dequeue.c | 2 ++ dapl/common/dapl_evd_util.c | 5 +++++ dapl/common/dapl_ring_buffer_util.c | 4 ++-- dapl/include/dapl.h | 1 + dapl/udapl/dapl_evd_wait.c | 2 ++ 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dapl/common/dapl_evd_dequeue.c b/dapl/common/dapl_evd_dequeue.c index f31e497..c25494e 100644 --- a/dapl/common/dapl_evd_dequeue.c +++ b/dapl/common/dapl_evd_dequeue.c @@ -106,8 +106,10 @@ DAT_RETURN DAT_API dapl_evd_dequeue(IN DAT_EVD_HANDLE evd_handle, * This keeps events in order if dat_evd_wait() has copied events * from CQ to EVD. */ + dapl_os_lock(&evd_ptr->pending_event_queue_lock); /* temporary workaround for the ring buffer race condition */ local_event = (DAT_EVENT *) dapls_rbuf_remove(&evd_ptr->pending_event_queue); + dapl_os_unlock(&evd_ptr->pending_event_queue_lock); /* temporary workaround for the ring buffer race condition */ if (local_event != NULL) { *event = *local_event; dat_status = dapls_rbuf_add(&evd_ptr->free_event_queue, diff --git a/dapl/common/dapl_evd_util.c b/dapl/common/dapl_evd_util.c index c41c233..1315063 100644 --- a/dapl/common/dapl_evd_util.c +++ b/dapl/common/dapl_evd_util.c @@ -363,6 +363,9 @@ DAT_RETURN dapli_evd_event_alloc(IN DAPL_EVD * evd_ptr, IN DAT_COUNT qlen) event_ptr++; } + /* temporary workaround for the ring buffer race condition */ + dapl_os_lock_init(&evd_ptr->pending_event_queue_lock); + evd_ptr->cq_notified = DAT_FALSE; evd_ptr->cq_notified_when = 0; evd_ptr->threshold = 0; @@ -599,8 +602,10 @@ dapli_evd_post_event(IN DAPL_EVD * evd_ptr, IN const DAT_EVENT * event_ptr) __FUNCTION__, dapl_event_str(event_ptr->event_number), evd_ptr, evd_ptr->evd_state); + dapl_os_lock(&evd_ptr->pending_event_queue_lock); /* temporary work around for the ring buffer race condition */ dat_status = dapls_rbuf_add(&evd_ptr->pending_event_queue, (void *)event_ptr); + dapl_os_unlock(&evd_ptr->pending_event_queue_lock); /* temporary work around for the ring buffer race condition */ dapl_os_assert(dat_status == DAT_SUCCESS); dapl_os_assert(evd_ptr->evd_state == DAPL_EVD_STATE_WAITED diff --git a/dapl/common/dapl_ring_buffer_util.c b/dapl/common/dapl_ring_buffer_util.c index 54517a9..53e6426 100644 --- a/dapl/common/dapl_ring_buffer_util.c +++ b/dapl/common/dapl_ring_buffer_util.c @@ -199,7 +199,7 @@ DAT_RETURN dapls_rbuf_add(IN DAPL_RING_BUFFER * rbuf, IN void *entry) val = dapl_os_atomic_assign(&rbuf->head, pos, pos + 1); if (val == pos) { pos = (pos + 1) & rbuf->lim; /* verify in range */ - rbuf->base[pos] = entry; + rbuf->base[pos] = entry; /* FIXME: there a race condition between this line and the line in dapls_rbuf_remove */ return DAT_SUCCESS; } } @@ -236,7 +236,7 @@ void *dapls_rbuf_remove(IN DAPL_RING_BUFFER * rbuf) if (val == pos) { pos = (pos + 1) & rbuf->lim; /* verify in range */ - return (rbuf->base[pos]); + return (rbuf->base[pos]); /* FIXME: there a race condition between this line and the line in dapls_rbuf_add */ } } diff --git a/dapl/include/dapl.h b/dapl/include/dapl.h index 634d8ef..26bf5d8 100755 --- a/dapl/include/dapl.h +++ b/dapl/include/dapl.h @@ -365,6 +365,7 @@ struct dapl_evd DAT_EVENT *events; DAPL_RING_BUFFER free_event_queue; DAPL_RING_BUFFER pending_event_queue; + DAPL_OS_LOCK pending_event_queue_lock; /* temporary workaround for the ring buffer race condition */ /* CQ Completions are not placed into 'deferred_events' ** rather they are simply left on the Completion Queue diff --git a/dapl/udapl/dapl_evd_wait.c b/dapl/udapl/dapl_evd_wait.c index 33cec50..9380223 100644 --- a/dapl/udapl/dapl_evd_wait.c +++ b/dapl/udapl/dapl_evd_wait.c @@ -248,7 +248,9 @@ DAT_RETURN DAT_API dapl_evd_wait(IN DAT_EVD_HANDLE evd_handle, evd_ptr->evd_state = DAPL_EVD_STATE_OPEN; if (dat_status == DAT_SUCCESS) { + dapl_os_lock(&evd_ptr->pending_event_queue_lock); /* temporary work around for the ring buffer race condition */ local_event = dapls_rbuf_remove(&evd_ptr->pending_event_queue); + dapl_os_unlock(&evd_ptr->pending_event_queue_lock); /* temporary work around for the ring buffer race condition */ *event = *local_event; dapls_rbuf_add(&evd_ptr->free_event_queue, local_event); } -- 2.46.0