From ea4aab2e16b4dee2b0e7afcce42efcfc256a5303 Mon Sep 17 00:00:00 2001
From: Sean Hefty
Date: Wed, 16 May 2012 15:49:45 -0700
Subject: [PATCH] commit

---
 meta               |   3 +-
 patches/rs-locking | 281 ---------------------------------------------
 2 files changed, 1 insertion(+), 283 deletions(-)
 delete mode 100644 patches/rs-locking

diff --git a/meta b/meta
index cf535385..1231564e 100644
--- a/meta
+++ b/meta
@@ -1,8 +1,7 @@
 Version: 1
-Previous: 4f4f74d9a95d23801c97fd3165269c923437a6f8
+Previous: 9f82c11aa5504370e77e44ff124162262089df13
 Head: 992a8c5abcb0199fd6214fd31b6695573ccb5bf5
 Applied:
-  rs-locking: ec6a8efe211b0dc98548443c2e0d67e2c355351f
   reuseaddr: 5ce7d9c48d082fd1959918e9134f4bdd85c402d9
   rstream-delay: 8f0437ec677c4752126de8667e093667dd681d56
   rstream-async-opt: 0ee1dfc9ad50f07fd3708ebb04cb92e0fb2f41d5
diff --git a/patches/rs-locking b/patches/rs-locking
deleted file mode 100644
index aa3a2ba5..00000000
--- a/patches/rs-locking
+++ /dev/null
@@ -1,281 +0,0 @@
-Bottom: de666c51520c9988ea3a07e332fa0402fdef6010
-Top: 3fcdf48082614bb29c44242534cfeecb47e07114
-Author: Sean Hefty
-Date: 2012-05-07 17:16:47 -0700
-
-rsockets: Optimize synchronization to improve performance
-
-Hotspot analysis using VTune showed pthread_mutex_unlock() as the
-most significant hotspot when transferring small messages using
-rstream.  To reduce the impact of using pthread mutexes, replace them
-with a custom lock built from an atomic variable and a semaphore.
-When there is no contention for the lock (the expected case for
-nonblocking sockets), the synchronization is reduced to incrementing
-and decrementing an atomic variable.
-
-A test that acquired and released a lock 2 billion times showed that
-the custom lock was roughly 20% faster than the mutex: 26.6 seconds
-versus 33.0 seconds.
-
-Unfortunately, further analysis showed that the custom lock provided
-only a minimal performance gain for rstream itself and simply moved
-the hotspot to the custom unlock call.  That hotspot is likely the
-result of some other interaction rather than slowness in releasing
-the lock.  However, we keep the custom lock based on the results of
-the direct locking tests.
-
-Signed-off-by: Sean Hefty
-
-
----
-
-diff --git a/configure.in b/configure.in
-index 238ee59..fa90fcb 100644
---- a/configure.in
-+++ b/configure.in
-@@ -48,6 +48,16 @@ AC_CHECK_MEMBER(struct ibv_path_record.service_id, [],
- 	AC_DEFINE(DEFINE_PATH_RECORD, 1, [adding path record definition]),
- 	[#include ])
- 
-+dnl Check for gcc atomic intrinsics
-+AC_MSG_CHECKING(compiler support for atomics)
-+AC_TRY_LINK([int i = 0;],
-+	[ return __sync_add_and_fetch(&i, 1) != __sync_sub_and_fetch(&i, 1); ],
-+	[ AC_MSG_RESULT(yes) ],
-+	[
-+		AC_MSG_RESULT(no)
-+		AC_DEFINE(DEFINE_ATOMICS, 1, [Set to 1 to implement atomics])
-+	])
-+
- dnl Checks for header files.
- AC_HEADER_STDC
- if test "$disable_libcheck" != "yes"; then
-diff --git a/src/cma.h b/src/cma.h
-index 91528c0..7703fe8 100644
---- a/src/cma.h
-+++ b/src/cma.h
-@@ -42,6 +42,7 @@
- #include 
- #include 
- #include 
-+#include <semaphore.h>
- 
- #include 
- 
-@@ -68,6 +69,41 @@ static inline uint64_t ntohll(uint64_t x) { return x; }
- 
- #define min(a, b) (a < b ? a : b)
- 
-+/*
-+ * Fast synchronization for low contention locking.
-+ */
-+#if DEFINE_ATOMICS
-+#define fastlock_t pthread_mutex_t
-+#define fastlock_init(lock) pthread_mutex_init(lock, NULL)
-+#define fastlock_destroy(lock) pthread_mutex_destroy(lock)
-+#define fastlock_acquire(lock) pthread_mutex_lock(lock)
-+#define fastlock_release(lock) pthread_mutex_unlock(lock)
-+#else
-+typedef struct {
-+	sem_t sem;
-+	volatile int cnt;
-+} fastlock_t;
-+static inline void fastlock_init(fastlock_t *lock)
-+{
-+	sem_init(&lock->sem, 0, 0);
-+	lock->cnt = 0;
-+}
-+static inline void fastlock_destroy(fastlock_t *lock)
-+{
-+	sem_destroy(&lock->sem);
-+}
-+static inline void fastlock_acquire(fastlock_t *lock)
-+{
-+	if (__sync_add_and_fetch(&lock->cnt, 1) > 1)
-+		sem_wait(&lock->sem);
-+}
-+static inline void fastlock_release(fastlock_t *lock)
-+{
-+	if (__sync_sub_and_fetch(&lock->cnt, 1) > 0)
-+		sem_post(&lock->sem);
-+}
-+#endif /* DEFINE_ATOMICS */
-+
- int ucma_complete(struct rdma_cm_id *id);
- static inline int ERR(int err)
- {
-diff --git a/src/rsocket.c b/src/rsocket.c
-index 775e9b0..2ffde9b 100644
---- a/src/rsocket.c
-+++ b/src/rsocket.c
-@@ -141,11 +141,10 @@ enum rs_state {
- 
- struct rsocket {
- 	struct rdma_cm_id *cm_id;
--	pthread_mutex_t	  slock;
--	pthread_mutex_t	  rlock;
--	pthread_mutex_t	  cq_lock;
--	pthread_cond_t	  cq_cond;
--	int		  cq_busy;
-+	fastlock_t	  slock;
-+	fastlock_t	  rlock;
-+	fastlock_t	  cq_lock;
-+	fastlock_t	  cq_wait_lock;
- 
- 	int opts;
- 	long fd_flags;
-@@ -225,10 +224,10 @@ static struct rsocket *rs_alloc(struct rsocket *inherited_rs)
- 	rs->index = -1;
- 	rs->sbuf_size = inherited_rs ? inherited_rs->sbuf_size : RS_BUF_SIZE;
- 	rs->rbuf_size = inherited_rs ? inherited_rs->rbuf_size : RS_BUF_SIZE;
--	pthread_mutex_init(&rs->slock, NULL);
--	pthread_mutex_init(&rs->rlock, NULL);
--	pthread_mutex_init(&rs->cq_lock, NULL);
--	pthread_cond_init(&rs->cq_cond, NULL);
-+	fastlock_init(&rs->slock);
-+	fastlock_init(&rs->rlock);
-+	fastlock_init(&rs->cq_lock);
-+	fastlock_init(&rs->cq_wait_lock);
- 	return rs;
- }
- 
-@@ -375,10 +374,10 @@ static void rs_free(struct rsocket *rs)
- 		rdma_destroy_id(rs->cm_id);
- 	}
- 
--	pthread_cond_destroy(&rs->cq_cond);
--	pthread_mutex_destroy(&rs->cq_lock);
--	pthread_mutex_destroy(&rs->rlock);
--	pthread_mutex_destroy(&rs->slock);
-+	fastlock_destroy(&rs->cq_wait_lock);
-+	fastlock_destroy(&rs->cq_lock);
-+	fastlock_destroy(&rs->rlock);
-+	fastlock_destroy(&rs->slock);
- 	free(rs);
- }
- 
-@@ -833,13 +832,6 @@ static int rs_get_cq_event(struct rsocket *rs)
- 	return ret;
- }
- 
--static void rs_signal_cq_waiters(struct rsocket *rs)
--{
--	pthread_mutex_lock(&rs->cq_lock);
--	pthread_cond_signal(&rs->cq_cond);
--	pthread_mutex_unlock(&rs->cq_lock);
--}
--
- /*
-  * Although we serialize rsend and rrecv calls with respect to themselves,
-  * both calls may run simultaneously and need to poll the CQ for completions.
-@@ -850,31 +842,15 @@ static void rs_signal_cq_waiters(struct rsocket *rs)
-  * which could be stalled until the remote process calls rrecv.  This should
-  * not block rrecv from receiving data from the remote side however.
-  *
-- * Perform a quick test before trying to acquire any locks.  Also note that
-- * busy may be set to 1 by another thread once it's been reset to 0.
-+ * We handle this by using two locks.  The cq_lock protects against polling
-+ * the CQ and processing completions.  The cq_wait_lock serializes access to
-+ * waiting on the CQ.
-  */
- static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
- {
- 	int ret;
- 
--	pthread_mutex_lock(&rs->cq_lock);
--	if (rs->cq_busy && nonblock) {
--		pthread_mutex_unlock(&rs->cq_lock);
--		return ERR(EWOULDBLOCK);
--	}
--
--	while (rs->cq_busy && !test(rs)) {
--		pthread_cond_wait(&rs->cq_cond, &rs->cq_lock);
--
--		if (test(rs)) {
--			pthread_mutex_unlock(&rs->cq_lock);
--			return 0;
--		}
--	}
--
--	rs->cq_busy = 1;
--	pthread_mutex_unlock(&rs->cq_lock);
--
-+	fastlock_acquire(&rs->cq_lock);
- 	do {
- 		rs_update_credits(rs);
- 		ret = rs_poll_cq(rs);
-@@ -890,14 +866,17 @@ static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rs
- 			rs->cq_armed = 1;
- 		} else {
- 			rs_update_credits(rs);
--			rs_signal_cq_waiters(rs);
-+			fastlock_acquire(&rs->cq_wait_lock);
-+			fastlock_release(&rs->cq_lock);
-+
- 			ret = rs_get_cq_event(rs);
-+			fastlock_release(&rs->cq_wait_lock);
-+			fastlock_acquire(&rs->cq_lock);
- 		}
- 	} while (!ret);
- 
- 	rs_update_credits(rs);
--	rs->cq_busy = 0;
--	rs_signal_cq_waiters(rs);
-+	fastlock_release(&rs->cq_lock);
- 	return ret;
- }
- 
-@@ -1002,7 +981,7 @@ ssize_t rrecv(int socket, void *buf, size_t len, int flags)
- 			return ret;
- 		}
- 	}
--	pthread_mutex_lock(&rs->rlock);
-+	fastlock_acquire(&rs->rlock);
- 	if (!rs_have_rdata(rs)) {
- 		ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_have_rdata);
- 		if (ret && errno != ECONNRESET)
-@@ -1040,7 +1019,7 @@ ssize_t rrecv(int socket, void *buf, size_t len, int flags)
- 	}
- 	rs->rbuf_bytes_avail += len - left;
- out:
--	pthread_mutex_unlock(&rs->rlock);
-+	fastlock_release(&rs->rlock);
- 	return ret ? ret : len - left;
- }
- 
-@@ -1105,7 +1084,7 @@ ssize_t rsend(int socket, const void *buf, size_t len, int flags)
- 		}
- 	}
- 
--	pthread_mutex_lock(&rs->slock);
-+	fastlock_acquire(&rs->slock);
- 	for (left = len; left; left -= xfer_size, buf += xfer_size) {
- 		if (!rs_can_send(rs)) {
- 			ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_can_send);
-@@ -1157,7 +1136,7 @@ ssize_t rsend(int socket, const void *buf, size_t len, int flags)
- 		if (ret)
- 			break;
- 	}
--	pthread_mutex_unlock(&rs->slock);
-+	fastlock_release(&rs->slock);
- 
- 	return (ret && left == len) ? ret : len - left;
- }
-@@ -1214,7 +1193,7 @@ static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags
- 	for (i = 1; i < iovcnt; i++)
- 		len += iov[i].iov_len;
- 
--	pthread_mutex_lock(&rs->slock);
-+	fastlock_acquire(&rs->slock);
- 	for (left = len; left; left -= xfer_size) {
- 		if (!rs_can_send(rs)) {
- 			ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_can_send);
-@@ -1262,7 +1241,7 @@ static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags
- 		if (ret)
- 			break;
- 	}
--	pthread_mutex_unlock(&rs->slock);
-+	fastlock_release(&rs->slock);
- 
- 	return (ret && left == len) ? ret : len - left;
- }
-- 
2.41.0
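
For readers without the original rs-locking patch applied, here is a minimal
standalone sketch of the fastlock scheme quoted above.  The type and the four
helpers mirror the cma.h hunk; the worker() demo, its iteration count, and
main() are illustrative additions.  It assumes GCC __sync builtins and POSIX
semaphores; build with "gcc -pthread fastlock_demo.c".

	/* fastlock sketch: an atomic counter arbitrates ownership; the
	 * semaphore is only touched when the lock is contended.  The
	 * struct and helpers mirror the cma.h code above; main() and
	 * worker() below are illustrative additions. */
	#include <pthread.h>
	#include <semaphore.h>
	#include <stdio.h>

	typedef struct {
		sem_t sem;		/* contended waiters sleep here */
		volatile int cnt;	/* threads holding or waiting for the lock */
	} fastlock_t;

	static inline void fastlock_init(fastlock_t *lock)
	{
		sem_init(&lock->sem, 0, 0);	/* no wakeups pending initially */
		lock->cnt = 0;
	}

	static inline void fastlock_destroy(fastlock_t *lock)
	{
		sem_destroy(&lock->sem);
	}

	static inline void fastlock_acquire(fastlock_t *lock)
	{
		/* First thread sees cnt go 0 -> 1 and owns the lock without
		 * a kernel call; later threads see cnt > 1 and block. */
		if (__sync_add_and_fetch(&lock->cnt, 1) > 1)
			sem_wait(&lock->sem);
	}

	static inline void fastlock_release(fastlock_t *lock)
	{
		/* If cnt is still positive after the decrement, another
		 * thread is waiting (or arriving); hand the lock over. */
		if (__sync_sub_and_fetch(&lock->cnt, 1) > 0)
			sem_post(&lock->sem);
	}

	/* Illustrative usage: two threads incrementing a shared counter. */
	static fastlock_t lock;
	static long counter;

	static void *worker(void *arg)
	{
		for (int i = 0; i < 1000000; i++) {
			fastlock_acquire(&lock);
			counter++;
			fastlock_release(&lock);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		fastlock_init(&lock);
		pthread_create(&t1, NULL, worker, NULL);
		pthread_create(&t2, NULL, worker, NULL);
		pthread_join(t1, NULL);
		pthread_join(t2, NULL);
		fastlock_destroy(&lock);
		printf("counter = %ld\n", counter);	/* expect 2000000 */
		return 0;
	}

This is the design point the commit message measures: an uncontended
acquire/release pair costs two atomic read-modify-writes and never enters the
kernel, while sem_wait()/sem_post() are reached only once cnt shows that a
second thread has arrived.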
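
The rs_process_cq() rework above also changes the waiting strategy: instead
of a cq_busy flag and condition variable, at most one thread blocks for a CQ
event under cq_wait_lock while cq_lock is dropped, so pollers are never
stalled behind a sleeper.  Below is a hedged sketch of that handoff pattern,
not the library's actual code: "fastlock.h" is a hypothetical header holding
the fastlock sketch above, and poll_work()/wait_for_event() are hypothetical
stand-ins for rs_poll_cq()/rs_get_cq_event().

	/* Two-lock handoff sketch for the rs_process_cq() rework above. */
	#include <errno.h>
	#include <stdio.h>
	#include "fastlock.h"	/* hypothetical: the fastlock sketch above */

	static fastlock_t cq_lock;	/* serializes polling/processing completions */
	static fastlock_t cq_wait_lock;	/* serializes blocking for a CQ event */
	static int pending = 3;		/* fake completion source for the stubs */

	static int poll_work(void)	/* stand-in for rs_poll_cq() */
	{
		return --pending <= 0;	/* "found a completion" once drained */
	}

	static int wait_for_event(void)	/* stand-in for rs_get_cq_event() */
	{
		return 0;		/* pretend the channel fired immediately */
	}

	static int process_cq(int nonblock)
	{
		int ret = 0;

		fastlock_acquire(&cq_lock);
		do {
			if (poll_work())
				break;			/* completions consumed */
			if (nonblock) {
				errno = EWOULDBLOCK;	/* nothing ready; don't sleep */
				ret = -1;
				break;
			}
			/* Take cq_wait_lock *before* dropping cq_lock: at most
			 * one thread sleeps on the event channel, and others
			 * can keep polling the CQ while it does. */
			fastlock_acquire(&cq_wait_lock);
			fastlock_release(&cq_lock);

			ret = wait_for_event();		/* block for the CQ event */

			fastlock_release(&cq_wait_lock);
			fastlock_acquire(&cq_lock);
		} while (!ret);
		fastlock_release(&cq_lock);
		return ret;
	}

	int main(void)
	{
		fastlock_init(&cq_lock);
		fastlock_init(&cq_wait_lock);
		printf("process_cq -> %d\n", process_cq(0));	/* expect 0 */
		fastlock_destroy(&cq_wait_lock);
		fastlock_destroy(&cq_lock);
		return 0;
	}

Acquiring cq_wait_lock before releasing cq_lock is the key ordering in the
patch: the blocking call is never made while holding the polling lock, yet no
window opens in which two threads could both decide to sleep.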