]> git.openfabrics.org - ~shefty/librdmacm.git/commitdiff
commit
authorSean Hefty <sean.hefty@intel.com>
Wed, 16 May 2012 22:49:45 +0000 (15:49 -0700)
committerSean Hefty <sean.hefty@intel.com>
Wed, 16 May 2012 22:49:45 +0000 (15:49 -0700)
meta
patches/rs-locking [deleted file]

diff --git a/meta b/meta
index cf5353859ffcbc232b9f55f057ac5b7ebeee58ec..1231564e160ae7768d15cd3ddabc96595ea795df 100644 (file)
--- a/meta
+++ b/meta
@@ -1,8 +1,7 @@
 Version: 1
-Previous: 4f4f74d9a95d23801c97fd3165269c923437a6f8
+Previous: 9f82c11aa5504370e77e44ff124162262089df13
 Head: 992a8c5abcb0199fd6214fd31b6695573ccb5bf5
 Applied:
-  rs-locking: ec6a8efe211b0dc98548443c2e0d67e2c355351f
   reuseaddr: 5ce7d9c48d082fd1959918e9134f4bdd85c402d9
   rstream-delay: 8f0437ec677c4752126de8667e093667dd681d56
   rstream-async-opt: 0ee1dfc9ad50f07fd3708ebb04cb92e0fb2f41d5
diff --git a/patches/rs-locking b/patches/rs-locking
deleted file mode 100644 (file)
index aa3a2ba..0000000
+++ /dev/null
@@ -1,281 +0,0 @@
-Bottom: de666c51520c9988ea3a07e332fa0402fdef6010
-Top:    3fcdf48082614bb29c44242534cfeecb47e07114
-Author: Sean Hefty <sean.hefty@intel.com>
-Date:   2012-05-07 17:16:47 -0700
-
-rsockets: Optimize synchronization to improve performance
-
-Hotspot performance analysis using VTune showed pthread_mutex_unlock()
-as the most significant hotspot when transferring small messages using
-rstream.  To reduce the impact of using pthread mutexes, replace it
-with a custom lock built using an atomic variable and a semaphore.
-When there's no contention for the lock (which is the expected case
-for nonblocking sockets), the synchronization is reduced to
-incrementing and decrementing an atomic variable.
-
-A test that acquired and released a lock 2 billion times reported that
-the custom lock was roughly 20% faster than using the mutex.
-26.6 seconds versus 33.0 seconds.
-
-Unfortunately, further analysis showed that using the custom lock
-provided a minimal performance gain on rstream itself, and simply
-moved the hotspot to the custom unlock call.  The hotspot is likely
-a result of some other interaction, rather than caused by slowness
-in releasing a lock.  However, we keep the custom lock based on
-the results of the direct lock tests that were done.
-
-Signed-off-by: Sean Hefty <sean.hefty@intel.com>
-
-
----
-
-diff --git a/configure.in b/configure.in
-index 238ee59..fa90fcb 100644
---- a/configure.in
-+++ b/configure.in
-@@ -48,6 +48,16 @@ AC_CHECK_MEMBER(struct ibv_path_record.service_id, [],
-     AC_DEFINE(DEFINE_PATH_RECORD, 1, [adding path record definition]),
-     [#include <infiniband/sa.h>])
-+dnl Check for gcc atomic intrinsics
-+AC_MSG_CHECKING(compiler support for atomics)
-+AC_TRY_LINK([int i = 0;],
-+    [ return __sync_add_and_fetch(&i, 1) != __sync_sub_and_fetch(&i, 1); ],
-+    [ AC_MSG_RESULT(yes) ],
-+    [
-+        AC_MSG_RESULT(no)
-+        AC_DEFINE(DEFINE_ATOMICS, 1, [Set to 1 to implement atomics])
-+    ])
-+
- dnl Checks for header files.
- AC_HEADER_STDC
- if test "$disable_libcheck" != "yes"; then
-diff --git a/src/cma.h b/src/cma.h
-index 91528c0..7703fe8 100644
---- a/src/cma.h
-+++ b/src/cma.h
-@@ -42,6 +42,7 @@
- #include <errno.h>
- #include <endian.h>
- #include <byteswap.h>
-+#include <semaphore.h>
- #include <rdma/rdma_cma.h>
-@@ -68,6 +69,41 @@ static inline uint64_t ntohll(uint64_t x) { return x; }
- #define min(a, b) (a < b ? a : b)
-+/*
-+ * Fast synchronization for low contention locking.
-+ */
-+#if DEFINE_ATOMICS
-+#define fastlock_t pthread_mutex_t
-+#define fastlock_init(lock) pthread_mutex_init(lock, NULL)
-+#define fastlock_destroy(lock) pthread_mutex_destroy(lock)
-+#define fastlock_acquire(lock) pthread_mutex_lock(lock)
-+#define fastlock_release(lock) pthread_mutex_unlock(lock)
-+#else
-+typedef struct {
-+      sem_t sem;
-+      volatile int cnt;
-+} fastlock_t;
-+static inline void fastlock_init(fastlock_t *lock)
-+{
-+      sem_init(&lock->sem, 0, 0);
-+      lock->cnt = 0;
-+}
-+static inline void fastlock_destroy(fastlock_t *lock)
-+{
-+      sem_destroy(&lock->sem);
-+}
-+static inline void fastlock_acquire(fastlock_t *lock)
-+{
-+      if (__sync_add_and_fetch(&lock->cnt, 1) > 1)
-+              sem_wait(&lock->sem);
-+}
-+static inline void fastlock_release(fastlock_t *lock)
-+{
-+      if (__sync_sub_and_fetch(&lock->cnt, 1) > 0)
-+              sem_post(&lock->sem);
-+}
-+#endif /* DEFINE_ATOMICS */
-+
- int ucma_complete(struct rdma_cm_id *id);
- static inline int ERR(int err)
- {
-diff --git a/src/rsocket.c b/src/rsocket.c
-index 775e9b0..2ffde9b 100644
---- a/src/rsocket.c
-+++ b/src/rsocket.c
-@@ -141,11 +141,10 @@ enum rs_state {
- struct rsocket {
-       struct rdma_cm_id *cm_id;
--      pthread_mutex_t   slock;
--      pthread_mutex_t   rlock;
--      pthread_mutex_t   cq_lock;
--      pthread_cond_t    cq_cond;
--      int               cq_busy;
-+      fastlock_t        slock;
-+      fastlock_t        rlock;
-+      fastlock_t        cq_lock;
-+      fastlock_t        cq_wait_lock;
-       int               opts;
-       long              fd_flags;
-@@ -225,10 +224,10 @@ static struct rsocket *rs_alloc(struct rsocket *inherited_rs)
-       rs->index = -1;
-       rs->sbuf_size = inherited_rs ? inherited_rs->sbuf_size : RS_BUF_SIZE;
-       rs->rbuf_size = inherited_rs ? inherited_rs->rbuf_size : RS_BUF_SIZE;
--      pthread_mutex_init(&rs->slock, NULL);
--      pthread_mutex_init(&rs->rlock, NULL);
--      pthread_mutex_init(&rs->cq_lock, NULL);
--      pthread_cond_init(&rs->cq_cond, NULL);
-+      fastlock_init(&rs->slock);
-+      fastlock_init(&rs->rlock);
-+      fastlock_init(&rs->cq_lock);
-+      fastlock_init(&rs->cq_wait_lock);
-       return rs;
- }
-@@ -375,10 +374,10 @@ static void rs_free(struct rsocket *rs)
-               rdma_destroy_id(rs->cm_id);
-       }
--      pthread_cond_destroy(&rs->cq_cond);
--      pthread_mutex_destroy(&rs->cq_lock);
--      pthread_mutex_destroy(&rs->rlock);
--      pthread_mutex_destroy(&rs->slock);
-+      fastlock_destroy(&rs->cq_wait_lock);
-+      fastlock_destroy(&rs->cq_lock);
-+      fastlock_destroy(&rs->rlock);
-+      fastlock_destroy(&rs->slock);
-       free(rs);
- }
-@@ -833,13 +832,6 @@ static int rs_get_cq_event(struct rsocket *rs)
-       return ret;
- }
--static void rs_signal_cq_waiters(struct rsocket *rs)
--{
--      pthread_mutex_lock(&rs->cq_lock);
--      pthread_cond_signal(&rs->cq_cond);
--      pthread_mutex_unlock(&rs->cq_lock);
--}
--
- /*
-  * Although we serialize rsend and rrecv calls with respect to themselves,
-  * both calls may run simultaneously and need to poll the CQ for completions.
-@@ -850,31 +842,15 @@ static void rs_signal_cq_waiters(struct rsocket *rs)
-  * which could be stalled until the remote process calls rrecv.  This should
-  * not block rrecv from receiving data from the remote side however.
-  *
-- * Perform a quick test before trying to acquire any locks.  Also note that
-- * busy may be set to 1 by another thread once it's been reset to 0.
-+ * We handle this by using two locks.  The cq_lock protects against polling
-+ * the CQ and processing completions.  The cq_wait_lock serializes access to
-+ * waiting on the CQ.
-  */
- static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rsocket *rs))
- {
-       int ret;
--      pthread_mutex_lock(&rs->cq_lock);
--      if (rs->cq_busy && nonblock) {
--              pthread_mutex_unlock(&rs->cq_lock);
--              return ERR(EWOULDBLOCK);
--      }
--
--      while (rs->cq_busy && !test(rs)) {
--              pthread_cond_wait(&rs->cq_cond, &rs->cq_lock);
--
--              if (test(rs)) {
--                      pthread_mutex_unlock(&rs->cq_lock);
--                      return 0;
--              }
--      }
--
--      rs->cq_busy = 1;
--      pthread_mutex_unlock(&rs->cq_lock);
--
-+      fastlock_acquire(&rs->cq_lock);
-       do {
-               rs_update_credits(rs);
-               ret = rs_poll_cq(rs);
-@@ -890,14 +866,17 @@ static int rs_process_cq(struct rsocket *rs, int nonblock, int (*test)(struct rs
-                       rs->cq_armed = 1;
-               } else {
-                       rs_update_credits(rs);
--                      rs_signal_cq_waiters(rs);
-+                      fastlock_acquire(&rs->cq_wait_lock);
-+                      fastlock_release(&rs->cq_lock);
-+
-                       ret = rs_get_cq_event(rs);
-+                      fastlock_release(&rs->cq_wait_lock);
-+                      fastlock_acquire(&rs->cq_lock);
-               }
-       } while (!ret);
-       rs_update_credits(rs);
--      rs->cq_busy = 0;
--      rs_signal_cq_waiters(rs);
-+      fastlock_release(&rs->cq_lock);
-       return ret;
- }
-@@ -1002,7 +981,7 @@ ssize_t rrecv(int socket, void *buf, size_t len, int flags)
-                       return ret;
-               }
-       }
--      pthread_mutex_lock(&rs->rlock);
-+      fastlock_acquire(&rs->rlock);
-       if (!rs_have_rdata(rs)) {
-               ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_have_rdata);
-               if (ret && errno != ECONNRESET)
-@@ -1040,7 +1019,7 @@ ssize_t rrecv(int socket, void *buf, size_t len, int flags)
-       }
-       rs->rbuf_bytes_avail += len - left;
- out:
--      pthread_mutex_unlock(&rs->rlock);
-+      fastlock_release(&rs->rlock);
-       return ret ? ret : len - left;
- }
-@@ -1105,7 +1084,7 @@ ssize_t rsend(int socket, const void *buf, size_t len, int flags)
-               }
-       }
--      pthread_mutex_lock(&rs->slock);
-+      fastlock_acquire(&rs->slock);
-       for (left = len; left; left -= xfer_size, buf += xfer_size) {
-               if (!rs_can_send(rs)) {
-                       ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_can_send);
-@@ -1157,7 +1136,7 @@ ssize_t rsend(int socket, const void *buf, size_t len, int flags)
-               if (ret)
-                       break;
-       }
--      pthread_mutex_unlock(&rs->slock);
-+      fastlock_release(&rs->slock);
-       return (ret && left == len) ? ret : len - left;
- }
-@@ -1214,7 +1193,7 @@ static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags
-       for (i = 1; i < iovcnt; i++)
-               len += iov[i].iov_len;
--      pthread_mutex_lock(&rs->slock);
-+      fastlock_acquire(&rs->slock);
-       for (left = len; left; left -= xfer_size) {
-               if (!rs_can_send(rs)) {
-                       ret = rs_process_cq(rs, rs_nonblocking(rs, flags), rs_can_send);
-@@ -1262,7 +1241,7 @@ static ssize_t rsendv(int socket, const struct iovec *iov, int iovcnt, int flags
-               if (ret)
-                       break;
-       }
--      pthread_mutex_unlock(&rs->slock);
-+      fastlock_release(&rs->slock);
-       return (ret && left == len) ? ret : len - left;
- }