From: Alexander Schmidt Date: Fri, 20 Aug 2010 09:06:16 +0000 (+0000) Subject: Handle huge pages in ibv_fork_init() and madvise tracking X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=422642f1cf021b9de1f445f4392c99d3c2fe6624;p=~shefty%2Flibibverbs.git Handle huge pages in ibv_fork_init() and madvise tracking When fork support is enabled in libibverbs, madvise() is called for every memory page that is registered as a memory region. Memory ranges that are passed to madvise() must be page aligned and the size must be a multiple of the page size. libibverbs uses sysconf(_SC_PAGESIZE) to find out the system page size and rounds all ranges passed to reg_mr() according to this page size. When memory from libhugetlbfs is passed to reg_mr(), this does not work as the page size for this memory range might be different (e.g. 16MB). So libibverbs would have to use the huge page size to calculate a page aligned range for madvise. As huge pages are provided to the application "under the hood" when preloading libhugetlbfs, the application does not have any knowledge about when it registers a huge page or a usual page. To work around this issue, detect the use of huge pages in libibverbs and align memory ranges passed to madvise according to the huge page size. Determining the page size of a given memory range by watching madvise() fail has proven to be unreliable. So we introduce the RDMAV_HUGEPAGES_SAFE environment variable to let the user decide if the page size should be checked on every reg_mr() call or not. This requires the user to be aware if huge pages are used by the running application or not. I did not add an aditional API call to enable this, as applications can use setenv() + ibv_fork_init() to enable checking for huge pages in the code. Signed-off-by: Alexander Schmidt [ Updated ibv_fork_init() manpage for RDMAV_HUGEPAGES_SAFE. - Roland ] Signed-off-by: Roland Dreier --- diff --git a/man/ibv_fork_init.3 b/man/ibv_fork_init.3 index 6f2a287..acffe3c 100644 --- a/man/ibv_fork_init.3 +++ b/man/ibv_fork_init.3 @@ -41,12 +41,22 @@ or has the same effect as calling .B ibv_fork_init()\fR. .PP +Setting the environment variable +.BR RDMAV_HUGEPAGES_SAFE +tells the library to check the underlying page size used by the kernel +for memory regions. This is required if an application uses huge +pages either directly or indirectly via a library such as libhugetlbfs. +.PP Calling .B ibv_fork_init() will reduce performance due to an extra system call for every memory registration, and the additional memory allocated to track memory regions. The precise performance impact depends on the workload and usually will not be significant. +.PP +Setting +.BR RDMAV_HUGEPAGES_SAFE +adds further overhead to all memory registrations. .SH "SEE ALSO" .BR fork (2), .BR wait (2), diff --git a/src/memory.c b/src/memory.c index ce58ae8..faa43f3 100644 --- a/src/memory.c +++ b/src/memory.c @@ -40,6 +40,10 @@ #include #include #include +#include +#include +#include +#include #include "ibverbs.h" @@ -68,12 +72,72 @@ struct ibv_mem_node { static struct ibv_mem_node *mm_root; static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; static int page_size; +static int huge_page_enabled; static int too_late; +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r"); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%lx-%lx", &range_start, &range_end); + + if (n < 2) + continue; + + if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + + fclose(file); + +out: + return ret; +} + int ibv_fork_init(void) { - void *tmp; + void *tmp, *tmp_aligned; int ret; + unsigned long size; if (mm_root) return 0; @@ -88,8 +152,21 @@ int ibv_fork_init(void) if (posix_memalign(&tmp, page_size, page_size)) return ENOMEM; - ret = madvise(tmp, page_size, MADV_DONTFORK) || - madvise(tmp, page_size, MADV_DOFORK); + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + else + huge_page_enabled = 0; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); free(tmp); @@ -529,13 +606,19 @@ static int ibv_madvise_range(void *base, size_t size, int advice) int inc; int rolling_back = 0; int ret = 0; + unsigned long range_page_size; if (!size) return 0; - start = (uintptr_t) base & ~(page_size - 1); - end = ((uintptr_t) (base + size + page_size - 1) & - ~(page_size - 1)) - 1; + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; pthread_mutex_lock(&mm_mutex); again: