From: leonidk <leonidk@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Date: Wed, 20 Sep 2006 11:55:57 +0000 (+0000)
Subject: [MTHCA] bugfix: data corruption while registering MR
X-Git-Url: https://openfabrics.org/gitweb/?a=commitdiff_plain;h=6c6bed88944bbd1fe68727dff274a0c77c8d27b3;p=~shefty%2Frdma-win.git

[MTHCA] bugfix: data corruption while registering MR

If a buffer being registered overlaps an already registered buffer, a race
can occur between the HCA, which may be writing to the previously registered
buffer, and the probing functions (MmProbeAndLockPages, MmSecureVirtualMemory)
used by the memory registration algorithm.

To prevent the race we maintain reference counters for the physical pages
being registered and register every physical page FOR WRITE ACCESS only once.

git-svn-id: svn://openib.tc.cornell.edu/gen1@501 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86
---
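The core of the fix is per-page reference counting: the first registration of a
physical page locks it for write access; any later, overlapping registration sees
a nonzero count and is downgraded to read access, so the probing functions never
race with HCA writes made through an earlier registration. A minimal user-mode
sketch of that bookkeeping (illustrative only; the flat table and all names are
invented - the driver's real two-level implementation is in mt_pa_cash.c below):

    #include <stdint.h>

    enum page_access { PAGE_ACC_READ, PAGE_ACC_WRITE };

    #define NR_PAGES 4096              /* toy flat table; the driver uses a  */
    static int ref_cnt[NR_PAGES];      /* two-level, on-demand directory     */

    /* the first registration of a frame gets write access,
     * every overlapping one is downgraded to read access */
    static enum page_access register_page(uint32_t pfn)
    {
        return (ref_cnt[pfn]++ == 0) ? PAGE_ACC_WRITE : PAGE_ACC_READ;
    }

    static void deregister_page(uint32_t pfn)
    {
        --ref_cnt[pfn];
    }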
diff --git a/trunk/hw/mthca/kernel/SOURCES b/trunk/hw/mthca/kernel/SOURCES
index 4c7f63ec..ff1e0dae 100644
--- a/trunk/hw/mthca/kernel/SOURCES
+++ b/trunk/hw/mthca/kernel/SOURCES
@@ -14,6 +14,7 @@ SOURCES= \
 	mthca_log.mc \
 	mthca_log.rc \
 	hca.rc \
+	mthca_log.c \
 	\
 	..\hca_utils.c \
 	..\mt_utils.c \
@@ -37,6 +38,7 @@ SOURCES= \
 	mt_ud_header.c \
 	mt_uverbs.c \
 	mt_verbs.c \
+	mt_pa_cash.c \
 	\
 	mthca_allocator.c \
 	mthca_av.c \
@@ -44,7 +46,6 @@ SOURCES= \
 	mthca_cmd.c \
 	mthca_cq.c \
 	mthca_eq.c \
-	mthca_log.c \
 	mthca_mad.c \
 	mthca_main.c \
 	mthca_mcg.c \
diff --git a/trunk/hw/mthca/kernel/hca_driver.c b/trunk/hw/mthca/kernel/hca_driver.c
index df6ad81d..38586cd3 100644
--- a/trunk/hw/mthca/kernel/hca_driver.c
+++ b/trunk/hw/mthca/kernel/hca_driver.c
@@ -53,7 +53,7 @@
 #pragma warning( pop )
 #include <iba/ib_ci.h>
 #include "mthca/mthca_vc.h"
-
+#include "mt_pa_cash.h"
 /* from \inc\platform\evntrace.h
 #define TRACE_LEVEL_NONE        0   // Tracing is not on
 #define TRACE_LEVEL_FATAL       1   // Abnormal exit or termination
@@ -196,6 +196,15 @@ DriverEntry(
 	}
 //	cl_memclr( mlnx_hca_array, MLNX_MAX_HCA * sizeof(ci_interface_t) );
 
+	/* init pa cash */
+	status = pa_cash_init();
+	if (status)
+	{
+		HCA_PRINT( TRACE_LEVEL_ERROR ,HCA_DBG_INIT ,
+			("pa_cash_init failed.\n"));
+		return status;
+	}
+
 	/*leo: init function table */
 	hca_init_vfptr();
 
@@ -345,6 +354,7 @@ hca_drv_unload(
 
 	UNUSED_PARAM( p_driver_obj );
 
+	pa_cash_release();
 	ib_uverbs_cleanup();
 	ib_core_cleanup();
 	cl_free( g_param_path.Buffer );
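The reworked iobuf_register in mt_memory.c below first tries to probe-and-lock
the whole range as a single segment and, when MmProbeAndLockPages fails, halves
the segment, rounds it back up to a page boundary, and retries. A user-mode
sketch of that back-off loop under the same arithmetic (lock_range() is an
invented stub standing in for the MDL allocate/probe/lock step):

    #include <stdint.h>

    #define PAGE_SIZE_ 4096u

    /* stand-in for IoAllocateMdl + MmProbeAndLockPages; returns 0 on success */
    static int lock_range(uint64_t va, uint64_t size)
    {
        (void)va; (void)size;
        return 0;                     /* stub: pretend the lock succeeded */
    }

    int lock_all(uint64_t va, uint64_t size)
    {
        uint64_t seg_va = va, seg_size = size, rdc = size, delta;

        while (rdc > 0) {
            if (!lock_range(seg_va, seg_size)) {       /* locked - advance */
                rdc -= seg_size;
                seg_va += seg_size;
                delta = (seg_va + seg_size) & (PAGE_SIZE_ - 1);
                if (delta)                             /* keep later segments */
                    seg_size += PAGE_SIZE_ - delta;    /* page-aligned        */
                if (seg_size > rdc)
                    seg_size = rdc;
                continue;
            }
            if (seg_size <= PAGE_SIZE_)                /* cannot shrink more */
                return -1;
            seg_size >>= 1;                            /* halve and realign */
            delta = (seg_va + seg_size) & (PAGE_SIZE_ - 1);
            if (delta)
                seg_size += PAGE_SIZE_ - delta;
            if (seg_size > rdc)
                seg_size = rdc;
        }
        return 0;
    }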
diff --git a/trunk/hw/mthca/kernel/mt_memory.c b/trunk/hw/mthca/kernel/mt_memory.c
index a91b5f08..755ea961 100644
--- a/trunk/hw/mthca/kernel/mt_memory.c
+++ b/trunk/hw/mthca/kernel/mt_memory.c
@@ -42,6 +42,8 @@
 #include "mt_memory.tmh"
 #endif
 
+#include "mt_pa_cash.h"
+
 
 /*
 *	Function: map user buffer to kernel and lock it
@@ -343,7 +345,7 @@ static int register_segment(
 	IN u64 size,
 	IN int is_user,
 	IN ib_access_t acc,
-	IN OUT mt_iobuf_t * iobuf_p)
+	OUT mt_iobuf_seg_t **iobuf_seg)
 {
 	PMDL mdl_p;
 	int rc;
@@ -375,15 +377,15 @@ static int register_segment(
 	// make context-dependent things
 	if (is_user) {
 		ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
-		mode = UserMode; 
+		mode = UserMode;
 	}
 	else { /* Mapping to kernel virtual address */
 		// MmBuildMdlForNonPagedPool(mdl_p); // fill MDL ??? - should we do that really ?
-		mode = KernelMode; 
+		mode = KernelMode;
 	}
 	__try { /* try */
-		MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */ 
+		MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */
 	} /* try */
 	__except (EXCEPTION_EXECUTE_HANDLER)
 	{
@@ -400,7 +402,7 @@ static int register_segment(
 	new_iobuf->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
 	new_iobuf->mdl_p = mdl_p;
 	new_iobuf->is_user = is_user;
-	InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );
+	*iobuf_seg = new_iobuf;
 	return 0;
 
 err_probe:
@@ -411,6 +413,21 @@ err_nomem:
 	return rc;
 }
 
+void iobuf_init(
+	IN u64 va,
+	IN u64 size,
+	IN int is_user,
+	IN OUT mt_iobuf_t *iobuf_p)
+{
+	iobuf_p->va = va;
+	iobuf_p->size = size;
+	iobuf_p->is_user = is_user;
+	InitializeListHead( &iobuf_p->seg_que );
+	iobuf_p->seg_num = 0;
+	iobuf_p->nr_pages = 0;
+	iobuf_p->is_cashed = 0;
+}
+
 int iobuf_register(
 	IN u64 va,
 	IN u64 size,
@@ -423,6 +440,7 @@ int iobuf_register(
 	u64 seg_size;	// current segment size
 	u64 rdc;	// remain data counter - what is left to lock
 	u64 delta;	// the size of the last not full page of the first segment
+	mt_iobuf_seg_t * new_iobuf;
 	unsigned page_size = PAGE_SIZE;	// 32 - for any case
 
@@ -431,27 +449,27 @@ int iobuf_register(
 	ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
 
-	// init IOBUF object
-	InitializeListHead( &iobuf_p->seg_que );
-	iobuf_p->seg_num = 0;
-
-	// Round the seg_va down to a page boundary so that we always get a seg_size
-	// that is an integral number of pages.
-	delta = va & (PAGE_SIZE - 1);
-	seg_va = va - delta;
-	// Since we rounded down the seg_va, we need to round up the rdc and size.
-	seg_size = rdc = size + delta;
+	// we'll try to register all at once.
+	seg_va = va;
+	seg_size = rdc = size;
 
 	// allocate segments
 	while (rdc > 0) {
 		// map a segment
-		rc = register_segment(seg_va, seg_size, is_user, acc, iobuf_p );
+		rc = register_segment(seg_va, seg_size, is_user, acc, &new_iobuf );
 
 		// success - move to another segment
 		if (!rc) {
 			rdc -= seg_size;
 			seg_va += seg_size;
+			InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );
 			iobuf_p->seg_num++;
+			// round the segment size to the next page boundary
+			delta = (seg_va + seg_size) & (page_size - 1);
+			if (delta) {
+				seg_size -= delta;
+				seg_size += page_size;
+			}
 			if (seg_size > rdc)
 				seg_size = rdc;
 			continue;
@@ -464,14 +482,14 @@ int iobuf_register(
 			break;
 		// lessen the size
 		seg_size >>= 1;
-		// round the segment size to the page boundary (only for the first segment)
-		if (iobuf_p->seg_num == 0) {
-			delta = (seg_va + seg_size) & (page_size - 1);
+		// round the segment size to the next page boundary
+		delta = (seg_va + seg_size) & (page_size - 1);
+		if (delta) {
 			seg_size -= delta;
 			seg_size += page_size;
-			if (seg_size > rdc)
-				seg_size = rdc;
 		}
+		if (seg_size > rdc)
+			seg_size = rdc;
 		continue;
 	}
 
@@ -482,18 +500,157 @@
 	// SUCCESS
 	if (rc)
 		iobuf_deregister( iobuf_p );
-	else {
-		// fill IOBUF object
-		iobuf_p->va = va;
-		iobuf_p->size= size;
-		iobuf_p->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
-		iobuf_p->is_user = is_user;
-	}
+	else
+		iobuf_p->nr_pages += ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
 
 	return rc;
 }
 
+static void __iobuf_copy(
+	IN OUT mt_iobuf_t *dst_iobuf_p,
+	IN mt_iobuf_t *src_iobuf_p
+	)
+{
+	int i;
+	mt_iobuf_seg_t *iobuf_seg_p;
+
+	*dst_iobuf_p = *src_iobuf_p;
+	InitializeListHead( &dst_iobuf_p->seg_que );
+	for (i=0; i<src_iobuf_p->seg_num; ++i) {
+		iobuf_seg_p = (mt_iobuf_seg_t *)(PVOID)RemoveHeadList( &src_iobuf_p->seg_que );
+		InsertTailList( &dst_iobuf_p->seg_que, &iobuf_seg_p->link );
+	}
+}
+
+/* If a buffer being registered overlaps an already registered buffer,
+   a race can happen between the HCA, writing to the previously registered
+   buffer, and the probing functions (MmProbeAndLockPages, MmSecureVirtualMemory)
+   used in the algorithm of memory registration.
+   To prevent the race we maintain reference counters for the physical pages
+   being registered and register every physical page FOR WRITE ACCESS only once. */
+
+int iobuf_register_with_cash(
+	IN u64 vaddr,
+	IN u64 size,
+	IN int is_user,
+	IN OUT ib_access_t *acc_p,
+	IN OUT mt_iobuf_t *iobuf_p)
+{
+	int rc, pa_in;
+	mt_iobuf_t sec_iobuf;
+	int i, page_in, page_out, page_in_total;
+	int nr_pages;
+	char *subregion_start, *va;
+	u64 subregion_size;
+	u64 rdc;	// remain data counter - what is left to lock
+	u64 delta;	// the size of the last not full page of the first segment
+	ib_access_t acc;
+
+	down(&g_pa_mutex);
+
+	// register memory for read access to bring pages into the memory
+	rc = iobuf_register( vaddr, size, is_user, 0, iobuf_p);
+
+	// on error or read access - exit
+	if (rc || !(*acc_p & IB_AC_LOCAL_WRITE))
+		goto exit;
+
+	// re-register buffer with the correct access rights
+	iobuf_init( (u64)vaddr, size, is_user, &sec_iobuf );
+	nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( vaddr, size );
+	subregion_start = va = (char*)(ULONG_PTR)vaddr;
+	rdc = size;
+	pa_in = page_in = page_in_total = page_out = 0;
+
+	for (i=0; i<nr_pages; ++i, va += PAGE_SIZE) {
+		// check whether the phys page is already registered
+		pa_in = pa_is_registered( MmGetPhysicalAddress( va ).QuadPart );
+		if (pa_in) {
+			++page_in;
+			++page_in_total;
+		}
+		else
+			++page_out;
+
+		// check whether a boundary between the subregions has been reached
+		if (page_in && page_out) {
+
+			// prepare to the registration of the subregion
+			if (pa_in) {	// SUBREGION WITH WRITE ACCESS
+				acc = IB_AC_LOCAL_WRITE;
+				subregion_size = (u64)page_out * PAGE_SIZE;
+				page_out = 0;
+			}
+			else {	// SUBREGION WITH READ ACCESS
+				acc = 0;
+				subregion_size = (u64)page_in * PAGE_SIZE;
+				page_in = 0;
+			}
+
+			// round the subregion size to the page boundary
+			delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
+			subregion_size -= delta;
+			if (subregion_size > rdc)
+				subregion_size = rdc;
+
+			// register the subregion
+			rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
+			if (rc)
+				goto cleanup;
+
+			// prepare to the next loop
+			rdc -= subregion_size;
+			subregion_start += subregion_size;
+		}
+	}
+
+	// prepare to registration of the last subregion
+	if (pa_in) {	// SUBREGION WITH READ ACCESS
+		acc = 0;
+		subregion_size = (u64)page_in * PAGE_SIZE;
+	}
+	else {	// SUBREGION WITH WRITE ACCESS
+		acc = IB_AC_LOCAL_WRITE;
+		subregion_size = (u64)page_out * PAGE_SIZE;
+	}
+
+	// round the subregion size to the page boundary
+	delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
+	subregion_size -= delta;
+	if (subregion_size > rdc)
+		subregion_size = rdc;
+
+	// register the subregion
+	rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
+	if (rc)
+		goto cleanup;
+
+	// cash phys pages
+	rc = pa_register(iobuf_p);
+	if (rc)
+		goto err_pa_reg;
+
+	// replace the iobuf
+	iobuf_deregister( iobuf_p );
+	sec_iobuf.is_cashed = TRUE;
+	__iobuf_copy( iobuf_p, &sec_iobuf );
+
+	// the buffer overlaps an already registered buffer - change the rights
+	if (page_in_total)
+		*acc_p = MTHCA_ACCESS_REMOTE_READ;
+
+	goto exit;
+
+err_pa_reg:
+	iobuf_deregister( &sec_iobuf );
cleanup:
+	iobuf_deregister( iobuf_p );
exit:
+	up(&g_pa_mutex);
+	return rc;
+}
+
 static void deregister_segment(mt_iobuf_seg_t * iobuf_seg_p)
 {
 	MmUnlockPages( iobuf_seg_p->mdl_p ); // unlock the buffer
@@ -516,6 +673,17 @@ void iobuf_deregister(mt_iobuf_t *iobuf_p)
 	ASSERT(iobuf_p->seg_num == 0);
 }
 
+void iobuf_deregister_with_cash(mt_iobuf_t *iobuf_p)
+{
+	ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
+
+	down(&g_pa_mutex);
+	if (iobuf_p->is_cashed)
+		pa_deregister(iobuf_p);
+	iobuf_deregister(iobuf_p);
+	up(&g_pa_mutex);
+}
+
 void iobuf_iter_init(
 	IN mt_iobuf_t *iobuf_p,
 	IN OUT mt_iobuf_iter_t *iterator_p)
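iobuf_register_with_cash above first brings the pages in with a read-access
registration, then walks the buffer page by page, grouping maximal runs of
already-cached vs. new pages and registering each run as a separate subregion:
cached runs get read-only access (the HCA may already be writing to them through
an earlier registration), new runs keep IB_AC_LOCAL_WRITE. A toy model of the
run splitting (the invented is_cached() stands in for pa_is_registered()):

    #include <stdio.h>

    /* toy stand-in for pa_is_registered(): pages 3 and 4 already cached */
    static int is_cached(int page) { return page == 3 || page == 4; }

    int main(void)
    {
        int nr_pages = 6, start = 0, i;

        for (i = 1; i <= nr_pages; ++i) {
            /* a subregion ends at the buffer end or where cached-ness flips */
            if (i == nr_pages || is_cached(i) != is_cached(start)) {
                printf("register pages %d..%d with %s access\n", start, i - 1,
                       is_cached(start) ? "READ (already counted)"
                                        : "WRITE (first registration)");
                start = i;
            }
        }
        return 0;
    }

For a 6-page buffer whose pages 3 and 4 are already cached this prints three
registrations: pages 0..2 writable, pages 3..4 read-only, page 5 writable.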
diff --git a/trunk/hw/mthca/kernel/mt_memory.h b/trunk/hw/mthca/kernel/mt_memory.h
index 0c24911f..98b0d545 100644
--- a/trunk/hw/mthca/kernel/mt_memory.h
+++ b/trunk/hw/mthca/kernel/mt_memory.h
@@ -258,6 +258,7 @@ typedef struct _mt_iobuf {
 	u32 nr_pages;
 	int is_user;
 	int seg_num;
+	int is_cashed;
 } mt_iobuf_t;
 
 /* iterator for getting segments of tpt */
@@ -266,7 +267,22 @@ typedef struct _mt_iobuf_iter {
 	unsigned int pfn_ix; /* index from where to take the next translation */
 } mt_iobuf_iter_t;
 
-void iobuf_deregister(mt_iobuf_t *iobuf_p);
+void iobuf_deregister_with_cash(IN mt_iobuf_t *iobuf_p);
+
+void iobuf_deregister(IN mt_iobuf_t *iobuf_p);
+
+void iobuf_init(
+	IN u64 va,
+	IN u64 size,
+	IN int is_user,
+	IN OUT mt_iobuf_t *iobuf_p);
+
+int iobuf_register_with_cash(
+	IN u64 vaddr,
+	IN u64 size,
+	IN int is_user,
+	IN OUT ib_access_t *acc_p,
+	IN OUT mt_iobuf_t *iobuf_p);
 
 int iobuf_register(
 	IN u64 va,
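The new mt_memory.h entry points are meant to be used in matched pairs, as
mthca_provider.c does further below: iobuf_init() prepares the descriptor,
iobuf_register_with_cash() locks the pages and may downgrade the requested
rights, iobuf_deregister_with_cash() drops the page reference counts and
unlocks. A condensed sketch of a caller (names invented, error handling
shortened; driver context assumed):

    /* sketch: register a user buffer, honoring possibly-downgraded rights */
    static int example_register(u64 va, u64 size, int um_call)
    {
        mt_iobuf_t iobuf;
        ib_access_t acc = IB_AC_LOCAL_WRITE;  /* what we would like to get */
        int err;

        iobuf_init( va, size, um_call, &iobuf );
        err = iobuf_register_with_cash( va, size, um_call, &acc, &iobuf );
        if (err)
            return err;

        /* 'acc' now tells what was really granted: overlapped pages
         * were registered read-only to avoid the probe/DMA race */

        iobuf_deregister_with_cash( &iobuf );
        return 0;
    }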
diff --git a/trunk/hw/mthca/kernel/mt_pa_cash.c b/trunk/hw/mthca/kernel/mt_pa_cash.c
new file mode 100644
index 00000000..0a86cf60
--- /dev/null
+++ b/trunk/hw/mthca/kernel/mt_pa_cash.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: mlnx_uvp_cq.c 1611 2006-08-20 14:48:55Z sleybo $
+ */
+
+#include "mt_pa_cash.h"
+
+#ifdef _WIN64
+#define MAX_PAGES_SUPPORTED	(64 * 1024 * 1024)
+#else
+#define MAX_PAGES_SUPPORTED	(16 * 1024 * 1024)
+#endif
+
+typedef struct {
+	int ref_cnt;
+} pa_table_entry_t;
+
+#define PA_TABLE_ENTRY_SIZE	sizeof(pa_table_entry_t)
+#define PA_TABLE_ENTRY_NUM	(PAGE_SIZE / PA_TABLE_ENTRY_SIZE)
+#define PA_TABLE_SIZE		(PA_TABLE_ENTRY_SIZE * PA_TABLE_ENTRY_NUM)
+
+#define PA_DIR_ENTRY_SIZE	sizeof(void*)
+#define PA_DIR_ENTRY_NUM	(MAX_PAGES_SUPPORTED / PA_TABLE_ENTRY_NUM)
+#define PA_DIR_SIZE		(PA_DIR_ENTRY_SIZE * PA_DIR_ENTRY_NUM)
+
+struct pa_cash_s {
+	pa_table_entry_t **pa_dir;
+	uint32_t max_nr_pages;
+} g_cash = { NULL, 0 };
+
+KMUTEX g_pa_mutex;
+u64 g_pa[1024];
+
+int pa_cash_init()
+{
+	void *pa_dir;
+	pa_dir = kzalloc(PA_DIR_SIZE, GFP_KERNEL);
+
+	if (!pa_dir)
+		return -ENOMEM;
+	g_cash.pa_dir = pa_dir;
+	g_cash.max_nr_pages = PA_TABLE_ENTRY_NUM * PA_DIR_ENTRY_NUM;
+	KeInitializeMutex(&g_pa_mutex, 0);
+	return 0;
+}
+
+void pa_cash_release()
+{
+	int i;
+
+	/* free cash tables */
+	for (i=0; i<PA_DIR_ENTRY_NUM; ++i)
+		if (g_cash.pa_dir[i])
+			kfree(g_cash.pa_dir[i]);
+
+	/* free cash directory */
+	kfree(g_cash.pa_dir);
+	g_cash.pa_dir = NULL;
+}
+
+int pa_is_registered(uint64_t pa)
+{
+	uint32_t ix = (uint32_t)(pa >> PAGE_SHIFT);
+	pa_table_entry_t *pa_te;
+
+	/* either pa is incorrect or memory that big is not supported */
+	if (ix >= g_cash.max_nr_pages) {
+		ASSERT(FALSE);
+		return -EFAULT;
+	}
+
+	pa_te = g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM];
+
+	/* no such page_table */
+	if (!pa_te)
+		return 0;
+
+	return pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
+}
+
+int __add_pa(uint64_t pa)
+{
+	uint32_t ix = (uint32_t)(pa >> PAGE_SHIFT);
+	pa_table_entry_t *pa_te;
+
+	/* either pa is incorrect or memory that big is not supported */
+	if (ix >= g_cash.max_nr_pages) {
+		ASSERT(FALSE);
+		return -EFAULT;
+	}
+
+	pa_te = g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM];
+
+	/* no such page_table - add a new one */
+	if (!pa_te) {
+		pa_te = kzalloc(PA_TABLE_SIZE, GFP_KERNEL);
+		if (!pa_te)
+			return -ENOMEM;
+		g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM] = pa_te;
+	}
+
+	/* register page address */
+	++pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
+	return 0;
+}
+
+int __rmv_pa(uint64_t pa)
+{
+	uint32_t ix = (uint32_t)(pa >> PAGE_SHIFT);
+	pa_table_entry_t *pa_te;
+
+	/* either pa is incorrect or memory that big is not supported */
+	if (ix >= g_cash.max_nr_pages) {
+		ASSERT(FALSE);
+		return -EFAULT;
+	}
+
+	pa_te = g_cash.pa_dir[ix / PA_TABLE_ENTRY_NUM];
+
+	/* no such page_table - error */
+	if (!pa_te) {
+		ASSERT(FALSE);
+		return -EFAULT;
+	}
+
+	/* deregister page address */
+	--pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt;
+	ASSERT(pa_te[ix % PA_TABLE_ENTRY_NUM].ref_cnt >= 0);
+	return 0;
+}
+
+int pa_register(mt_iobuf_t *iobuf_p)
+{
+	int i,j,n;
+	mt_iobuf_iter_t iobuf_iter;
+
+	iobuf_iter_init( iobuf_p, &iobuf_iter );
+	n = 0;
+	for (;;) {
+		i = iobuf_get_tpt_seg( iobuf_p, &iobuf_iter,
+			sizeof(g_pa) / sizeof (u64), g_pa );
+		if (!i)
+			break;
+		for (j=0; j<i; ++j, ++n)
+			if (__add_pa(g_pa[j]))
+				goto cleanup;
+	}
+	return 0;
+
+cleanup:
+	/* roll back the pages counted before the failure */
+	iobuf_iter_init( iobuf_p, &iobuf_iter );
+	for (;;) {
+		i = iobuf_get_tpt_seg( iobuf_p, &iobuf_iter,
+			sizeof(g_pa) / sizeof (u64), g_pa );
+		if (!i)
+			break;
+		for (j=0; j<i; ++j) {
+			if (!n--)
+				return -ENOMEM;
+			__rmv_pa(g_pa[j]);
+		}
+	}
+	return -ENOMEM;
+}
+
+void pa_deregister(mt_iobuf_t *iobuf_p)
+{
+	int i,j;
+	mt_iobuf_iter_t iobuf_iter;
+
+	iobuf_iter_init( iobuf_p, &iobuf_iter );
+	for (;;) {
+		i = iobuf_get_tpt_seg( iobuf_p, &iobuf_iter,
+			sizeof(g_pa) / sizeof (u64), g_pa );
+		if (!i)
+			break;
+		for (j=0; j<i; ++j)
+			__rmv_pa(g_pa[j]);
+	}
+}
diff --git a/trunk/hw/mthca/kernel/mt_pa_cash.h b/trunk/hw/mthca/kernel/mt_pa_cash.h
new file mode 100644
--- /dev/null
+++ b/trunk/hw/mthca/kernel/mt_pa_cash.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#ifndef MT_PA_CASH_H
+#define MT_PA_CASH_H
+
+#include "mt_memory.h"
+
+extern KMUTEX g_pa_mutex;
+
+int pa_cash_init();
+
+void pa_cash_release();
+
+int pa_is_registered(uint64_t pa);
+
+int pa_register(mt_iobuf_t *iobuf_p);
+
+void pa_deregister(mt_iobuf_t *iobuf_p);
+
+#endif
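mt_pa_cash.c above resolves a physical address to its reference counter in two
steps: the page frame number's high bits select a slot in a directory of table
pointers, and the low bits select an entry inside a page-sized table that is
allocated on first use, so the potentially huge counter array is only populated
where memory is actually registered. The same index arithmetic in a small
user-mode model (toy sizes, invented names):

    #include <stdint.h>
    #include <stdlib.h>

    #define PAGE_SHIFT_    12u       /* 4K pages */
    #define TABLE_ENTRIES  1024u     /* counters per second-level table */
    #define DIR_ENTRIES    1024u     /* slots in the top-level directory */

    typedef struct { int ref_cnt; } entry_t;
    static entry_t *dir[DIR_ENTRIES];        /* tables allocated on demand */

    static entry_t *lookup(uint64_t pa, int create)
    {
        uint32_t ix = (uint32_t)(pa >> PAGE_SHIFT_);   /* page frame number */

        if (ix / TABLE_ENTRIES >= DIR_ENTRIES)         /* address too large */
            return NULL;
        if (!dir[ix / TABLE_ENTRIES] && create)
            dir[ix / TABLE_ENTRIES] = calloc(TABLE_ENTRIES, sizeof(entry_t));
        return dir[ix / TABLE_ENTRIES]
            ? &dir[ix / TABLE_ENTRIES][ix % TABLE_ENTRIES] : NULL;
    }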
diff --git a/trunk/hw/mthca/kernel/mthca_provider.c b/trunk/hw/mthca/kernel/mthca_provider.c
--- a/trunk/hw/mthca/kernel/mthca_provider.c
+++ b/trunk/hw/mthca/kernel/mthca_provider.c
@@ -1052,6 +1052,9 @@ struct ib_mr *mthca_reg_virt_mr(struct ib_pd *pd,
 		mt_iobuf_t *iobuf_p = &mr->iobuf;
-		err = iobuf_register( (u64)vaddr, length, um_call,
-			(acc & ~MTHCA_ACCESS_REMOTE_READ) ? IB_AC_LOCAL_WRITE : 0, iobuf_p );
+		ib_access_t ib_acc;
+		iobuf_init( (u64)vaddr, length, um_call, iobuf_p);
+		ib_acc = (acc & ~MTHCA_ACCESS_REMOTE_READ) ? IB_AC_LOCAL_WRITE : 0;
+		err = iobuf_register_with_cash( (u64)vaddr, length, um_call,
+			&ib_acc, iobuf_p );
 		if (err)
 			goto err_reg_mem;
 		mr->iobuf_used = TRUE;
@@ -1076,7 +1079,7 @@ struct ib_mr *mthca_reg_virt_mr(struct ib_pd *pd,
 		goto done;
 	__try {
 		mr->secure_handle = MmSecureVirtualMemory ( vaddr, (SIZE_T)length,
-			(acc & ~MTHCA_ACCESS_REMOTE_READ) ? PAGE_READWRITE : PAGE_READONLY );
+			(ib_acc & IB_AC_LOCAL_WRITE) ? PAGE_READWRITE : PAGE_READONLY );
 		if (mr->secure_handle == NULL) {
 			err = -EFAULT;
 			goto err_secure;
@@ -1090,6 +1093,7 @@ struct ib_mr *mthca_reg_virt_mr(struct ib_pd *pd,
 		err = -EFAULT;
 		goto err_secure;
 	}
+
 done:
 	free_page((void*) pages);
 
@@ -1117,11 +1121,11 @@ int mthca_dereg_mr(struct ib_mr *mr)
 	struct mthca_mr *mmr = to_mmr(mr);
 	struct mthca_dev* dev = to_mdev(mr->device);
 
-	if (mr->pd->ucontext)
+	if (mmr->secure_handle)
 		MmUnsecureVirtualMemory ( mmr->secure_handle );
 	mthca_free_mr(dev, mmr);
 	if (mmr->iobuf_used)
-		iobuf_deregister(&mmr->iobuf);
+		iobuf_deregister_with_cash(&mmr->iobuf);
 	kfree(mmr);
 	return 0;
 }
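The mthca_provider.c hunks keep MmSecureVirtualMemory consistent with the rights
the cash actually granted - a buffer downgraded to read access is secured
PAGE_READONLY - and mthca_dereg_mr now unsecures strictly by handle instead of
inferring from the PD, so nothing is unsecured when securing never happened or
failed. A reduced sketch of the pairing (kernel-mode fragment, error paths
elided, variable names taken from the patch):

    /* at registration time: ib_acc is the access the pa cash granted */
    HANDLE secure_handle = NULL;

    __try {
        secure_handle = MmSecureVirtualMemory( vaddr, (SIZE_T)length,
            (ib_acc & IB_AC_LOCAL_WRITE) ? PAGE_READWRITE : PAGE_READONLY );
    }
    __except (EXCEPTION_EXECUTE_HANDLER) {
        secure_handle = NULL;       /* treat as a registration failure */
    }

    /* at deregistration time: unsecure only what was actually secured */
    if (secure_handle)
        MmUnsecureVirtualMemory( secure_handle );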