diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 932b6f4b8e..691af933d1 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -250,28 +250,29 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_ERROR_FLAGS_NONFATAL 0x2 #define MCA_BTL_ERROR_FLAGS_ADD_CUDA_IPC 0x4 -/** registration flags */ +/** registration flags. the access flags are a 1-1 mapping with the mpool + * access flags. */ enum { /** Allow local write on the registered region. If a region is registered * with this flag the registration can be used as the local handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_LOCAL_WRITE = 0x00000001, + MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_MPOOL_ACCESS_LOCAL_WRITE, /** Allow remote read on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_REMOTE_READ = 0x00000002, + MCA_BTL_REG_FLAG_REMOTE_READ = MCA_MPOOL_ACCESS_REMOTE_READ, /** Allow remote write on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_put operation. */ - MCA_BTL_REG_FLAG_REMOTE_WRITE = 0x00000004, + MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_MPOOL_ACCESS_REMOTE_WRITE, /** Allow remote atomic operations on the registered region. If a region is * registered with this flag the registration can be used as the remote * handle for a btl_atomic_op or btl_atomic_fop operation. */ - MCA_BTL_REG_FLAG_REMOTE_ATOMIC = 0x00000008, + MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_MPOOL_ACCESS_REMOTE_ATOMIC, /** Allow any btl operation on the registered region. If a region is registered * with this flag the registration can be used as the local or remote handle for * any btl operation. 
*/ - MCA_BTL_REG_FLAG_ACCESS_ANY = 0x0000000f, + MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_MPOOL_ACCESS_ANY, #if OPAL_CUDA_GDR_SUPPORT /** Region is in GPU memory */ MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 9f82110cef..9b46701320 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -1753,6 +1753,7 @@ static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_ { mca_btl_openib_reg_t *reg; uint32_t mflags = 0; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; #if OPAL_CUDA_GDR_SUPPORT @@ -1761,7 +1762,7 @@ static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_ } #endif /* OPAL_CUDA_GDR_SUPPORT */ - rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags, + rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags, access_flags, (mca_mpool_base_registration_t **) &reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) { return NULL; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 4bd09e23d7..0b6f9d8cf5 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -586,11 +586,24 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, { mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data; mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; - enum ibv_access_flags access_flag = (enum ibv_access_flags) (IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); + enum ibv_access_flags access_flag = 0; + + if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_READ) { + access_flag |= IBV_ACCESS_REMOTE_READ; + } + + if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_WRITE) { + access_flag |= IBV_ACCESS_REMOTE_WRITE; + } + + if (reg->access_flags & MCA_MPOOL_ACCESS_LOCAL_WRITE) { + access_flag |= 
IBV_ACCESS_LOCAL_WRITE; + } #if HAVE_DECL_IBV_ATOMIC_HCA - access_flag |= IBV_ACCESS_REMOTE_ATOMIC; + if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_ATOMIC) { + access_flag |= IBV_ACCESS_REMOTE_ATOMIC; + } #endif if (device->mem_reg_max && diff --git a/opal/mca/btl/scif/btl_scif_module.c b/opal/mca/btl/scif/btl_scif_module.c index 67e57dd2c9..649a7c21ef 100644 --- a/opal/mca/btl/scif/btl_scif_module.c +++ b/opal/mca/btl/scif/btl_scif_module.c @@ -181,6 +181,7 @@ static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca void *base, size_t size, uint32_t flags) { mca_btl_scif_reg_t *scif_reg; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; if (MCA_BTL_ENDPOINT_ANY == endpoint) { @@ -199,7 +200,7 @@ static mca_btl_base_registration_handle_t *mca_btl_scif_register_mem (struct mca } } - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, access_flags, (mca_mpool_base_registration_t **) &scif_reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index bb4aa03962..bf470f4fb7 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1010,6 +1010,7 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( size_t size, uint32_t flags) { mca_mpool_common_cuda_reg_t *reg; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int mpool_flags = 0; if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) { @@ -1017,7 +1018,7 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( } btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mpool_flags, - (mca_mpool_base_registration_t **) &reg); + access_flags, (mca_mpool_base_registration_t **) &reg); if (OPAL_UNLIKELY(NULL == reg)) { return NULL; } @@ -1088,6 +1089,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * support. 
*/ rc = ep->mpool->mpool_register(ep->mpool, remote_handle->reg_data.memh_seg_addr.pval, remote_handle->reg_data.memh_seg_len, ep->peer_smp_rank, + MCA_MPOOL_ACCESS_LOCAL_WRITE, (mca_mpool_base_registration_t **)&reg_ptr); if (OPAL_SUCCESS != rc) { diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index 8d7f571e7c..2b547659c0 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c @@ -272,15 +272,24 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; gni_return_t rc; + int flags; if (ugni_module->reg_count >= ugni_module->reg_max) { return OPAL_ERR_OUT_OF_RESOURCE; } + if (reg->access_flags & (MCA_MPOOL_ACCESS_REMOTE_WRITE | MCA_MPOOL_ACCESS_LOCAL_WRITE | + MCA_MPOOL_ACCESS_REMOTE_ATOMIC)) { + flags = GNI_MEM_READWRITE; + } else { + flags = GNI_MEM_READ_ONLY; + } + + flags |= GNI_MEM_RELAXED_PI_ORDERING; + OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, - size, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING, - -1, &(ugni_reg->handle.gni_handle)); + size, NULL, flags, -1, &(ugni_reg->handle.gni_handle)); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 7f008c607f..311c3a1759 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -304,9 +304,10 @@ mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t * size_t size, uint32_t flags) { mca_btl_ugni_reg_t *reg; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, + rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, 
access_flags, (mca_mpool_base_registration_t **) &reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; diff --git a/opal/mca/btl/ugni/btl_ugni_prepare.h b/opal/mca/btl/ugni/btl_ugni_prepare.h index bd46aa227a..4988cf094d 100644 --- a/opal/mca/btl/ugni/btl_ugni_prepare.h +++ b/opal/mca/btl/ugni/btl_ugni_prepare.h @@ -75,6 +75,7 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, if (OPAL_UNLIKELY(true == use_eager_get)) { rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0, + MCA_MPOOL_ACCESS_REMOTE_READ, (mca_mpool_base_registration_t **)&registration); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { mca_btl_ugni_frag_return (frag); diff --git a/opal/mca/btl/vader/btl_vader_knem.c b/opal/mca/btl/vader/btl_vader_knem.c index 157dc04ae2..8a270fafa6 100644 --- a/opal/mca/btl/vader/btl_vader_knem.c +++ b/opal/mca/btl/vader/btl_vader_knem.c @@ -35,8 +35,15 @@ static int mca_btl_vader_knem_reg (void *reg_data, void *base, size_t size, knem_cr.iovec_array = (uintptr_t) &knem_iov; knem_cr.iovec_nr = 1; - /* TODO -- set proper access flags when the protection is passed down */ - knem_cr.protection = PROT_READ | PROT_WRITE; + knem_cr.protection = 0; + + if (reg->access_flags & (MCA_MPOOL_ACCESS_LOCAL_WRITE | MCA_MPOOL_ACCESS_REMOTE_WRITE)) { + knem_cr.protection |= PROT_WRITE; + } + + if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_READ) { + knem_cr.protection |= PROT_READ; + } /* Vader will explicitly destroy this cookie */ knem_cr.flags = 0; @@ -66,9 +73,10 @@ mca_btl_vader_register_mem_knem (struct mca_btl_base_module_t* btl, void *base, size_t size, uint32_t flags) { mca_btl_vader_registration_handle_t *reg = NULL; + int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; - rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, 0, + rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, 0, access_flags, (mca_mpool_base_registration_t **) &reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; diff --git 
a/opal/mca/mpool/base/mpool_base_alloc.c b/opal/mca/mpool/base/mpool_base_alloc.c index d72b0b8d98..bcb64e3738 100644 --- a/opal/mca/mpool/base/mpool_base_alloc.c +++ b/opal/mca/mpool/base/mpool_base_alloc.c @@ -253,7 +253,7 @@ void *mca_mpool_base_alloc(size_t size, opal_info_t *info) mpool_tree_item->regs[mpool_tree_item->count++] = registration; } else { if(mpool->mpool_register(mpool, mem, size, MCA_MPOOL_FLAGS_PERSIST, - &registration) != OPAL_SUCCESS) { + MCA_MPOOL_ACCESS_ANY, &registration) != OPAL_SUCCESS) { if(mpool_requested) { unregister_tree_item(mpool_tree_item); goto out; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm.h b/opal/mca/mpool/gpusm/mpool_gpusm.h index 9e6c993d51..537c95108a 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm.h +++ b/opal/mca/mpool/gpusm/mpool_gpusm.h @@ -75,7 +75,7 @@ void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t *mpool); * register block of memory */ int mca_mpool_gpusm_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, mca_mpool_base_registration_t **reg); + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); /** * deregister memory diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_module.c b/opal/mca/mpool/gpusm/mpool_gpusm_module.c index fa8a915970..98740bbdcd 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_module.c +++ b/opal/mca/mpool/gpusm/mpool_gpusm_module.c @@ -109,7 +109,7 @@ int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr, size_t size, mca_mpool_base_registration_t **reg) { - return mca_mpool_gpusm_register(mpool, addr, size, 0, reg); + return mca_mpool_gpusm_register(mpool, addr, size, 0, 0, reg); } /* @@ -119,7 +119,7 @@ int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr, * deregister function is a no-op. 
*/ int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg) { mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t*)mpool; @@ -147,6 +147,7 @@ int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr, gpusm_reg->base = base; gpusm_reg->bound = bound; gpusm_reg->flags = flags; + gpusm_reg->access_flags = access_flags; rc = mpool_gpusm->resources.register_mem(base, size, gpusm_reg, NULL); diff --git a/opal/mca/mpool/grdma/mpool_grdma.h b/opal/mca/mpool/grdma/mpool_grdma.h index 4f5362149b..1ddbd139e0 100644 --- a/opal/mca/mpool/grdma/mpool_grdma.h +++ b/opal/mca/mpool/grdma/mpool_grdma.h @@ -112,7 +112,7 @@ void* mca_mpool_grdma_realloc( mca_mpool_base_module_t *mpool, void* addr, * register block of memory */ int mca_mpool_grdma_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, mca_mpool_base_registration_t **reg); + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); /** * deregister memory diff --git a/opal/mca/mpool/grdma/mpool_grdma_module.c b/opal/mca/mpool/grdma/mpool_grdma_module.c index 0f96067763..b33b769bf1 100644 --- a/opal/mca/mpool/grdma/mpool_grdma_module.c +++ b/opal/mca/mpool/grdma/mpool_grdma_module.c @@ -44,6 +44,15 @@ #include "opal/mca/mpool/base/base.h" #include "mpool_grdma.h" +static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg) +{ + return (mca_mpool_grdma_component.leave_pinned && + !(reg->flags & + (MCA_MPOOL_FLAGS_CACHE_BYPASS | + MCA_MPOOL_FLAGS_PERSIST | + MCA_MPOOL_FLAGS_INVALID))); +} + #if OPAL_CUDA_GDR_SUPPORT static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size); #endif /* OPAL_CUDA_GDR_SUPPORT */ @@ -155,7 +164,8 @@ void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size, addr = 
(void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t); #endif - if(OPAL_SUCCESS != mca_mpool_grdma_register(mpool, addr, size, flags, reg)) { + if(OPAL_SUCCESS != mca_mpool_grdma_register(mpool, addr, size, flags, + MCA_MPOOL_ACCESS_ANY, reg)) { free(base_addr); return NULL; } @@ -213,8 +223,8 @@ bool mca_mpool_grdma_evict (struct mca_mpool_base_module_t *mpool) /* * register memory */ -int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, +int mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg) { mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; @@ -227,6 +237,8 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr, OPAL_THREAD_LOCK(&mpool->rcache->lock); + *reg = NULL; + /* if cache bypass is requested don't use the cache */ base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log); bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1), @@ -249,23 +261,43 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr, * Persistent registration are always registered and placed in the cache */ if(!(bypass_cache || persist)) { /* check to see if memory is registered */ - mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg); - if (*reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) { - if (0 == (*reg)->ref_count) { - /* Leave pinned must be set for this to still be in the rcache. */ - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t *)(*reg)); - } + mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, &grdma_reg); + if (grdma_reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) { + if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) { + access_flags |= grdma_reg->access_flags; - /* This segment fits fully within an existing segment. 
*/ - mpool_grdma->stat_cache_hit++; - (*reg)->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - return OPAL_SUCCESS; + if (0 != grdma_reg->ref_count) { + if (!(grdma_reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { + grdma_reg->mpool->rcache->rcache_delete(grdma_reg->mpool->rcache, grdma_reg); + } + + /* mark the registration to go away when it is deregistered */ + grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID | MCA_MPOOL_FLAGS_CACHE_BYPASS; + } else { + if (registration_is_cacheable (grdma_reg)) { + /* pull the item out of the lru */ + opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); + } + + (void) dereg_mem (grdma_reg); + } + } else { + *reg = grdma_reg; + if (0 == grdma_reg->ref_count) { + /* Leave pinned must be set for this to still be in the rcache. */ + opal_list_remove_item(&mpool_grdma->pool->lru_list, + (opal_list_item_t *) grdma_reg); + } + + /* This segment fits fully within an existing segment. */ + mpool_grdma->stat_cache_hit++; + grdma_reg->ref_count++; + OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + return OPAL_SUCCESS; + } } mpool_grdma->stat_cache_miss++; - *reg = NULL; /* in case previous find found something */ /* Unless explicitly requested by the caller always store the * registration in the rcache. 
This will speed up the case where @@ -285,6 +317,7 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr, grdma_reg->base = base; grdma_reg->bound = bound; grdma_reg->flags = flags; + grdma_reg->access_flags = access_flags; #if OPAL_CUDA_GDR_SUPPORT if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) { mca_common_cuda_get_buffer_id(grdma_reg); @@ -389,15 +422,6 @@ int mca_mpool_grdma_find(struct mca_mpool_base_module_t *mpool, void *addr, return rc; } -static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg) -{ - return (mca_mpool_grdma_component.leave_pinned && - !(reg->flags & - (MCA_MPOOL_FLAGS_CACHE_BYPASS | - MCA_MPOOL_FLAGS_PERSIST | - MCA_MPOOL_FLAGS_INVALID))); -} - int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool, mca_mpool_base_registration_t *reg) { @@ -412,7 +436,7 @@ int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool, return OPAL_SUCCESS; } - if(registration_is_cacheable(reg)) { + if (registration_is_cacheable(reg)) { opal_list_append(&mpool_grdma->pool->lru_list, (opal_list_item_t *) reg); } else { rc = dereg_mem (reg); diff --git a/opal/mca/mpool/mpool.h b/opal/mca/mpool/mpool.h index 08d5dd03f8..a0f957438f 100644 --- a/opal/mca/mpool/mpool.h +++ b/opal/mca/mpool/mpool.h @@ -48,6 +48,14 @@ struct opal_info_t; * hooks (ptmalloc2, etc) are required. 
*/ #define MCA_MPOOL_FLAGS_NO_HOOKS 0x80 +/* access flags */ +enum { + MCA_MPOOL_ACCESS_LOCAL_WRITE = 0x01, + MCA_MPOOL_ACCESS_REMOTE_READ = 0x02, + MCA_MPOOL_ACCESS_REMOTE_WRITE = 0x04, + MCA_MPOOL_ACCESS_REMOTE_ATOMIC = 0x08, + MCA_MPOOL_ACCESS_ANY = 0x0f, +}; struct mca_mpool_base_resources_t; @@ -63,6 +71,7 @@ struct mca_mpool_base_registration_t { #if OPAL_CUDA_GDR_SUPPORT unsigned long long gpu_bufID; #endif /* OPAL_CUDA_GDR_SUPPORT */ + int32_t access_flags; }; typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t; @@ -110,6 +119,7 @@ typedef int (*mca_mpool_base_module_register_fn_t)( void * addr, size_t size, uint32_t flags, + int32_t access_flags, mca_mpool_base_registration_t** registration); /** diff --git a/opal/mca/mpool/rgpusm/mpool_rgpusm.h b/opal/mca/mpool/rgpusm/mpool_rgpusm.h index 9a89e1b780..3f43347fb2 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm.h +++ b/opal/mca/mpool/rgpusm/mpool_rgpusm.h @@ -79,7 +79,7 @@ void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t *mpool); * register block of memory */ int mca_mpool_rgpusm_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, mca_mpool_base_registration_t **reg); + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); /** * deregister memory diff --git a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c b/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c index aa96d90672..fe0854baee 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c +++ b/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c @@ -180,9 +180,9 @@ void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t* mpool) * from the remote memory. It uses the addr and size of the remote * memory for caching the registration. 
*/ -int mca_mpool_rgpusm_register(mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, - mca_mpool_base_registration_t **reg) +int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, + size_t size, uint32_t flags, int32_t access_flags, + mca_mpool_base_registration_t **reg) { mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; mca_mpool_common_cuda_reg_t *rgpusm_reg; diff --git a/opal/mca/mpool/udreg/mpool_udreg.h b/opal/mca/mpool/udreg/mpool_udreg.h index e1fb224060..50d8d37e36 100644 --- a/opal/mca/mpool/udreg/mpool_udreg.h +++ b/opal/mca/mpool/udreg/mpool_udreg.h @@ -98,6 +98,9 @@ struct mca_mpool_udreg_module_t { mca_mpool_udreg_hugepage_t *huge_page; opal_mutex_t lock; void *udreg_handle; + /** used to communicate the access flags to the underlying registration + * function */ + int requested_access_flags; }; typedef struct mca_mpool_udreg_module_t mca_mpool_udreg_module_t; @@ -129,7 +132,7 @@ void* mca_mpool_udreg_realloc( mca_mpool_base_module_t *mpool, void* addr, * register block of memory */ int mca_mpool_udreg_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, mca_mpool_base_registration_t **reg); + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); /** * deregister memory diff --git a/opal/mca/mpool/udreg/mpool_udreg_module.c b/opal/mca/mpool/udreg/mpool_udreg_module.c index 1c10829a3e..66243300ff 100644 --- a/opal/mca/mpool/udreg/mpool_udreg_module.c +++ b/opal/mca/mpool/udreg/mpool_udreg_module.c @@ -204,6 +204,8 @@ static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_conte udreg_reg->mpool = reg_context; udreg_reg->base = addr; udreg_reg->bound = (void *)((uintptr_t) addr + len); + /* pull the access flags out of the mpool module */ + udreg_reg->access_flags = mpool_udreg->requested_access_flags; rc = mpool_udreg->resources.register_mem(mpool_udreg->resources.reg_data, addr, len, udreg_reg); @@ 
-221,6 +223,11 @@ static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_contex mca_mpool_base_registration_t *udreg_reg = (mca_mpool_base_registration_t *) device_data; int rc; + if (udreg_reg->ref_count) { + /* there are still users of this registration. leave it alone */ + return 0; + } + rc = mpool_udreg->resources.deregister_mem(mpool_udreg->resources.reg_data, udreg_reg); if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { @@ -327,7 +334,7 @@ void* mca_mpool_udreg_alloc(mca_mpool_base_module_t *mpool, size_t size, #endif } - if (OPAL_SUCCESS != mca_mpool_udreg_register(mpool, addr, size, flags, reg)) { + if (OPAL_SUCCESS != mca_mpool_udreg_register(mpool, addr, size, flags, MCA_MPOOL_ACCESS_ANY, reg)) { if (udreg_module->huge_page) { mca_mpool_udreg_free_huge ((mca_mpool_udreg_hugepage_alloc_t *) base_addr); } else { @@ -355,47 +362,87 @@ bool mca_mpool_udreg_evict (struct mca_mpool_base_module_t *mpool) * register memory */ int mca_mpool_udreg_register(mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, + size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg) { mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool; - mca_mpool_base_registration_t *udreg_reg; + mca_mpool_base_registration_t *udreg_reg, *old_reg; bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS); udreg_entry_t *udreg_entry; udreg_return_t urc; + *reg = NULL; + + OPAL_THREAD_LOCK(&mpool_udreg->lock); + + /* we hold the lock so no other thread can modify these flags until the registration is complete */ + mpool_udreg->requested_access_flags = access_flags; + if (false == bypass_cache) { /* Get a udreg entry for this region */ - OPAL_THREAD_LOCK(&mpool_udreg->lock); - while (UDREG_RC_SUCCESS != - (urc = UDREG_Register (mpool_udreg->udreg_handle, addr, size, &udreg_entry))) { - /* try to remove one unused reg and retry */ - if (!mca_mpool_udreg_evict (mpool)) { - *reg = NULL; + do { + while 
(UDREG_RC_SUCCESS != + (urc = UDREG_Register (mpool_udreg->udreg_handle, addr, size, &udreg_entry))) { + /* try to remove one unused reg and retry */ + if (!mca_mpool_udreg_evict (mpool)) { + OPAL_THREAD_UNLOCK(&mpool_udreg->lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + udreg_reg = (mca_mpool_base_registration_t *) udreg_entry->device_data; + + if ((udreg_reg->access_flags & access_flags) == access_flags) { + /* sufficient access */ + break; + } + + old_reg = udreg_reg; + + /* to not confuse udreg make sure the new registration covers the same address + * range as the old one. */ + addr = old_reg->base; + size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base); + + /* make the new access flags more permissive */ + mpool_udreg->requested_access_flags = access_flags | old_reg->access_flags; + + /* get a new registration */ + udreg_reg = mca_mpool_udreg_reg_func (addr, size, mpool); + if (NULL == udreg_reg) { OPAL_THREAD_UNLOCK(&mpool_udreg->lock); return OPAL_ERR_OUT_OF_RESOURCE; } - } - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - udreg_reg = (mca_mpool_base_registration_t *) udreg_entry->device_data; + /* update the device data with the new registration */ + udreg_entry->device_data = udreg_reg; + + /* ensure that mca_mpool_udreg_deregister does not call into udreg since + * we are forcefully evicting the registration here */ + old_reg->flags |= MCA_MPOOL_FLAGS_CACHE_BYPASS | MCA_MPOOL_FLAGS_INVALID; + + mca_mpool_udreg_dereg_func (old_reg, mpool); + } while (0); + udreg_reg->mpool_context = udreg_entry; } else { /* if cache bypass is requested don't use the udreg cache */ while (NULL == (udreg_reg = mca_mpool_udreg_reg_func (addr, size, mpool))) { /* try to remove one unused reg and retry */ if (!mca_mpool_udreg_evict (mpool)) { - *reg = NULL; + OPAL_THREAD_UNLOCK(&mpool_udreg->lock); return OPAL_ERR_OUT_OF_RESOURCE; } } udreg_reg->mpool_context = NULL; } + OPAL_THREAD_UNLOCK(&mpool_udreg->lock); + udreg_reg->flags = flags; *reg = udreg_reg; - 
(*reg)->ref_count++; + udreg_reg->ref_count++; return OPAL_SUCCESS; } @@ -445,14 +492,14 @@ int mca_mpool_udreg_deregister(struct mca_mpool_base_module_t *mpool, assert(reg->ref_count > 0); - reg->ref_count--; + --reg->ref_count; - if (0 == reg->ref_count && reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS) { - mca_mpool_udreg_dereg_func (reg, mpool); - } else if (!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { + if (!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { OPAL_THREAD_LOCK(&mpool_udreg->lock); UDREG_DecrRefcount (mpool_udreg->udreg_handle, reg->mpool_context); OPAL_THREAD_UNLOCK(&mpool_udreg->lock); + } else { + mca_mpool_udreg_dereg_func (reg, mpool); } return OPAL_SUCCESS;