diff --git a/ompi/mca/crcp/base/base.h b/ompi/mca/crcp/base/base.h index e4f56b7aa4..66a6a05950 100644 --- a/ompi/mca/crcp/base/base.h +++ b/ompi/mca/crcp/base/base.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -145,7 +148,7 @@ BEGIN_C_DECLS ompi_crcp_base_btl_state_t* ompi_crcp_base_none_btl_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_rcache_base_registration_t* registration, struct opal_convertor_t* convertor, size_t reserve, size_t* size, @@ -154,7 +157,7 @@ BEGIN_C_DECLS ompi_crcp_base_btl_state_t* ompi_crcp_base_none_btl_prepare_dst( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_rcache_base_registration_t* registration, struct opal_convertor_t* convertor, size_t reserve, size_t* size, diff --git a/ompi/mca/crcp/base/crcp_base_fns.c b/ompi/mca/crcp/base/crcp_base_fns.c index 06c6b9f04d..70dcebf0ea 100644 --- a/ompi/mca/crcp/base/crcp_base_fns.c +++ b/ompi/mca/crcp/base/crcp_base_fns.c @@ -336,7 +336,7 @@ ompi_crcp_base_none_btl_free( struct mca_btl_base_module_t* btl, ompi_crcp_base_btl_state_t* ompi_crcp_base_none_btl_prepare_src( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_rcache_base_registration_t* registration, struct opal_convertor_t* convertor, size_t reserve, size_t* size, @@ -349,7 +349,7 @@ ompi_crcp_base_none_btl_prepare_src( struct 
mca_btl_base_module_t* btl, ompi_crcp_base_btl_state_t* ompi_crcp_base_none_btl_prepare_dst( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_rcache_base_registration_t* registration, struct opal_convertor_t* convertor, size_t reserve, size_t* size, diff --git a/ompi/mca/crcp/crcp.h b/ompi/mca/crcp/crcp.h index 261af794e5..ff43aa029c 100644 --- a/ompi/mca/crcp/crcp.h +++ b/ompi/mca/crcp/crcp.h @@ -235,7 +235,7 @@ typedef ompi_crcp_base_btl_state_t* (*mca_crcp_base_btl_module_free_fn_t) typedef ompi_crcp_base_btl_state_t* (*mca_crcp_base_btl_module_prepare_fn_t) ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - mca_mpool_base_registration_t* registration, + mca_rcache_base_registration_t* registration, struct opal_convertor_t* convertor, size_t reserve, size_t* size, diff --git a/ompi/mca/pml/base/pml_base_bsend.c b/ompi/mca/pml/base/pml_base_bsend.c index d00620d6b1..ad0ac06d0b 100644 --- a/ompi/mca/pml/base/pml_base_bsend.c +++ b/ompi/mca/pml/base/pml_base_bsend.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -55,10 +58,7 @@ extern char *ompi_pml_base_bsend_allocator_name; /* * Routine to return pages to sub-allocator as needed */ -static void* mca_pml_bsend_alloc_segment( - struct mca_mpool_base_module_t* module, - size_t* size_inout, - mca_mpool_base_registration_t** registration) +static void* mca_pml_bsend_alloc_segment(void *ctx, size_t *size_inout) { void *addr; size_t size = *size_inout; @@ -70,7 +70,6 @@ static void* mca_pml_bsend_alloc_segment( addr = mca_pml_bsend_addr; mca_pml_bsend_addr += size; *size_inout = size; - if (NULL != registration) *registration = NULL; return addr; } @@ -232,7 +231,7 @@ int mca_pml_base_bsend_request_start(ompi_request_t* request) /* allocate a buffer to hold packed message */ sendreq->req_addr = mca_pml_bsend_allocator->alc_alloc( - mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0, NULL); + mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0); if(NULL == sendreq->req_addr) { /* release resources when request is freed */ sendreq->req_base.req_pml_complete = true; @@ -287,7 +286,7 @@ int mca_pml_base_bsend_request_alloc(ompi_request_t* request) /* allocate a buffer to hold packed message */ sendreq->req_addr = mca_pml_bsend_allocator->alc_alloc( - mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0, NULL); + mca_pml_bsend_allocator, sendreq->req_bytes_packed, 0); if(NULL == sendreq->req_addr) { /* release resources when request is freed */ sendreq->req_base.req_pml_complete = true; @@ -321,7 +320,7 @@ void* mca_pml_base_bsend_request_alloc_buf( size_t length ) /* allocate a buffer to hold packed message */ buf = mca_pml_bsend_allocator->alc_alloc( - mca_pml_bsend_allocator, length, 0, NULL); + mca_pml_bsend_allocator, length, 0); if(NULL == buf) { /* release resources when request is freed */ OPAL_THREAD_UNLOCK(&mca_pml_bsend_mutex); diff --git a/ompi/mca/pml/ob1/pml_ob1_component.c b/ompi/mca/pml/ob1/pml_ob1_component.c index 47a62fabe5..172a4dfe7e 100644 --- 
a/ompi/mca/pml/ob1/pml_ob1_component.c +++ b/ompi/mca/pml/ob1/pml_ob1_component.c @@ -79,12 +79,9 @@ mca_pml_base_component_2_0_0_t mca_pml_ob1_component = { .pmlm_finalize = mca_pml_ob1_component_fini, }; -void *mca_pml_ob1_seg_alloc( struct mca_mpool_base_module_t* mpool, - size_t* size, - mca_mpool_base_registration_t** registration); +void *mca_pml_ob1_seg_alloc (void *ctx, size_t* size); -void mca_pml_ob1_seg_free( struct mca_mpool_base_module_t* mpool, - void* segment ); +void mca_pml_ob1_seg_free (void *ctx, void *segment); static inline int mca_pml_ob1_param_register_int( const char* param_name, @@ -354,13 +351,12 @@ int mca_pml_ob1_component_fini(void) return OMPI_SUCCESS; } -void *mca_pml_ob1_seg_alloc( struct mca_mpool_base_module_t* mpool, - size_t* size, - mca_mpool_base_registration_t** registration) { +void *mca_pml_ob1_seg_alloc (void *ctx, size_t *size) +{ return malloc(*size); } -void mca_pml_ob1_seg_free( struct mca_mpool_base_module_t* mpool, - void* segment ) { +void mca_pml_ob1_seg_free (void *ctx, void *segment) +{ free(segment); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h index 306f269182..80bcef1501 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.h @@ -81,7 +81,7 @@ do { \ buffers[0].addr = (char*) \ mca_pml_ob1.allocator->alc_alloc( mca_pml_ob1.allocator, \ buffers[0].len, \ - 0, NULL); \ + 0); \ _ptr = (unsigned char*)(buffers[0].addr); \ macro_segments[0].seg_addr.pval = buffers[0].addr; \ } \ diff --git a/ompi/mca/vprotocol/base/vprotocol_base_request.c b/ompi/mca/vprotocol/base/vprotocol_base_request.c index 77adf51c23..a4357103fb 100644 --- a/ompi/mca/vprotocol/base/vprotocol_base_request.c +++ b/ompi/mca/vprotocol/base/vprotocol_base_request.c @@ -42,8 +42,8 @@ int mca_vprotocol_base_request_parasite(void) pml_fl_save.fl_max_to_alloc, pml_fl_save.fl_num_per_alloc, pml_fl_save.fl_mpool, - pml_fl_save.fl_mpool_reg_flags, - 0, + 
pml_fl_save.fl_rcache_reg_flags, + pml_fl_save.fl_rcache, pml_fl_save.item_init, pml_fl_save.ctx); if(OMPI_SUCCESS != ret) return ret; @@ -71,8 +71,8 @@ int mca_vprotocol_base_request_parasite(void) pml_fl_save.fl_max_to_alloc, pml_fl_save.fl_num_per_alloc, pml_fl_save.fl_mpool, - pml_fl_save.fl_mpool_reg_flags, - 0, + pml_fl_save.fl_rcache_reg_flags, + pml_fl_save.fl_rcache, pml_fl_save.item_init, pml_fl_save.ctx); if(OMPI_SUCCESS != ret) return ret; diff --git a/ompi/mpi/c/alloc_mem.c b/ompi/mpi/c/alloc_mem.c index 8d1b528146..8c8fb8cd54 100644 --- a/ompi/mpi/c/alloc_mem.c +++ b/ompi/mpi/c/alloc_mem.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,6 +13,8 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -43,6 +46,8 @@ static const char FUNC_NAME[] = "MPI_Alloc_mem"; int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr) { + char info_value[MPI_MAX_INFO_VAL + 1]; + char *mpool_hints = NULL; if (MPI_PARAM_CHECK) { OMPI_ERR_INIT_FINALIZE(FUNC_NAME); @@ -69,7 +74,16 @@ int MPI_Alloc_mem(MPI_Aint size, MPI_Info info, void *baseptr) OPAL_CR_ENTER_LIBRARY(); - *((void **) baseptr) = mca_mpool_base_alloc((size_t) size, (struct opal_info_t*)info); + if (MPI_INFO_NULL != info) { + int flag; + (void) ompi_info_get (info, "mpool_hints", MPI_MAX_INFO_VAL, info_value, &flag); + if (flag) { + mpool_hints = info_value; + } + } + + *((void **) baseptr) = mca_mpool_base_alloc ((size_t) size, (struct opal_info_t*)info, + mpool_hints); OPAL_CR_EXIT_LIBRARY(); if (NULL == *((void **) baseptr)) { return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_NO_MEM, diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index b5c61cba84..59f365bd0d 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -628,13 +628,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) /* Select which MPI components to use */ - if (OMPI_SUCCESS != - (ret = mca_mpool_base_init(OPAL_ENABLE_PROGRESS_THREADS, - ompi_mpi_thread_multiple))) { - error = "mca_mpool_base_init() failed"; - goto error; - } - if (OMPI_SUCCESS != (ret = mca_pml_base_select(OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) { diff --git a/opal/class/opal_free_list.c b/opal/class/opal_free_list.c index b509fe2840..53c3c5dfcd 100644 --- a/opal/class/opal_free_list.c +++ b/opal/class/opal_free_list.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2010-2013 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. 
All rights + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ * @@ -28,6 +28,9 @@ #include "opal/align.h" #include "opal/util/output.h" #include "opal/mca/mpool/mpool.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/util/sys_limits.h" typedef struct opal_free_list_item_t opal_free_list_memory_t; @@ -49,17 +52,22 @@ static void opal_free_list_construct(opal_free_list_t* fl) fl->fl_payload_buffer_alignment = 0; fl->fl_frag_class = OBJ_CLASS(opal_free_list_item_t); fl->fl_mpool = NULL; + fl->fl_rcache = NULL; /* default flags */ - fl->fl_mpool_reg_flags = MCA_MPOOL_FLAGS_CACHE_BYPASS | - MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM; + fl->fl_rcache_reg_flags = MCA_RCACHE_FLAGS_CACHE_BYPASS | + MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM; fl->ctx = NULL; OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t); } static void opal_free_list_allocation_release (opal_free_list_t *fl, opal_free_list_memory_t *fl_mem) { + if (NULL != fl->fl_rcache) { + fl->fl_rcache->rcache_deregister (fl->fl_rcache, fl_mem->registration); + } + if (NULL != fl->fl_mpool) { - fl->fl_mpool->mpool_free (fl->fl_mpool, fl_mem->ptr, fl_mem->registration); + fl->fl_mpool->mpool_free (fl->fl_mpool, fl_mem->ptr); } else if (fl_mem->ptr) { free (fl_mem->ptr); } @@ -108,8 +116,9 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_ opal_class_t *frag_class, size_t payload_buffer_size, size_t payload_buffer_alignment, int num_elements_to_alloc, int max_elements_to_alloc, int num_elements_per_alloc, - mca_mpool_base_module_t* mpool, int mpool_reg_flags, - void *unused0, opal_free_list_item_init_fn_t item_init, void *ctx) + mca_mpool_base_module_t *mpool, int rcache_reg_flags, + mca_rcache_base_module_t *rcache, opal_free_list_item_init_fn_t item_init, + void *ctx) { /* alignment must be more than zero and power of two */ if (frag_alignment <= 1 || (frag_alignment & (frag_alignment - 1))) { @@ -137,11 
+146,12 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_ flist->fl_max_to_alloc = max_elements_to_alloc; flist->fl_num_allocated = 0; flist->fl_num_per_alloc = num_elements_per_alloc; - flist->fl_mpool = mpool; + flist->fl_mpool = mpool ? mpool : mca_mpool_base_default_module; + flist->fl_rcache = rcache; flist->fl_frag_alignment = frag_alignment; flist->fl_payload_buffer_alignment = payload_buffer_alignment; flist->item_init = item_init; - flist->fl_mpool_reg_flags |= mpool_reg_flags; + flist->fl_rcache_reg_flags |= rcache_reg_flags; flist->ctx = ctx; if (num_elements_to_alloc) { @@ -153,10 +163,10 @@ int opal_free_list_init (opal_free_list_t *flist, size_t frag_size, size_t frag_ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) { - unsigned char *ptr, *mpool_alloc_ptr = NULL, *payload_ptr = NULL; + unsigned char *ptr, *payload_ptr = NULL; opal_free_list_memory_t *alloc_ptr; - size_t alloc_size, head_size, elem_size = 0; - mca_mpool_base_registration_t *reg = NULL; + size_t alloc_size, head_size, elem_size = 0, buffer_size, align; + mca_rcache_base_registration_t *reg = NULL; int rc = OPAL_SUCCESS; if (flist->fl_max_to_alloc && (flist->fl_num_allocated + num_elements) > @@ -170,6 +180,29 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) head_size = OPAL_ALIGN(flist->fl_frag_size, flist->fl_frag_alignment, size_t); + /* NTH: calculate allocation alignment first as it might change the number of elements */ + if (0 != flist->fl_payload_buffer_size) { + elem_size = OPAL_ALIGN(flist->fl_payload_buffer_size, + flist->fl_payload_buffer_alignment, size_t); + + /* elem_size should not be 0 here */ + assert (elem_size > 0); + + buffer_size = num_elements * elem_size; + align = flist->fl_payload_buffer_alignment; + + if (MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM & flist->fl_rcache_reg_flags) { + size_t pagesize = opal_getpagesize (); + /* CUDA cannot handle registering overlapping regions, so 
make + * sure each region is page sized and page aligned. */ + align = OPAL_ALIGN(align, pagesize, size_t); + buffer_size = OPAL_ALIGN(buffer_size, pagesize, size_t); + + /* avoid wasting space in the buffer */ + num_elements = buffer_size / elem_size; + } + } + /* calculate head allocation size */ alloc_size = num_elements * head_size + sizeof(opal_free_list_memory_t) + flist->fl_frag_alignment; @@ -180,37 +213,27 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) } if (0 != flist->fl_payload_buffer_size) { - elem_size = OPAL_ALIGN(flist->fl_payload_buffer_size, - flist->fl_payload_buffer_alignment, size_t); - - /* elem_size should not be 0 here */ - assert (elem_size > 0); - /* allocate the rest from the mpool (or use memalign/malloc) */ - if(flist->fl_mpool != NULL) { - payload_ptr = mpool_alloc_ptr = - (unsigned char *) flist->fl_mpool->mpool_alloc(flist->fl_mpool, - num_elements * elem_size, - flist->fl_payload_buffer_alignment, - flist->fl_mpool_reg_flags, &reg); - } else { -#ifdef HAVE_POSIX_MEMALIGN - posix_memalign ((void **) &mpool_alloc_ptr, flist->fl_payload_buffer_alignment, - num_elements * elem_size); - payload_ptr = mpool_alloc_ptr; -#else - mpool_alloc_ptr = (unsigned char *) malloc (num_elements * elem_size + - flist->fl_payload_buffer_alignment); - payload_ptr = (unsigned char *) OPAL_ALIGN((uintptr_t)mpool_alloc_ptr, - flist->fl_payload_buffer_alignment, - uintptr_t); -#endif - } - - if(NULL == mpool_alloc_ptr) { + payload_ptr = (unsigned char *) flist->fl_mpool->mpool_alloc(flist->fl_mpool, buffer_size, align, 0); + if (NULL == payload_ptr) { free(alloc_ptr); return OPAL_ERR_TEMP_OUT_OF_RESOURCE; } + + if (flist->fl_rcache) { + rc = flist->fl_rcache->rcache_register (flist->fl_rcache, payload_ptr, num_elements * elem_size, + flist->fl_rcache_reg_flags, MCA_RCACHE_ACCESS_ANY, &reg); + if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { + free (alloc_ptr); + if (flist->fl_mpool) { + flist->fl_mpool->mpool_free (flist->fl_mpool, 
payload_ptr); + } else { + free (payload_ptr); + } + + return rc; + } + } } /* make the alloc_ptr a list item, save the chunk in the allocations list, @@ -219,7 +242,7 @@ int opal_free_list_grow_st (opal_free_list_t* flist, size_t num_elements) opal_list_append(&(flist->fl_allocations), (opal_list_item_t*)alloc_ptr); alloc_ptr->registration = reg; - alloc_ptr->ptr = mpool_alloc_ptr; + alloc_ptr->ptr = payload_ptr; ptr = (unsigned char*)alloc_ptr + sizeof(opal_free_list_memory_t); ptr = OPAL_ALIGN_PTR(ptr, flist->fl_frag_alignment, unsigned char*); diff --git a/opal/class/opal_free_list.h b/opal/class/opal_free_list.h index 4834f09c49..3a196141cc 100644 --- a/opal/class/opal_free_list.h +++ b/opal/class/opal_free_list.h @@ -77,6 +77,8 @@ struct opal_free_list_t { /** mpool to use for free list buffer allocation (posix_memalign/malloc * are used if this is NULL) */ struct mca_mpool_base_module_t *fl_mpool; + /** registration cache */ + struct mca_rcache_base_module_t *fl_rcache; /** Multi-threaded lock. Used when the free list is empty. */ opal_mutex_t fl_lock; /** Multi-threaded condition. 
Used when threads are waiting on free @@ -84,8 +86,8 @@ struct opal_free_list_t { opal_condition_t fl_condition; /** List of free list allocation */ opal_list_t fl_allocations; - /** Flags to pass to the mpool register function */ - int fl_mpool_reg_flags; + /** Flags to pass to the rcache register function */ + int fl_rcache_reg_flags; /** Free list item initialization function */ opal_free_list_item_init_fn_t item_init; /** Initialization function context */ @@ -98,7 +100,7 @@ struct mca_mpool_base_registration_t; struct opal_free_list_item_t { opal_list_item_t super; - struct mca_mpool_base_registration_t *registration; + struct mca_rcache_base_registration_t *registration; void *ptr; }; typedef struct opal_free_list_item_t opal_free_list_item_t; @@ -118,8 +120,8 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_free_list_item_t); * @param max_elements_to_alloc (IN) Maximum number of elements to allocate. * @param num_elements_per_alloc (IN) Number of elements to grow by per allocation. * @param mpool (IN) Optional memory pool for allocations. - * @param mpool_reg_flags (IN) Flags to pass to mpool registration function. - * @param unused0 (IN) Future. Must be NULL. + * @param rcache_reg_flags (IN) Flags to pass to rcache registration function. + * @param rcache (IN) Optional registration cache. * @param item_init (IN) Optional item initialization function * @param ctx (IN) Initialization function context. 
*/ @@ -134,8 +136,8 @@ OPAL_DECLSPEC int opal_free_list_init (opal_free_list_t *free_list, int max_elements_to_alloc, int num_elements_per_alloc, struct mca_mpool_base_module_t *mpool, - int mpool_reg_flags, - void *unused0, + int rcache_reg_flags, + struct mca_rcache_base_module_t *rcache, opal_free_list_item_init_fn_t item_init, void *ctx); diff --git a/opal/include/opal/align.h b/opal/include/opal/align.h index 09d351985b..ea68d33bc6 100644 --- a/opal/include/opal/align.h +++ b/opal/include/opal/align.h @@ -22,6 +22,8 @@ #ifndef OPAL_ALIGN_H #define OPAL_ALIGN_H +#define OPAL_DOWN_ALIGN(x,a,t) ((x) & ~(((t)(a)-1))) +#define OPAL_DOWN_ALIGN_PTR(x,a,t) ((t)OPAL_DOWN_ALIGN((uintptr_t)x, a, uintptr_t)) #define OPAL_ALIGN(x,a,t) (((x)+((t)(a)-1)) & ~(((t)(a)-1))) #define OPAL_ALIGN_PTR(x,a,t) ((t)OPAL_ALIGN((uintptr_t)x, a, uintptr_t)) #define OPAL_ALIGN_PAD_AMOUNT(x,s) ((~((uintptr_t)(x))+1) & ((uintptr_t)(s)-1)) diff --git a/opal/mca/allocator/allocator.h b/opal/mca/allocator/allocator.h index 7447edc206..77180462c3 100644 --- a/opal/mca/allocator/allocator.h +++ b/opal/mca/allocator/allocator.h @@ -27,7 +27,6 @@ #include "opal_config.h" #include "opal/mca/mca.h" -#include "opal/mca/mpool/mpool.h" BEGIN_C_DECLS @@ -40,16 +39,14 @@ struct mca_allocator_base_module_t; typedef void* (*mca_allocator_base_module_alloc_fn_t)( struct mca_allocator_base_module_t*, size_t size, - size_t align, - mca_mpool_base_registration_t** registration); + size_t align); /** * The realloc function typedef */ typedef void* (*mca_allocator_base_module_realloc_fn_t)( struct mca_allocator_base_module_t*, - void*, size_t, - mca_mpool_base_registration_t** registration); + void*, size_t); /** * Free function typedef @@ -90,7 +87,7 @@ struct mca_allocator_base_module_t { mca_allocator_base_module_finalize_fn_t alc_finalize; /**< Finalize and free everything */ /* memory pool and resources */ - struct mca_mpool_base_module_t* alc_mpool; + void *alc_context; }; /** * Convenience typedef. 
@@ -103,19 +100,16 @@ typedef struct mca_allocator_base_module_t mca_allocator_base_module_t; * provided by the module to the allocator framework. */ -typedef void* (*mca_allocator_base_component_segment_alloc_fn_t)( - struct mca_mpool_base_module_t* module, - size_t* size, - mca_mpool_base_registration_t** registration); +typedef void* (*mca_allocator_base_component_segment_alloc_fn_t)(void *ctx, + size_t *size); /** * A function to free memory from the control of the allocator framework * back to the system. This function is to be provided by the module to the * allocator framework. */ -typedef void (*mca_allocator_base_component_segment_free_fn_t)( - struct mca_mpool_base_module_t* module, - void* segment); +typedef void (*mca_allocator_base_component_segment_free_fn_t)(void *ctx, + void *segment); /** @@ -126,7 +120,7 @@ typedef struct mca_allocator_base_module_t* bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, mca_allocator_base_component_segment_free_fn_t segment_free, - struct mca_mpool_base_module_t* mpool + void *context ); /** diff --git a/opal/mca/allocator/base/base.h b/opal/mca/allocator/base/base.h index 13251759f8..4d3d77bd9d 100644 --- a/opal/mca/allocator/base/base.h +++ b/opal/mca/allocator/base/base.h @@ -45,7 +45,7 @@ struct mca_allocator_base_selected_module_t { typedef struct mca_allocator_base_selected_module_t mca_allocator_base_selected_module_t; /** - * Declaces mca_mpool_base_selected_module_t as a class. + * Declares mca_allocator_base_selected_module_t as a class. 
*/ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_allocator_base_selected_module_t); diff --git a/opal/mca/allocator/basic/allocator_basic.c b/opal/mca/allocator/basic/allocator_basic.c index 22e1bc2a18..53c40ad85f 100644 --- a/opal/mca/allocator/basic/allocator_basic.c +++ b/opal/mca/allocator/basic/allocator_basic.c @@ -78,7 +78,7 @@ mca_allocator_base_module_t* mca_allocator_basic_component_init( bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, mca_allocator_base_component_segment_free_fn_t segment_free, - struct mca_mpool_base_module_t* mpool) + void *context) { mca_allocator_basic_module_t *module = (mca_allocator_basic_module_t *) malloc(sizeof(mca_allocator_basic_module_t)); @@ -91,7 +91,7 @@ mca_allocator_base_module_t* mca_allocator_basic_component_init( module->super.alc_free = mca_allocator_basic_free; module->super.alc_compact = mca_allocator_basic_compact; module->super.alc_finalize = mca_allocator_basic_finalize; - module->super.alc_mpool = mpool; + module->super.alc_context = context; module->seg_alloc = segment_alloc; module->seg_free = segment_free; OBJ_CONSTRUCT(&module->seg_list, opal_list_t); @@ -163,8 +163,7 @@ static void mca_allocator_basic_combine_next( void *mca_allocator_basic_alloc( mca_allocator_base_module_t * base, size_t size, - size_t align, - mca_mpool_base_registration_t** registration) + size_t align) { mca_allocator_basic_module_t* module = (mca_allocator_basic_module_t*)base; mca_allocator_basic_segment_t* seg; @@ -198,7 +197,7 @@ void *mca_allocator_basic_alloc( /* request additional block */ allocated_size = size; - if(NULL == (addr = (unsigned char *)module->seg_alloc(module->super.alc_mpool, &allocated_size, registration))) { + if(NULL == (addr = (unsigned char *)module->seg_alloc(module->super.alc_context, &allocated_size))) { OPAL_THREAD_UNLOCK(&module->seg_lock); return NULL; } @@ -239,14 +238,13 @@ void *mca_allocator_basic_alloc( void * mca_allocator_basic_realloc( 
mca_allocator_base_module_t * base, void * ptr, - size_t size, - mca_mpool_base_registration_t** registration) + size_t size) { unsigned char* addr = ((unsigned char*)ptr) - sizeof(size_t); size_t alloc_size = *(size_t*)addr; if(size <= alloc_size) return ptr; - addr = (unsigned char *)mca_allocator_basic_alloc(base,size,0,registration); + addr = (unsigned char *)mca_allocator_basic_alloc(base, size, 0); if(addr == NULL) return addr; memcpy(addr,ptr,alloc_size); diff --git a/opal/mca/allocator/basic/allocator_basic.h b/opal/mca/allocator/basic/allocator_basic.h index 0ae23a0c26..aa257457db 100644 --- a/opal/mca/allocator/basic/allocator_basic.h +++ b/opal/mca/allocator/basic/allocator_basic.h @@ -77,7 +77,7 @@ mca_allocator_base_module_t* mca_allocator_basic_component_init( bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, mca_allocator_base_component_segment_free_fn_t segment_free, - struct mca_mpool_base_module_t* module + void *ctx ); /** @@ -94,8 +94,7 @@ mca_allocator_base_module_t* mca_allocator_basic_component_init( void * mca_allocator_basic_alloc( mca_allocator_base_module_t * mem, size_t size, - size_t align, - mca_mpool_base_registration_t** registration); + size_t align); /** * Attempts to resize the passed region of memory into a larger or a smaller @@ -114,8 +113,7 @@ mca_allocator_base_module_t* mca_allocator_basic_component_init( void * mca_allocator_basic_realloc( mca_allocator_base_module_t * mem, void * ptr, - size_t size, - mca_mpool_base_registration_t** registration); + size_t size); /** * Frees the passed region of memory diff --git a/opal/mca/allocator/bucket/allocator_bucket.c b/opal/mca/allocator/bucket/allocator_bucket.c index 148245356f..e57b1648cb 100644 --- a/opal/mca/allocator/bucket/allocator_bucket.c +++ b/opal/mca/allocator/bucket/allocator_bucket.c @@ -24,14 +24,13 @@ #include "opal/mca/allocator/allocator.h" #include "opal/constants.h" #include 
"opal/mca/allocator/bucket/allocator_bucket_alloc.h" -#include "opal/mca/mpool/mpool.h" +#include "opal/mca/base/mca_base_var.h" struct mca_allocator_base_module_t* mca_allocator_bucket_module_init( bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, mca_allocator_base_component_segment_free_fn_t segment_free, - struct mca_mpool_base_module_t* mpool - ); + void *context); int mca_allocator_bucket_module_open(void); @@ -39,8 +38,7 @@ int mca_allocator_bucket_module_close(void); void * mca_allocator_bucket_alloc_wrapper( struct mca_allocator_base_module_t* allocator, - size_t size, size_t align, - mca_mpool_base_registration_t** registration); + size_t size, size_t align); static int mca_allocator_num_buckets; @@ -66,7 +64,7 @@ struct mca_allocator_base_module_t* mca_allocator_bucket_module_init( bool enable_mpi_threads, mca_allocator_base_component_segment_alloc_fn_t segment_alloc, mca_allocator_base_component_segment_free_fn_t segment_free, - struct mca_mpool_base_module_t* mpool) + void *context) { size_t alloc_size = sizeof(mca_allocator_bucket_t); mca_allocator_bucket_t * retval; @@ -87,7 +85,7 @@ struct mca_allocator_base_module_t* mca_allocator_bucket_module_init( allocator->super.alc_free = mca_allocator_bucket_free; allocator->super.alc_compact = mca_allocator_bucket_cleanup; allocator->super.alc_finalize = mca_allocator_bucket_finalize; - allocator->super.alc_mpool = mpool; + allocator->super.alc_context = context; return (mca_allocator_base_module_t *) allocator; } @@ -111,13 +109,12 @@ int mca_allocator_bucket_module_close(void) { void * mca_allocator_bucket_alloc_wrapper( struct mca_allocator_base_module_t* allocator, size_t size, - size_t align, - mca_mpool_base_registration_t** registration) + size_t align) { if(0 == align){ - return mca_allocator_bucket_alloc(allocator, size, registration); + return mca_allocator_bucket_alloc(allocator, size); } - return mca_allocator_bucket_alloc_align(allocator, size, align, 
registration); + return mca_allocator_bucket_alloc_align(allocator, size, align); } diff --git a/opal/mca/allocator/bucket/allocator_bucket_alloc.c b/opal/mca/allocator/bucket/allocator_bucket_alloc.c index e5dc81eeb4..be3db944d0 100644 --- a/opal/mca/allocator/bucket/allocator_bucket_alloc.c +++ b/opal/mca/allocator/bucket/allocator_bucket_alloc.c @@ -71,10 +71,8 @@ mca_allocator_bucket_t * mca_allocator_bucket_init( * region or NULL if there was an error * */ -void * mca_allocator_bucket_alloc( - mca_allocator_base_module_t * mem, - size_t size, - mca_mpool_base_registration_t** registration) +void * mca_allocator_bucket_alloc(mca_allocator_base_module_t * mem, + size_t size) { mca_allocator_bucket_t * mem_options = (mca_allocator_bucket_t *) mem; /* initialize for the later bit shifts */ @@ -113,7 +111,7 @@ void * mca_allocator_bucket_alloc( allocated_size += sizeof(mca_allocator_bucket_segment_head_t); /* attempt to get the memory */ segment_header = (mca_allocator_bucket_segment_head_t *) - mem_options->get_mem_fn(mem_options->super.alc_mpool, &allocated_size, registration); + mem_options->get_mem_fn(mem_options->super.alc_context, &allocated_size); if(NULL == segment_header) { /* release the lock */ OPAL_THREAD_UNLOCK(&(mem_options->buckets[bucket_num].lock)); @@ -153,11 +151,8 @@ void * mca_allocator_bucket_alloc( /* * allocates an aligned region of memory */ -void * mca_allocator_bucket_alloc_align( - mca_allocator_base_module_t * mem, - size_t size, - size_t alignment, - mca_mpool_base_registration_t** registration) +void * mca_allocator_bucket_alloc_align(mca_allocator_base_module_t * mem, + size_t size, size_t alignment) { mca_allocator_bucket_t * mem_options = (mca_allocator_bucket_t *) mem; int bucket_num = 1; @@ -177,7 +172,7 @@ void * mca_allocator_bucket_alloc_align( bucket_size = size + sizeof(mca_allocator_bucket_chunk_header_t); allocated_size = aligned_max_size; /* get some memory */ - ptr = mem_options->get_mem_fn(mem_options->super.alc_mpool, 
&allocated_size, registration); + ptr = mem_options->get_mem_fn(mem_options->super.alc_context, &allocated_size); if(NULL == ptr) { return(NULL); } @@ -236,11 +231,8 @@ void * mca_allocator_bucket_alloc_align( /* * function to reallocate the segment of memory */ -void * mca_allocator_bucket_realloc( - mca_allocator_base_module_t * mem, - void * ptr, - size_t size, - mca_mpool_base_registration_t** registration) +void * mca_allocator_bucket_realloc(mca_allocator_base_module_t * mem, + void * ptr, size_t size) { mca_allocator_bucket_t * mem_options = (mca_allocator_bucket_t *) mem; /* initialize for later bit shifts */ @@ -261,7 +253,7 @@ void * mca_allocator_bucket_realloc( return(ptr); } /* we need a new space in memory, so let's get it */ - ret_ptr = mca_allocator_bucket_alloc((mca_allocator_base_module_t *) mem_options, size, registration); + ret_ptr = mca_allocator_bucket_alloc((mca_allocator_base_module_t *) mem_options, size); if(NULL == ret_ptr) { /* we were unable to get a larger area of memory */ return(NULL); @@ -341,7 +333,7 @@ int mca_allocator_bucket_cleanup(mca_allocator_base_module_t * mem) next_segment = segment->next_segment; /* free the memory */ if(mem_options->free_mem_fn) - mem_options->free_mem_fn(mem->alc_mpool, segment); + mem_options->free_mem_fn(mem->alc_context, segment); segment = next_segment; } mem_options->buckets[i].free_chunk = NULL; @@ -378,7 +370,7 @@ int mca_allocator_bucket_cleanup(mca_allocator_base_module_t * mem) *segment_header = segment->next_segment; /* free the memory */ if(mem_options->free_mem_fn) - mem_options->free_mem_fn(mem->alc_mpool, segment); + mem_options->free_mem_fn(mem->alc_context, segment); } else { /* go to next segment */ segment_header = &((*segment_header)->next_segment); diff --git a/opal/mca/allocator/bucket/allocator_bucket_alloc.h b/opal/mca/allocator/bucket/allocator_bucket_alloc.h index f2af13a122..fe0b66e881 100644 --- a/opal/mca/allocator/bucket/allocator_bucket_alloc.h +++ 
b/opal/mca/allocator/bucket/allocator_bucket_alloc.h @@ -1,4 +1,5 @@ -/** +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -131,8 +134,7 @@ typedef struct mca_allocator_bucket_t mca_allocator_bucket_t; */ void * mca_allocator_bucket_alloc( mca_allocator_base_module_t * mem, - size_t size, - mca_mpool_base_registration_t** registration); + size_t size); /** * Accepts a request for memory in a specific region defined by the @@ -152,8 +154,7 @@ typedef struct mca_allocator_bucket_t mca_allocator_bucket_t; void * mca_allocator_bucket_alloc_align( mca_allocator_base_module_t * mem, size_t size, - size_t alignment, - mca_mpool_base_registration_t** registration); + size_t alignment); /** * Attempts to resize the passed region of memory into a larger or a smaller @@ -172,8 +173,7 @@ typedef struct mca_allocator_bucket_t mca_allocator_bucket_t; void * mca_allocator_bucket_realloc( mca_allocator_base_module_t * mem, void * ptr, - size_t size, - mca_mpool_base_registration_t** registration); + size_t size); /** * Frees the passed region of memory diff --git a/opal/mca/base/mca_base_var_group.c b/opal/mca/base/mca_base_var_group.c index bbbd6166b0..6fec2e2106 100644 --- a/opal/mca/base/mca_base_var_group.c +++ b/opal/mca/base/mca_base_var_group.c @@ -218,6 +218,12 @@ static int group_register (const char *project_name, const char *framework_name, return -1; } + /* avoid groups of the form opal_opal, ompi_ompi, etc */ + if (NULL != project_name && NULL != framework_name && + (0 == strcmp (project_name, framework_name))) { +
project_name = NULL; + } + group_id = group_find (project_name, framework_name, component_name, true); if (0 <= group_id) { ret = mca_base_var_group_get_internal (group_id, &group, true); diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 691af933d1..343a024f46 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -122,6 +122,7 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/mca.h" #include "opal/mca/mpool/mpool.h" +#include "opal/mca/rcache/rcache.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" @@ -256,23 +257,23 @@ enum { /** Allow local write on the registered region. If a region is registered * with this flag the registration can be used as the local handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_MPOOL_ACCESS_LOCAL_WRITE, + MCA_BTL_REG_FLAG_LOCAL_WRITE = MCA_RCACHE_ACCESS_LOCAL_WRITE, /** Allow remote read on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_get operation. */ - MCA_BTL_REG_FLAG_REMOTE_READ = MCA_MPOOL_ACCESS_REMOTE_READ, + MCA_BTL_REG_FLAG_REMOTE_READ = MCA_RCACHE_ACCESS_REMOTE_READ, /** Allow remote write on the registered region. If a region is registered * with this flag the registration can be used as the remote handle for a * btl_put operation. */ - MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_MPOOL_ACCESS_REMOTE_WRITE, + MCA_BTL_REG_FLAG_REMOTE_WRITE = MCA_RCACHE_ACCESS_REMOTE_WRITE, /** Allow remote atomic operations on the registered region. If a region is * registered with this flag the registration can be used as the remote * handle for a btl_atomic_op or btl_atomic_fop operation. */ - MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_MPOOL_ACCESS_REMOTE_ATOMIC, + MCA_BTL_REG_FLAG_REMOTE_ATOMIC = MCA_RCACHE_ACCESS_REMOTE_ATOMIC, /** Allow any btl operation on the registered region. 
If a region is registered * with this flag the registration can be used as the local or remote handle for * any btl operation. */ - MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_MPOOL_ACCESS_ANY, + MCA_BTL_REG_FLAG_ACCESS_ANY = MCA_RCACHE_ACCESS_ANY, #if OPAL_CUDA_GDR_SUPPORT /** Region is in GPU memory */ MCA_BTL_REG_FLAG_CUDA_GPU_MEM = 0x00010000, diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index 2f858d36c9..ad11ed8b48 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -57,7 +57,7 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/mpool.h" -#include "opal/mca/mpool/grdma/mpool_grdma.h" +#include "opal/mca/rcache/rcache.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" @@ -733,7 +733,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device) mca_btl_openib_component.buffer_alignment, mca_btl_openib_component.ib_free_list_num, -1, mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, NULL, mca_btl_openib_frag_init, + device->mpool, 0, device->rcache, mca_btl_openib_frag_init, init_data); if (OPAL_SUCCESS != rc) { /* If we're "out of memory", this usually means that we ran @@ -774,7 +774,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device) mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, NULL, mca_btl_openib_frag_init, + device->mpool, 0, device->rcache, mca_btl_openib_frag_init, init_data); if (OPAL_SUCCESS != rc) { /* If we're "out of memory", this usually means that we @@ -807,7 +807,7 @@ static int prepare_device_for_use (mca_btl_openib_device_t *device) mca_btl_openib_component.ib_free_list_num, mca_btl_openib_component.ib_free_list_max, mca_btl_openib_component.ib_free_list_inc, - device->mpool, 0, NULL, mca_btl_openib_frag_init, + device->mpool, 0, device->rcache, 
mca_btl_openib_frag_init, init_data)) { rc = OPAL_ERROR; goto exit; @@ -1903,6 +1903,7 @@ static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_ mca_btl_base_endpoint_t *endpoint, void *base, size_t size, uint32_t flags) { + mca_btl_openib_module_t *openib_module = (mca_btl_openib_module_t *) btl; mca_btl_openib_reg_t *reg; uint32_t mflags = 0; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; @@ -1910,12 +1911,12 @@ static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_ #if OPAL_CUDA_GDR_SUPPORT if (flags & MCA_BTL_REG_FLAG_CUDA_GPU_MEM) { - mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; + mflags |= MCA_RCACHE_FLAGS_CUDA_GPU_MEM; } #endif /* OPAL_CUDA_GDR_SUPPORT */ - rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mflags, access_flags, - (mca_mpool_base_registration_t **) &reg); + rc = openib_module->device->rcache->rcache_register (openib_module->device->rcache, base, size, mflags, + access_flags, (mca_rcache_base_registration_t **) &reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == reg)) { return NULL; } @@ -1925,9 +1926,10 @@ static mca_btl_base_registration_handle_t *mca_btl_openib_register_mem (mca_btl_ static int mca_btl_openib_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) { + mca_btl_openib_module_t *openib_module = (mca_btl_openib_module_t *) btl; mca_btl_openib_reg_t *reg = (mca_btl_openib_reg_t *)((intptr_t) handle - offsetof (mca_btl_openib_reg_t, btl_handle)); - btl->btl_mpool->mpool_deregister (btl->btl_mpool, (mca_mpool_base_registration_t *) reg); + openib_module->device->rcache->rcache_deregister (openib_module->device->rcache, (mca_rcache_base_registration_t *) reg); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index f5772f6360..3956f39acf 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -45,6 +45,7 @@ #include "opal/mca/event/event.h" #include
"opal/threads/threads.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/rcache/rcache.h" #include "opal/mca/mpool/mpool.h" #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/btl/base/base.h" @@ -184,8 +185,11 @@ struct mca_btl_openib_component_t { opal_mutex_t ib_lock; /**< lock for accessing module state */ - char* ib_mpool_name; - /**< name of ib memory pool */ + char* ib_mpool_hints; + /**< hints for selecting an mpool component */ + + char *ib_rcache_name; + /**< name of ib registration cache */ uint8_t num_pp_qps; /**< number of pp qp's */ uint8_t num_srq_qps; /**< number of srq qp's */ @@ -374,6 +378,7 @@ typedef struct mca_btl_openib_device_t { struct ibv_cq *ib_cq[2]; uint32_t cq_size[2]; mca_mpool_base_module_t *mpool; + mca_rcache_base_module_t *rcache; /* MTU for this device */ uint32_t mtu; /* Whether this device supports eager RDMA */ @@ -502,7 +507,7 @@ struct mca_btl_base_registration_handle_t { }; struct mca_btl_openib_reg_t { - mca_mpool_base_registration_t base; + mca_rcache_base_registration_t base; struct ibv_mr *mr; mca_btl_base_registration_handle_t btl_handle; }; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index 0fc4502f6e..fd926c1b78 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -67,7 +67,8 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/grdma/mpool_grdma.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/rcache/base/base.h" #include "opal/mca/common/cuda/common_cuda.h" #include "opal/mca/common/verbs/common_verbs.h" #include "opal/runtime/opal_params.h" @@ -512,27 +513,27 @@ static void btl_openib_control(mca_btl_base_module_t* btl, } } -static int openib_reg_mr(void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg) +static int openib_reg_mr (void *reg_data, void *base, size_t size, + 
mca_rcache_base_registration_t *reg) { mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data; mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; enum ibv_access_flags access_flag = 0; - if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_READ) { + if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) { access_flag |= IBV_ACCESS_REMOTE_READ; } - if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_WRITE) { + if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_WRITE) { access_flag |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; } - if (reg->access_flags & MCA_MPOOL_ACCESS_LOCAL_WRITE) { + if (reg->access_flags & MCA_RCACHE_ACCESS_LOCAL_WRITE) { access_flag |= IBV_ACCESS_LOCAL_WRITE; } #if HAVE_DECL_IBV_ATOMIC_HCA - if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_ATOMIC) { + if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_ATOMIC) { access_flag |= IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_LOCAL_WRITE; } #endif @@ -545,7 +546,7 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, device->mem_reg_active += size; #if HAVE_DECL_IBV_ACCESS_SO - if (reg->flags & MCA_MPOOL_FLAGS_SO_MEM) { + if (reg->flags & MCA_RCACHE_FLAGS_SO_MEM) { access_flag |= IBV_ACCESS_SO; } #endif @@ -567,16 +568,16 @@ static int openib_reg_mr(void *reg_data, void *base, size_t size, (int) (reg->bound - reg->base + 1), reg->flags)); #if OPAL_CUDA_SUPPORT - if (reg->flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) { - mca_common_cuda_register(base, size, - openib_reg->base.mpool->mpool_component->mpool_version.mca_component_name); + if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) { + mca_common_cuda_register (base, size, + openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name); } #endif return OPAL_SUCCESS; } -static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) +static int openib_dereg_mr(void *reg_data, mca_rcache_base_registration_t *reg) { mca_btl_openib_device_t *device = (mca_btl_openib_device_t*)reg_data; 
mca_btl_openib_reg_t *openib_reg = (mca_btl_openib_reg_t*)reg; @@ -593,9 +594,9 @@ static int openib_dereg_mr(void *reg_data, mca_mpool_base_registration_t *reg) } #if OPAL_CUDA_SUPPORT - if (reg->flags & MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM) { + if (reg->flags & MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM) { mca_common_cuda_unregister(openib_reg->base.base, - openib_reg->base.mpool->mpool_component->mpool_version.mca_component_name); + openib_reg->base.rcache->rcache_component->rcache_version.mca_component_name); } #endif @@ -878,6 +879,7 @@ static void device_construct(mca_btl_openib_device_t *device) device->ib_dev_context = NULL; device->ib_pd = NULL; device->mpool = NULL; + device->rcache = NULL; #if OPAL_ENABLE_PROGRESS_THREADS == 1 device->ib_channel = NULL; #endif @@ -960,8 +962,8 @@ static void device_destruct(mca_btl_openib_device_t *device) } } - if (OPAL_SUCCESS != mca_mpool_base_module_destroy(device->mpool)) { - BTL_VERBOSE(("Failed to release mpool")); + if (OPAL_SUCCESS != mca_rcache_base_module_destroy (device->rcache)) { + BTL_VERBOSE(("failed to release registration cache")); goto device_error; } @@ -1590,7 +1592,7 @@ static uint64_t calculate_max_reg (const char *device_name) static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) { - struct mca_mpool_base_resources_t mpool_resources; + mca_rcache_base_resources_t rcache_resources; mca_btl_openib_device_t *device; uint8_t i, k = 0; int ret = -1, port_cnt; @@ -1813,20 +1815,25 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev) "eager RDMA and progress threads", true); } - asprintf (&mpool_resources.pool_name, "verbs.%" PRIu64, device->ib_dev_attr.node_guid); - mpool_resources.reg_data = (void*)device; - mpool_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t); - mpool_resources.register_mem = openib_reg_mr; - mpool_resources.deregister_mem = openib_dereg_mr; - device->mpool = - mca_mpool_base_module_create(mca_btl_openib_component.ib_mpool_name, - device, 
&mpool_resources); - if(NULL == device->mpool){ + asprintf (&rcache_resources.cache_name, "verbs.%" PRIu64, device->ib_dev_attr.node_guid); + rcache_resources.reg_data = (void*)device; + rcache_resources.sizeof_reg = sizeof(mca_btl_openib_reg_t); + rcache_resources.register_mem = openib_reg_mr; + rcache_resources.deregister_mem = openib_dereg_mr; + device->rcache = + mca_rcache_base_module_create (mca_btl_openib_component.ib_rcache_name, + device, &rcache_resources); + if (NULL == device->rcache) { /* Don't print an error message here -- we'll get one from mpool_create anyway */ goto error; } + device->mpool = mca_mpool_base_module_lookup (mca_btl_openib_component.ib_mpool_hints); + if (NULL == device->mpool) { + goto error; + } + #if OPAL_ENABLE_PROGRESS_THREADS device->ib_channel = ibv_create_comp_channel(device->ib_dev_context); if (NULL == device->ib_channel) { @@ -2223,9 +2230,6 @@ error: ibv_destroy_comp_channel(device->ib_channel); } #endif - if (device->mpool) { - mca_mpool_base_module_destroy(device->mpool); - } if (device->ib_pd) { ibv_dealloc_pd(device->ib_pd); diff --git a/opal/mca/btl/openib/btl_openib_eager_rdma.h b/opal/mca/btl/openib/btl_openib_eager_rdma.h index 993a5958fc..0ba5a030d4 100644 --- a/opal/mca/btl/openib/btl_openib_eager_rdma.h +++ b/opal/mca/btl/openib/btl_openib_eager_rdma.h @@ -20,6 +20,7 @@ BEGIN_C_DECLS struct mca_btl_openib_eager_rdma_local_t { opal_ptr_t base; /**< buffer for RDMAing eager messages */ + void *alloc_base; /**< allocated base */ mca_btl_openib_recv_frag_t *frags; mca_btl_openib_reg_t *reg; uint16_t head; /**< RDMA buffer to poll */ diff --git a/opal/mca/btl/openib/btl_openib_endpoint.c b/opal/mca/btl/openib/btl_openib_endpoint.c index 0186f8d5e2..ec2649d517 100644 --- a/opal/mca/btl/openib/btl_openib_endpoint.c +++ b/opal/mca/btl/openib/btl_openib_endpoint.c @@ -347,14 +347,17 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint) * was not in "connect" or "bad" flow (failed to allocate 
memory) * and changed the pointer back to NULL */ - if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, - (void*)1)) { - if ((void*)1 != endpoint->eager_rdma_local.base.pval && - NULL != endpoint->eager_rdma_local.base.pval) { - endpoint->endpoint_btl->super.btl_mpool->mpool_free(endpoint->endpoint_btl->super.btl_mpool, - endpoint->eager_rdma_local.base.pval, - (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); - pval_clean=true; + if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL, (void*)1)) { + if (NULL != endpoint->eager_rdma_local.reg) { + endpoint->endpoint_btl->device->rcache->rcache_deregister (endpoint->endpoint_btl->device->rcache, + &endpoint->eager_rdma_local.reg->base); + endpoint->eager_rdma_local.reg = NULL; + } + + void *alloc_base = opal_atomic_swap_ptr (&endpoint->eager_rdma_local.alloc_base, NULL); + if (alloc_base) { + endpoint->endpoint_btl->super.btl_mpool->mpool_free (endpoint->endpoint_btl->super.btl_mpool, alloc_base); + pval_clean = true; } } else { pval_clean=true; @@ -861,10 +864,10 @@ void mca_btl_openib_endpoint_connect_eager_rdma( mca_btl_openib_endpoint_t* endpoint) { mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl; - char *buf; + char *buf, *alloc_base; mca_btl_openib_recv_frag_t *headers_buf; - int i; - uint32_t flag = MCA_MPOOL_FLAGS_CACHE_BYPASS; + int i, rc; + uint32_t flag = MCA_RCACHE_FLAGS_CACHE_BYPASS; /* Set local rdma pointer to 1 temporarily so other threads will not try * to enter the function */ @@ -890,19 +893,26 @@ void mca_btl_openib_endpoint_connect_eager_rdma( The following flag will be interpreted and the appropriate steps will be taken when the memory is registered in openib_reg_mr(). 
*/ - flag |= MCA_MPOOL_FLAGS_SO_MEM; + flag |= MCA_RCACHE_FLAGS_SO_MEM; #endif - buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, - openib_btl->eager_rdma_frag_size * - mca_btl_openib_component.eager_rdma_num, - mca_btl_openib_component.buffer_alignment, - flag, - (mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg); + alloc_base = buf = (char *) openib_btl->super.btl_mpool->mpool_alloc(openib_btl->super.btl_mpool, + openib_btl->eager_rdma_frag_size * + mca_btl_openib_component.eager_rdma_num, + mca_btl_openib_component.buffer_alignment, + 0); if(!buf) goto free_headers_buf; + rc = openib_btl->device->rcache->rcache_register (openib_btl->device->rcache, buf, openib_btl->eager_rdma_frag_size * + mca_btl_openib_component.eager_rdma_num, flag, MCA_RCACHE_ACCESS_ANY, + (mca_rcache_base_registration_t**)&endpoint->eager_rdma_local.reg); + if (OPAL_SUCCESS != rc) { + openib_btl->super.btl_mpool->mpool_free (openib_btl->super.btl_mpool, alloc_base); + goto free_headers_buf; + } + buf = buf + openib_btl->eager_rdma_frag_size - sizeof(mca_btl_openib_footer_t) - openib_btl->super.btl_eager_limit - sizeof(mca_btl_openib_header_t); @@ -913,7 +923,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma( mca_btl_openib_frag_init_data_t init_data; item = (opal_free_list_item_t*)&headers_buf[i]; - item->registration = (mca_mpool_base_registration_t *)endpoint->eager_rdma_local.reg; + item->registration = (mca_rcache_base_registration_t *)endpoint->eager_rdma_local.reg; item->ptr = buf + i * openib_btl->eager_rdma_frag_size; OBJ_CONSTRUCT(item, mca_btl_openib_recv_frag_t); @@ -941,6 +951,7 @@ void mca_btl_openib_endpoint_connect_eager_rdma( /* set local rdma pointer to real value */ (void)opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, (void*)1, buf); + endpoint->eager_rdma_local.alloc_base = alloc_base; if(mca_btl_openib_endpoint_send_eager_rdma(endpoint) == OPAL_SUCCESS) { mca_btl_openib_device_t *device = 
endpoint->endpoint_btl->device; @@ -957,8 +968,9 @@ void mca_btl_openib_endpoint_connect_eager_rdma( return; } - openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, - buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg); + openib_btl->device->rcache->rcache_deregister (openib_btl->device->rcache, + (mca_rcache_base_registration_t*)endpoint->eager_rdma_local.reg); + openib_btl->super.btl_mpool->mpool_free(openib_btl->super.btl_mpool, buf); free_headers_buf: free(headers_buf); unlock_rdma_local: diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 07dcdd07c7..05f01b035a 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -316,9 +316,12 @@ int btl_openib_register_mca_params(void) "(must be >= 1)", 32, &mca_btl_openib_component.ib_free_list_inc, REGINT_GE_ONE)); - CHECK(reg_string("mpool", NULL, - "Name of the memory pool to be used (it is unlikely that you will ever want to change this)", - "grdma", &mca_btl_openib_component.ib_mpool_name, + CHECK(reg_string("mpool_hints", NULL, "hints for selecting a memory pool (default: none)", + NULL, &mca_btl_openib_component.ib_mpool_hints, + 0)); + CHECK(reg_string("rcache", NULL, + "Name of the registration cache to be used (it is unlikely that you will ever want to change this)", + "grdma", &mca_btl_openib_component.ib_rcache_name, 0)); CHECK(reg_int("reg_mru_len", NULL, "Length of the registration cache most recently used list " diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index 2dd5caead0..7920fd7aa3 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -74,7 +74,6 @@ #include "btl_openib_async.h" #include "connect/connect.h" -#include "opal/mca/mpool/grdma/mpool_grdma.h" #include "opal/util/sys_limits.h" #if (ENABLE_DYNAMIC_SL) @@ -1367,7 +1366,7 @@ 
static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ &init_attr))) { /* NTH: this process may be out of registered memory. try evicting an item from the lru of this btl's mpool */ - if (false == mca_mpool_grdma_evict (m->btl->super.btl_mpool)) { + if (false == m->btl->device->rcache->rcache_evict (m->btl->device->rcache)) { break; } } @@ -1378,7 +1377,7 @@ static int udcm_rc_qp_create_one(udcm_module_t *m, mca_btl_base_endpoint_t* lcl_ &init_attr))) { /* NTH: this process may be out of registered memory. try evicting an item from the lru of this btl's mpool */ - if (false == mca_mpool_grdma_evict (m->btl->super.btl_mpool)) { + if (false == m->btl->device->rcache->rcache_evict (m->btl->device->rcache)) { break; } } diff --git a/opal/mca/btl/sm/btl_sm.c b/opal/mca/btl/sm/btl_sm.c index d1758c9ddd..d1fcdf237e 100644 --- a/opal/mca/btl/sm/btl_sm.c +++ b/opal/mca/btl/sm/btl_sm.c @@ -56,8 +56,6 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/btl/btl.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/sm/mpool_sm.h" #include "opal/align.h" #include "opal/util/sys_limits.h" @@ -111,7 +109,7 @@ static void *mpool_calloc(size_t nmemb, size_t size) size_t bsize = nmemb * size; mca_mpool_base_module_t *mpool = mca_btl_sm_component.sm_mpool; - buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0, NULL); + buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0); if (NULL == buf) return NULL; @@ -122,7 +120,7 @@ static void *mpool_calloc(size_t nmemb, size_t size) static int setup_mpool_base_resources(mca_btl_sm_component_t *comp_ptr, - mca_mpool_base_resources_t *out_res) + mca_common_sm_mpool_resources_t *out_res) { int rc = OPAL_SUCCESS; int fd = -1; @@ -222,7 +220,7 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, size_t length, length_payload; sm_fifo_t *my_fifos; int my_mem_node, num_mem_nodes, i, rc; - mca_mpool_base_resources_t *res = NULL; + mca_common_sm_mpool_resources_t *res = NULL; 
mca_btl_sm_component_t* m = &mca_btl_sm_component; /* Assume we don't have hwloc support and fill in dummy info */ @@ -291,15 +289,14 @@ sm_btl_first_time_init(mca_btl_sm_t *sm_btl, /* Disable memory binding, because each MPI process will claim pages in the * mpool for their local NUMA node */ res->mem_node = -1; + res->allocator = mca_btl_sm_component.allocator; if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) { free(res); return rc; } /* now that res is fully populated, create the thing */ - mca_btl_sm_component.sm_mpools[0] = - mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name, - sm_btl, res); + mca_btl_sm_component.sm_mpools[0] = common_sm_mpool_create (res); /* Sanity check to ensure that we found it */ if (NULL == mca_btl_sm_component.sm_mpools[0]) { free(res); @@ -470,7 +467,7 @@ int mca_btl_sm_add_procs( bool have_connected_peer = false; char **bases; /* for easy access to the mpool_sm_module */ - mca_mpool_sm_module_t *sm_mpool_modp = NULL; + mca_common_sm_mpool_module_t *sm_mpool_modp = NULL; /* initializion */ @@ -548,7 +545,7 @@ int mca_btl_sm_add_procs( } bases = mca_btl_sm_component.shm_bases; - sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_sm_component.sm_mpool; + sm_mpool_modp = (mca_common_sm_mpool_module_t *)mca_btl_sm_component.sm_mpool; /* initialize own FIFOs */ /* diff --git a/opal/mca/btl/sm/btl_sm.h b/opal/mca/btl/sm/btl_sm.h index d3d2606b9a..9721bede3f 100644 --- a/opal/mca/btl/sm/btl_sm.h +++ b/opal/mca/btl/sm/btl_sm.h @@ -212,6 +212,12 @@ struct mca_btl_sm_component_t { char *sm_mpool_rndv_file_name; char *sm_ctl_file_name; char *sm_rndv_file_name; + + /** minimum size of a btl/sm mpool */ + unsigned long mpool_min_size; + + /** allocator name to use with the mpool */ + char *allocator; }; typedef struct mca_btl_sm_component_t mca_btl_sm_component_t; OPAL_MODULE_DECLSPEC extern mca_btl_sm_component_t mca_btl_sm_component; @@ -281,7 +287,7 @@ static inline int sm_fifo_init(int fifo_size, 
mca_mpool_base_module_t *mpool, /* allocate the queue in the receiver's address space */ fifo->queue_recv = (volatile void **)mpool->mpool_alloc( - mpool, sizeof(void *) * qsize, opal_cache_line_size, 0, NULL); + mpool, sizeof(void *) * qsize, opal_cache_line_size, 0); if(NULL == fifo->queue_recv) { return OPAL_ERR_OUT_OF_RESOURCE; } diff --git a/opal/mca/btl/sm/btl_sm_component.c b/opal/mca/btl/sm/btl_sm_component.c index 2414f8ca52..858acf524c 100644 --- a/opal/mca/btl/sm/btl_sm_component.c +++ b/opal/mca/btl/sm/btl_sm_component.c @@ -221,6 +221,19 @@ static int sm_register(void) 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_sm_component.knem_max_simultaneous); + mca_btl_sm_component.allocator = "bucket"; + (void) mca_base_component_var_register (&mca_btl_sm_component.super.btl_version, "allocator", + "Name of allocator component to use for btl/sm allocations", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_sm_component.allocator); + + mca_btl_sm_component.mpool_min_size = 134217728; + (void) mca_base_component_var_register(&mca_btl_sm_component.super.btl_version, "min_size", + "Minimum size of the common/sm mpool shared memory file", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_sm_component.mpool_min_size); + /* CMA parameters */ mca_btl_sm_component.use_cma = 0; (void) mca_base_component_var_register(&mca_btl_sm_component.super.btl_version, @@ -234,9 +247,6 @@ static int sm_register(void) mca_btl_sm_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_sm_component.sm_free_list_max); mca_btl_sm_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_sm_component.sm_free_list_inc); mca_btl_sm_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_sm_component.sm_max_procs); - /* there is no practical use for the mpool name parameter since mpool resources differ - between components */ - 
mca_btl_sm_component.sm_mpool_name = "sm"; mca_btl_sm_param_register_uint("fifo_size", 4096, OPAL_INFO_LVL_4, &mca_btl_sm_component.fifo_size); mca_btl_sm_param_register_int("num_fifos", 1, OPAL_INFO_LVL_4, &mca_btl_sm_component.nfifos); @@ -456,41 +466,6 @@ create_and_attach(mca_btl_sm_component_t *comp_ptr, return OPAL_SUCCESS; } -/* - * SKG - I'm not happy with this, but I can't figure out a better way of - * finding the sm mpool's minimum size 8-|. The way I see it. This BTL only - * uses the sm mpool, so maybe this isn't so bad... - * - * The problem is the we need to size the mpool resources at sm BTL component - * init. That means we need to know the mpool's minimum size at create. - */ -static int -get_min_mpool_size(mca_btl_sm_component_t *comp_ptr, - size_t *out_size) -{ - const char *type_name = "mpool"; - const char *param_name = "min_size"; - const mca_base_var_storage_t *min_size; - int id = 0; - - if (0 > (id = mca_base_var_find("ompi", type_name, comp_ptr->sm_mpool_name, - param_name))) { - opal_output(0, "mca_base_var_find: failure looking for %s_%s_%s\n", - type_name, comp_ptr->sm_mpool_name, param_name); - return OPAL_ERR_NOT_FOUND; - } - - if (OPAL_SUCCESS != mca_base_var_get_value(id, &min_size, NULL, NULL)) { - opal_output(0, "mca_base_var_get_value failure\n"); - return OPAL_ERROR; - } - - /* the min_size variable is an unsigned long long */ - *out_size = (size_t) min_size->ullval; - - return OPAL_SUCCESS; -} - static int get_mpool_res_size(int32_t max_procs, size_t *out_res_size) @@ -612,20 +587,16 @@ create_rndv_file(mca_btl_sm_component_t *comp_ptr, mca_common_sm_module_t *tmp_modp = NULL; if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) { - size_t min_size = 0; /* get the segment size for the sm mpool. */ if (OPAL_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs, &size))) { /* rc is already set */ goto out; } - /* do we need to update the size based on the sm mpool's min size? 
*/ - if (OPAL_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) { - goto out; - } + /* update size if less than required minimum */ - if (size < min_size) { - size = min_size; + if (size < mca_btl_sm_component.mpool_min_size) { + size = mca_btl_sm_component.mpool_min_size; } /* we only need the shmem_ds info at this point. initilization will be * completed in the mpool module code. the idea is that we just need this diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index bf470f4fb7..6208ea5399 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -12,8 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2010-2016 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2012 Oracle and/or its affiliates. All rights reserved. 
* Copyright (c) 2014 Research Organization for Information Science @@ -53,11 +53,13 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/common/sm/common_sm_mpool.h" + #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/sm/mpool_sm.h" +#include "opal/mca/rcache/base/base.h" #if OPAL_ENABLE_FT_CR == 1 #include "opal/mca/crs/base/base.h" @@ -122,7 +124,7 @@ static void *mpool_calloc(size_t nmemb, size_t size) size_t bsize = nmemb * size; mca_mpool_base_module_t *mpool = mca_btl_smcuda_component.sm_mpool; - buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0, NULL); + buf = mpool->mpool_alloc(mpool, bsize, opal_cache_line_size, 0); if (NULL == buf) return NULL; @@ -133,7 +135,7 @@ static void *mpool_calloc(size_t nmemb, size_t size) static int setup_mpool_base_resources(mca_btl_smcuda_component_t *comp_ptr, - mca_mpool_base_resources_t *out_res) + mca_common_sm_mpool_resources_t *out_res) { int rc = OPAL_SUCCESS; int fd = -1; @@ -228,7 +230,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, size_t length, length_payload; sm_fifo_t *my_fifos; int my_mem_node, num_mem_nodes, i, rc; - mca_mpool_base_resources_t *res = NULL; + mca_common_sm_mpool_resources_t *res = NULL; mca_btl_smcuda_component_t* m = &mca_btl_smcuda_component; /* Assume we don't have hwloc support and fill in dummy info */ @@ -297,15 +299,14 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* Disable memory binding, because each MPI process will claim pages in the * mpool for their local NUMA node */ res->mem_node = -1; + res->allocator = mca_btl_smcuda_component.allocator; if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) { free(res); return rc; } /* now that res is fully populated, create the thing */ - mca_btl_smcuda_component.sm_mpools[0] = - 
mca_mpool_base_module_create(mca_btl_smcuda_component.sm_mpool_name, - smcuda_btl, res); + mca_btl_smcuda_component.sm_mpools[0] = common_sm_mpool_create (res); /* Sanity check to ensure that we found it */ if (NULL == mca_btl_smcuda_component.sm_mpools[0]) { free(res); @@ -345,10 +346,9 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* Create a local memory pool that sends handles to the remote * side. Note that the res argument is not really used, but * needed to satisfy function signature. */ - smcuda_btl->super.btl_mpool = mca_mpool_base_module_create("gpusm", - smcuda_btl, - res); - if (NULL == smcuda_btl->super.btl_mpool) { + mca_rcache_base_resources_t rcache_res; + smcuda_btl->rcache = mca_rcache_base_module_create("gpusm", smcuda_btl, &rcache_res); + if (NULL == smcuda_btl->rcache) { return OPAL_ERR_OUT_OF_RESOURCE; } #endif /* OPAL_CUDA_SUPPORT */ @@ -479,16 +479,9 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) } #endif #if OPAL_CUDA_SUPPORT - { - mca_mpool_base_resources_t resources; /* unused, but needed */ - - /* Create a remote memory pool on the endpoint. Note that the resources - * argument is just to satisfy the function signature. The rcuda mpool - * actually takes care of filling in the resources. */ - ep->mpool = mca_mpool_base_module_create("rgpusm", - NULL, - &resources); - } + /* Create a remote memory pool on the endpoint. The rgpusm component + * does not take any resources. They are filled in internally. 
*/ + ep->rcache = mca_rcache_base_module_create ("rgpusm", NULL, NULL); #endif /* OPAL_CUDA_SUPPORT */ return ep; } @@ -507,7 +500,7 @@ int mca_btl_smcuda_add_procs( bool have_connected_peer = false; char **bases; /* for easy access to the mpool_sm_module */ - mca_mpool_sm_module_t *sm_mpool_modp = NULL; + mca_common_sm_mpool_module_t *sm_mpool_modp = NULL; /* initializion */ @@ -584,7 +577,7 @@ int mca_btl_smcuda_add_procs( } bases = mca_btl_smcuda_component.shm_bases; - sm_mpool_modp = (mca_mpool_sm_module_t *)mca_btl_smcuda_component.sm_mpool; + sm_mpool_modp = (mca_common_sm_mpool_module_t *)mca_btl_smcuda_component.sm_mpool; /* initialize own FIFOs */ /* @@ -693,6 +686,13 @@ int mca_btl_smcuda_del_procs( struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { + for (size_t i = 0 ; i < nprocs ; ++i) { + if (peers[i]->rcache) { + mca_rcache_base_module_destroy (peers[i]->rcache); + peers[i]->rcache = NULL; + } + } + return OPAL_SUCCESS; } @@ -1009,16 +1009,17 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, size_t size, uint32_t flags) { - mca_mpool_common_cuda_reg_t *reg; + mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; + mca_rcache_common_cuda_reg_t *reg; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; - int mpool_flags = 0; + int rcache_flags = 0; if (MCA_BTL_REG_FLAG_CUDA_GPU_MEM & flags) { - mpool_flags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM; + rcache_flags |= MCA_RCACHE_FLAGS_CUDA_GPU_MEM; } - btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, mpool_flags, - access_flags, (mca_mpool_base_registration_t **) &reg); + smcuda_module->rcache->rcache_register (smcuda_module->rcache, base, size, rcache_flags, + access_flags, (mca_rcache_base_registration_t **) &reg); if (OPAL_UNLIKELY(NULL == reg)) { return NULL; } @@ -1029,10 +1030,11 @@ static struct mca_btl_base_registration_handle_t
*mca_btl_smcuda_register_mem ( static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle) { - mca_mpool_common_cuda_reg_t *reg = (mca_mpool_common_cuda_reg_t *) - ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + mca_btl_smcuda_t *smcuda_module = (mca_btl_smcuda_t *) btl; + mca_rcache_common_cuda_reg_t *reg = (mca_rcache_common_cuda_reg_t *) + ((intptr_t) handle - offsetof (mca_rcache_common_cuda_reg_t, data)); - btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base); + smcuda_module->rcache->rcache_deregister (smcuda_module->rcache, &reg->base); return OPAL_SUCCESS; } @@ -1043,8 +1045,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_mpool_common_cuda_reg_t rget_reg; - mca_mpool_common_cuda_reg_t *reg_ptr = &rget_reg; + mca_rcache_common_cuda_reg_t rget_reg; + mca_rcache_common_cuda_reg_t *reg_ptr = &rget_reg; int rc, done; void *remote_memory_address; size_t offset; @@ -1087,16 +1089,16 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * remote memory which may lie somewhere in the middle. This is taken care of * a few lines down. Note that we hand in the peer rank just for debugging * support.
*/ - rc = ep->mpool->mpool_register(ep->mpool, remote_handle->reg_data.memh_seg_addr.pval, - remote_handle->reg_data.memh_seg_len, ep->peer_smp_rank, - MCA_MPOOL_ACCESS_LOCAL_WRITE, - (mca_mpool_base_registration_t **)&reg_ptr); + rc = ep->rcache->rcache_register (ep->rcache, remote_handle->reg_data.memh_seg_addr.pval, + remote_handle->reg_data.memh_seg_len, ep->peer_smp_rank, + MCA_RCACHE_ACCESS_LOCAL_WRITE, + (mca_rcache_base_registration_t **)&reg_ptr); if (OPAL_SUCCESS != rc) { opal_output(0, "Failed to register remote memory, rc=%d", rc); return rc; } - frag->registration = (mca_mpool_base_registration_t *)reg_ptr; + frag->registration = (mca_rcache_base_registration_t *)reg_ptr; frag->endpoint = ep; /* The registration has given us back the memory block that this diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7c9d30fade..807d908116 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -206,6 +206,8 @@ struct mca_btl_smcuda_component_t { int use_cuda_ipc; int use_cuda_ipc_same_gpu; #endif /* OPAL_CUDA_SUPPORT */ + unsigned long mpool_min_size; + char *allocator; }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_component_t mca_btl_smcuda_component; @@ -217,7 +219,7 @@ struct mca_btl_smcuda_t { mca_btl_base_module_t super; /**< base BTL interface */ bool btl_inited; /**< flag indicating if btl has been inited */ mca_btl_base_module_error_cb_fn_t error_cb; - + mca_rcache_base_module_t *rcache; }; typedef struct mca_btl_smcuda_t mca_btl_smcuda_t; OPAL_MODULE_DECLSPEC extern mca_btl_smcuda_t mca_btl_smcuda; @@ -254,7 +256,7 @@ static inline int sm_fifo_init(int fifo_size, mca_mpool_base_module_t *mpool, /* allocate the queue in the receiver's address space */ fifo->queue_recv = (volatile void **)mpool->mpool_alloc( - mpool, sizeof(void *) * qsize, opal_cache_line_size, 0, NULL); + mpool, sizeof(void *) * qsize, opal_cache_line_size,
0); if(NULL == fifo->queue_recv) { return OPAL_ERR_OUT_OF_RESOURCE; } diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index dcbf0ec518..8aedf9f1d7 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -12,8 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. - * All rights reserved. + * Copyright (c) 2010-2016 Los Alamos National Security, LLC. All rights + * reserved. * Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved. * $COPYRIGHT$ @@ -141,6 +141,13 @@ static int mca_btl_smcuda_component_verify(void) { static int smcuda_register(void) { /* register SM component parameters */ + mca_btl_smcuda_component.mpool_min_size = 134217728; + (void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "min_size", + "Minimum size of the common/sm mpool shared memory file", + MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &mca_btl_smcuda_component.mpool_min_size); + mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max); mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc); @@ -156,6 +163,12 @@ static int smcuda_register(void) /* default number of extra procs to allow for future growth */ mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9, &mca_btl_smcuda_component.sm_extra_procs); + mca_btl_smcuda_component.allocator = "bucket"; + (void) mca_base_component_var_register (&mca_btl_smcuda_component.super.btl_version, 
"allocator", + "Name of allocator component to use for btl/smcuda allocations", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_smcuda_component.allocator); + #if OPAL_CUDA_SUPPORT /* Lower priority when CUDA support is not requested */ if (opal_cuda_support) { @@ -366,41 +379,6 @@ create_and_attach(mca_btl_smcuda_component_t *comp_ptr, return OPAL_SUCCESS; } -/* - * SKG - I'm not happy with this, but I can't figure out a better way of - * finding the sm mpool's minimum size 8-|. The way I see it. This BTL only - * uses the sm mpool, so maybe this isn't so bad... - * - * The problem is the we need to size the mpool resources at sm BTL component - * init. That means we need to know the mpool's minimum size at create. - */ -static int -get_min_mpool_size(mca_btl_smcuda_component_t *comp_ptr, - size_t *out_size) -{ - const char *type_name = "mpool"; - const char *param_name = "min_size"; - const mca_base_var_storage_t *min_size; - int id = 0; - - if (0 > (id = mca_base_var_find("ompi", type_name, comp_ptr->sm_mpool_name, - param_name))) { - opal_output(0, "mca_base_var_find: failure looking for %s_%s_%s\n", - type_name, comp_ptr->sm_mpool_name, param_name); - return OPAL_ERR_NOT_FOUND; - } - - if (OPAL_SUCCESS != mca_base_var_get_value(id, &min_size, NULL, NULL)) { - opal_output(0, "mca_base_var_get_value failure\n"); - return OPAL_ERROR; - } - - /* the min_size variable is an unsigned long long */ - *out_size = (size_t) min_size->ullval; - - return OPAL_SUCCESS; -} - static int get_mpool_res_size(int32_t max_procs, size_t *out_res_size) @@ -521,21 +499,18 @@ create_rndv_file(mca_btl_smcuda_component_t *comp_ptr, mca_common_sm_module_t *tmp_modp = NULL; if (MCA_BTL_SM_RNDV_MOD_MPOOL == type) { - size_t min_size = 0; /* get the segment size for the sm mpool. 
*/ if (OPAL_SUCCESS != (rc = get_mpool_res_size(comp_ptr->sm_max_procs, &size))) { /* rc is already set */ goto out; } - /* do we need to update the size based on the sm mpool's min size? */ - if (OPAL_SUCCESS != (rc = get_min_mpool_size(comp_ptr, &min_size))) { - goto out; - } + /* update size if less than required minimum */ - if (size < min_size) { - size = min_size; + if (size < mca_btl_smcuda_component.mpool_min_size) { + size = mca_btl_smcuda_component.mpool_min_size; } + /* we only need the shmem_ds info at this point. initilization will be * completed in the mpool module code. the idea is that we just need this * info so we can populate the rndv file (or modex when we have it). */ @@ -1161,8 +1136,8 @@ int mca_btl_smcuda_component_progress(void) OPAL_SUCCESS); if(frag->registration != NULL) { - frag->endpoint->mpool->mpool_deregister(frag->endpoint->mpool, - (mca_mpool_base_registration_t*)frag->registration); + frag->endpoint->rcache->rcache_deregister (frag->endpoint->rcache, + (mca_rcache_base_registration_t*)frag->registration); frag->registration = NULL; MCA_BTL_SMCUDA_FRAG_RETURN(frag); } diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index cead5ec7a5..1dfb359e17 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,6 +12,8 @@ * All rights reserved. * Copyright (c) 2006-2007 Voltaire. All rights reserved. * Copyright (c) 2012 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -35,7 +38,7 @@ struct mca_btl_base_endpoint_t { int peer_smp_rank; /**< My peer's SMP process rank. Used for accessing * SMP specfic data structures. 
*/ #if OPAL_CUDA_SUPPORT - mca_mpool_base_module_t *mpool; /**< mpool for remotely registered memory */ + mca_rcache_base_module_t *rcache; /**< rcache for remotely registered memory */ #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_PROGRESS_THREADS == 1 int fifo_fd; /**< pipe/fifo used to signal endpoint that data is queued */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_frag.h b/opal/mca/btl/smcuda/btl_smcuda_frag.h index 55996d6eef..78cc9c3901 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_frag.h +++ b/opal/mca/btl/smcuda/btl_smcuda_frag.h @@ -54,7 +54,7 @@ typedef struct mca_btl_smcuda_hdr_t mca_btl_smcuda_hdr_t; #if OPAL_CUDA_SUPPORT struct mca_btl_base_registration_handle_t { - mca_mpool_common_cuda_reg_data_t reg_data; + mca_rcache_common_cuda_reg_data_t reg_data; }; #endif @@ -78,7 +78,7 @@ struct mca_btl_smcuda_frag_t { mca_btl_base_segment_t segment; struct mca_btl_base_endpoint_t *endpoint; #if OPAL_CUDA_SUPPORT - struct mca_mpool_base_registration_t *registration; + struct mca_rcache_base_registration_t *registration; struct mca_btl_base_registration_handle_t *local_handle; #endif /* OPAL_CUDA_SUPPORT */ size_t size; diff --git a/opal/mca/btl/ugni/btl_ugni.h b/opal/mca/btl/ugni/btl_ugni.h index e6d9634f58..bc9c6b67b5 100644 --- a/opal/mca/btl/ugni/btl_ugni.h +++ b/opal/mca/btl/ugni/btl_ugni.h @@ -25,7 +25,8 @@ #include "opal/mca/mpool/mpool.h" #include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/udreg/mpool_udreg.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/rcache/udreg/rcache_udreg.h" #include "opal/util/output.h" #include "opal_stdint.h" #include "opal/mca/btl/btl.h" @@ -56,8 +57,8 @@ typedef struct mca_btl_ugni_endpoint_attr_t { } mca_btl_ugni_endpoint_attr_t; enum { - MCA_BTL_UGNI_MPOOL_UDREG, - MCA_BTL_UGNI_MPOOL_GRDMA + MCA_BTL_UGNI_RCACHE_UDREG, + MCA_BTL_UGNI_RCACHE_GRDMA }; typedef struct mca_btl_ugni_module_t { @@ -86,7 +87,7 @@ typedef struct mca_btl_ugni_module_t { opal_free_list_t post_descriptors; - 
mca_mpool_base_module_t *smsg_mpool; + mca_mpool_base_module_t *mpool; opal_free_list_t smsg_mboxes; gni_ep_handle_t wildcard_ep; @@ -128,6 +129,8 @@ typedef struct mca_btl_ugni_module_t { int nlocal_procs; volatile int active_send_count; + + mca_rcache_base_module_t *rcache; } mca_btl_ugni_module_t; typedef struct mca_btl_ugni_component_t { @@ -177,8 +180,11 @@ typedef struct mca_btl_ugni_component_t { /* Page size to use for SMSG allocations (udreg mpool) */ unsigned int smsg_page_size; - /* mpool type (grdma or udreg) */ - int mpool_type; + /* rcache type (grdma or udreg) */ + int rcache_type; + + /* memory pool hints */ + char *mpool_hints; /* Number of mailboxes to allocate in each block */ unsigned int mbox_increment; @@ -312,7 +318,7 @@ struct mca_btl_base_registration_handle_t { }; typedef struct mca_btl_ugni_reg_t { - mca_mpool_base_registration_t base; + mca_rcache_base_registration_t base; mca_btl_base_registration_handle_t handle; } mca_btl_ugni_reg_t; diff --git a/opal/mca/btl/ugni/btl_ugni_add_procs.c b/opal/mca/btl/ugni/btl_ugni_add_procs.c index 2b547659c0..dff366af99 100644 --- a/opal/mca/btl/ugni/btl_ugni_add_procs.c +++ b/opal/mca/btl/ugni/btl_ugni_add_procs.c @@ -266,11 +266,12 @@ struct mca_btl_base_endpoint_t *mca_btl_ugni_get_ep (struct mca_btl_base_module_ } -static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg) +static int ugni_reg_mem (void *reg_data, void *base, size_t size, + mca_rcache_base_registration_t *reg) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; + gni_cq_handle_t cq = NULL; gni_return_t rc; int flags; @@ -278,18 +279,24 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, return OPAL_ERR_OUT_OF_RESOURCE; } - if (reg->access_flags & (MCA_MPOOL_ACCESS_REMOTE_WRITE | MCA_MPOOL_ACCESS_LOCAL_WRITE | - MCA_MPOOL_ACCESS_REMOTE_ATOMIC)) { + if (reg->access_flags & 
(MCA_RCACHE_ACCESS_REMOTE_WRITE | MCA_RCACHE_ACCESS_LOCAL_WRITE | + MCA_RCACHE_ACCESS_REMOTE_ATOMIC)) { flags = GNI_MEM_READWRITE; } else { flags = GNI_MEM_READ_ONLY; } - flags |= GNI_MEM_RELAXED_PI_ORDERING; + if (!(reg->flags & MCA_RCACHE_FLAGS_SO_MEM)) { + flags |= GNI_MEM_RELAXED_PI_ORDERING; + } + + if (reg->flags & MCA_RCACHE_FLAGS_RESV0) { + cq = ugni_module->smsg_remote_cq; + } OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, - size, NULL, flags, -1, &(ugni_reg->handle.gni_handle)); + size, cq, flags, -1, &(ugni_reg->handle.gni_handle)); OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { @@ -301,24 +308,8 @@ static int ugni_reg_rdma_mem (void *reg_data, void *base, size_t size, return OPAL_SUCCESS; } - -static int ugni_reg_smsg_mem (void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg) -{ - mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; - mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *) reg; - gni_return_t rc; - - OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); - rc = GNI_MemRegister (ugni_module->device->dev_handle, (uint64_t) base, - size, ugni_module->smsg_remote_cq, GNI_MEM_READWRITE, -1, - &(ugni_reg->handle.gni_handle)); - OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock); - return opal_common_rc_ugni_to_opal (rc); -} - static int -ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg) +ugni_dereg_mem (void *reg_data, mca_rcache_base_registration_t *reg) { mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) reg_data; mca_btl_ugni_reg_t *ugni_reg = (mca_btl_ugni_reg_t *)reg; @@ -339,10 +330,10 @@ ugni_dereg_mem (void *reg_data, mca_mpool_base_registration_t *reg) static int mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) { - struct mca_mpool_base_resources_t mpool_resources; + mca_rcache_udreg_resources_t rcache_resources; unsigned int 
mbox_increment; uint32_t nprocs, *u32; - const char *mpool_name; + char *rcache_name; int rc; rc = opal_pointer_array_init (&ugni_module->pending_smsg_frags_bb, 0, @@ -404,43 +395,35 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) return rc; } - mpool_resources.pool_name = "ompi.ugni"; - mpool_resources.reg_data = (void *) ugni_module; - mpool_resources.sizeof_reg = sizeof (mca_btl_ugni_reg_t); - mpool_resources.register_mem = ugni_reg_rdma_mem; - mpool_resources.deregister_mem = ugni_dereg_mem; - - if (MCA_BTL_UGNI_MPOOL_UDREG == mca_btl_ugni_component.mpool_type) { - /* additional settings for the udreg mpool */ - /* 4k should be large enough for any Gemini/Ares system */ - mpool_resources.max_entries = 4096; - mpool_resources.use_kernel_cache = true; - - /* request a specific page size. this request may not be honored if the - * page size does not exist. */ - mpool_resources.page_size = mca_btl_ugni_component.smsg_page_size; - - mpool_resources.use_evict_w_unreg = false; - mpool_name = "udreg"; - } else { - mpool_name = "grdma"; - } - - ugni_module->super.btl_mpool = - mca_mpool_base_module_create(mpool_name, ugni_module->device, &mpool_resources); - - mpool_resources.register_mem = ugni_reg_smsg_mem; - - ugni_module->smsg_mpool = - mca_mpool_base_module_create(mpool_name, ugni_module->device, &mpool_resources); - + ugni_module->super.btl_mpool = mca_mpool_base_module_lookup (mca_btl_ugni_component.mpool_hints); if (NULL == ugni_module->super.btl_mpool) { - BTL_ERROR(("error creating rdma mpool")); + BTL_ERROR(("could not find mpool matching hints %s", mca_btl_ugni_component.mpool_hints)); return OPAL_ERROR; } - if (NULL == ugni_module->smsg_mpool) { - BTL_ERROR(("error creating smsg mpool")); + rcache_resources.base.cache_name = "ompi.ugni"; + rcache_resources.base.reg_data = (void *) ugni_module; + rcache_resources.base.sizeof_reg = sizeof (mca_btl_ugni_reg_t); + rcache_resources.base.register_mem = ugni_reg_mem; + 
rcache_resources.base.deregister_mem = ugni_dereg_mem; + + if (MCA_BTL_UGNI_RCACHE_UDREG == mca_btl_ugni_component.rcache_type) { + /* additional settings for the udreg mpool */ + /* 4k should be large enough for any Gemini/Ares system */ + rcache_resources.max_entries = 4096; + rcache_resources.use_kernel_cache = true; + + rcache_resources.use_evict_w_unreg = false; + rcache_name = "udreg"; + } else { + rcache_name = "grdma"; + } + + ugni_module->rcache = + mca_rcache_base_module_create (rcache_name, ugni_module->device, &rcache_resources.base); + + if (NULL == ugni_module->rcache) { + BTL_ERROR(("error creating registration cache")); return OPAL_ERROR; } @@ -451,7 +434,7 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_eager_num, mca_btl_ugni_component.ugni_eager_max, mca_btl_ugni_component.ugni_eager_inc, - ugni_module->super.btl_mpool, 0, NULL, + ugni_module->super.btl_mpool, 0, ugni_module->rcache, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, (void *) ugni_module); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -466,7 +449,7 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mca_btl_ugni_component.ugni_eager_num, mca_btl_ugni_component.ugni_eager_max, mca_btl_ugni_component.ugni_eager_inc, - ugni_module->super.btl_mpool, 0, NULL, + ugni_module->super.btl_mpool, 0, ugni_module->rcache, (opal_free_list_item_init_fn_t) mca_btl_ugni_frag_init, (void *) ugni_module); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { @@ -487,12 +470,14 @@ mca_btl_ugni_setup_mpools (mca_btl_ugni_module_t *ugni_module) mbox_increment = mca_btl_ugni_component.mbox_increment; } + /* use the MCA_RCACHE_FLAGS_RESV0 to signal this is smsg memory */ rc = opal_free_list_init (&ugni_module->smsg_mboxes, sizeof (mca_btl_ugni_smsg_mbox_t), 8, OBJ_CLASS(mca_btl_ugni_smsg_mbox_t), mca_btl_ugni_component.smsg_mbox_size, 128, - 32, -1, mbox_increment, ugni_module->smsg_mpool, - 0, NULL, NULL, NULL); + 32, -1, mbox_increment, 
ugni_module->super.btl_mpool, + MCA_RCACHE_FLAGS_SO_MEM | MCA_RCACHE_FLAGS_RESV0, + ugni_module->rcache, NULL, NULL); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { BTL_ERROR(("error creating smsg mailbox free list")); return rc; diff --git a/opal/mca/btl/ugni/btl_ugni_component.c b/opal/mca/btl/ugni/btl_ugni_component.c index 323cf367c8..acf0b073ec 100644 --- a/opal/mca/btl/ugni/btl_ugni_component.c +++ b/opal/mca/btl/ugni/btl_ugni_component.c @@ -15,6 +15,11 @@ #include "btl_ugni_rdma.h" #include "btl_ugni_smsg.h" +#include "opal/util/sys_limits.h" + +#include +#include + #include "opal/memoryhooks/memory.h" #include "opal/runtime/opal_params.h" @@ -25,6 +30,7 @@ static int btl_ugni_component_open(void); static int btl_ugni_component_close(void); static mca_btl_base_module_t **mca_btl_ugni_component_init(int *, bool, bool); static int mca_btl_ugni_component_progress(void); +static unsigned long mca_btl_ugni_ugni_page_size = 0; mca_btl_ugni_component_t mca_btl_ugni_component = { .super = { @@ -44,9 +50,9 @@ mca_btl_ugni_component_t mca_btl_ugni_component = { } }; -mca_base_var_enum_value_t mpool_values[] = { - {MCA_BTL_UGNI_MPOOL_UDREG, "udreg"}, - {MCA_BTL_UGNI_MPOOL_GRDMA, "grdma"}, +mca_base_var_enum_value_t rcache_values[] = { + {MCA_BTL_UGNI_RCACHE_UDREG, "udreg"}, + {MCA_BTL_UGNI_RCACHE_GRDMA, "grdma"}, {-1, NULL} /* sentinal */ }; @@ -55,6 +61,7 @@ btl_ugni_component_register(void) { mca_base_var_enum_t *new_enum; gni_nic_device_t device_type; + char *mpool_hints_tmp = NULL; int rc; (void) mca_base_var_group_component_register(&mca_btl_ugni_component.super.btl_version, @@ -174,10 +181,29 @@ btl_ugni_component_register(void) MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mbox_increment); + /* determine if there are get alignment restrictions */ + GNI_GetDeviceType (&device_type); + + mca_btl_ugni_component.smsg_page_size = 2 << 20; + if (GNI_DEVICE_GEMINI == device_type) { + if (access ("/sys/class/gemini/ghal0/mrt", 
R_OK)) { + int fd = open ("/sys/class/gemini/ghal0/mrt", O_RDONLY); + char buffer[10]; + + if (0 <= fd) { + memset (buffer, 0, sizeof (buffer)); + read (fd, buffer, sizeof (buffer) - 1); + close (fd); + mca_btl_ugni_ugni_page_size = strtol (buffer, NULL, 10) * 1024; + mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size; + } + } + } + (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, "smsg_page_size", "Page size to use for SMSG " - "mailbox allocation (default 2M)", + "mailbox allocation (default: detect)", MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, @@ -202,20 +228,38 @@ btl_ugni_component_register(void) MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, NULL, NULL, NULL, &mca_btl_ugni_progress_thread_wakeups); - /* btl/ugni can only support only a fixed set of mpools (these mpools have compatible resource + /* btl/ugni can only support only a fixed set of rcache components (these rcache components have compatible resource * structures) */ - rc = mca_base_var_enum_create ("btl_ugni_mpool", mpool_values, &new_enum); + rc = mca_base_var_enum_create ("btl_ugni_rcache", rcache_values, &new_enum); if (OPAL_SUCCESS != rc) { return rc; } - mca_btl_ugni_component.mpool_type = MCA_BTL_UGNI_MPOOL_UDREG; + mca_btl_ugni_component.rcache_type = MCA_BTL_UGNI_RCACHE_UDREG; (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, - "mpool", "mpool to use", MCA_BASE_VAR_TYPE_INT, new_enum, + "rcache", "registration cache to use", MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_type); + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.rcache_type); OBJ_RELEASE(new_enum); + if (mca_btl_ugni_ugni_page_size) { + rc = asprintf (&mpool_hints_tmp, "page_size=%lu", mca_btl_ugni_ugni_page_size); + if (rc < 0) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + 
mca_btl_ugni_component.mpool_hints = mpool_hints_tmp; + } else { + mca_btl_ugni_component.mpool_hints = "page_size=2M"; + } + + (void) mca_base_component_var_register(&mca_btl_ugni_component.super.btl_version, + "mpool_hints", "hints to use when selecting a memory pool (default: " + "\"page_size=2M\")", MCA_BASE_VAR_TYPE_STRING, NULL, 0, + MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, + MCA_BASE_VAR_SCOPE_LOCAL, &mca_btl_ugni_component.mpool_hints); + free (mpool_hints_tmp); + /* ensure we loose send exclusivity to sm and vader if they are enabled */ mca_btl_ugni_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 2; @@ -228,9 +272,6 @@ btl_ugni_component_register(void) mca_btl_ugni_module.super.btl_get_limit = 1 * 1024 * 1024; - /* determine if there are get alignment restrictions */ - GNI_GetDeviceType (&device_type); - /* * see def. of ALIGNMENT_MASK to figure this one out */ @@ -291,7 +332,7 @@ btl_ugni_component_close(void) } static void mca_btl_ugni_autoset_leave_pinned (void) { - if (MCA_BTL_UGNI_MPOOL_UDREG != mca_btl_ugni_component.mpool_type) { + if (MCA_BTL_UGNI_RCACHE_UDREG != mca_btl_ugni_component.rcache_type) { int value = opal_mem_hooks_support_level(); if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & value)) { @@ -362,6 +403,12 @@ mca_btl_ugni_component_init (int *num_btl_modules, return NULL; } + if (mca_btl_ugni_component.smsg_page_size != (unsigned long) opal_getpagesize ()) { + if (mca_btl_ugni_ugni_page_size > mca_btl_ugni_component.smsg_page_size) { + mca_btl_ugni_component.smsg_page_size = mca_btl_ugni_ugni_page_size; + } + } + mca_btl_ugni_autoset_leave_pinned (); mca_btl_ugni_module.super.btl_rdma_pipeline_send_length = mca_btl_ugni_module.super.btl_eager_limit; diff --git a/opal/mca/btl/ugni/btl_ugni_frag.h b/opal/mca/btl/ugni/btl_ugni_frag.h index 2b04564c61..8257ee2d70 100644 --- a/opal/mca/btl/ugni/btl_ugni_frag.h +++ b/opal/mca/btl/ugni/btl_ugni_frag.h @@ 
-159,8 +159,8 @@ static inline int mca_btl_ugni_frag_alloc (mca_btl_base_endpoint_t *ep, static inline int mca_btl_ugni_frag_return (mca_btl_ugni_base_frag_t *frag) { if (frag->registration) { - frag->endpoint->btl->super.btl_mpool->mpool_deregister(frag->endpoint->btl->super.btl_mpool, - (mca_mpool_base_registration_t *) frag->registration); + frag->endpoint->btl->rcache->rcache_deregister (frag->endpoint->btl->rcache, + (mca_rcache_base_registration_t *) frag->registration); frag->registration = NULL; } diff --git a/opal/mca/btl/ugni/btl_ugni_module.c b/opal/mca/btl/ugni/btl_ugni_module.c index 311c3a1759..a884ef59f9 100644 --- a/opal/mca/btl/ugni/btl_ugni_module.c +++ b/opal/mca/btl/ugni/btl_ugni_module.c @@ -215,17 +215,8 @@ mca_btl_ugni_module_finalize (struct mca_btl_base_module_t *btl) OBJ_DESTRUCT(&ugni_module->eager_get_pending); OBJ_DESTRUCT(&ugni_module->eager_get_pending_lock); - if (ugni_module->initialized) { - /* need to tear down the mpools *after* the free lists */ - if (NULL != ugni_module->smsg_mpool) { - (void) mca_mpool_base_module_destroy (ugni_module->smsg_mpool); - ugni_module->smsg_mpool = NULL; - } - - if (NULL != ugni_module->super.btl_mpool) { - (void) mca_mpool_base_module_destroy (ugni_module->super.btl_mpool); - ugni_module->super.btl_mpool = NULL; - } + if (ugni_module->rcache) { + mca_rcache_base_module_destroy (ugni_module->rcache); } ugni_module->initialized = false; @@ -303,12 +294,13 @@ static mca_btl_base_registration_handle_t * mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *base, size_t size, uint32_t flags) { + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; mca_btl_ugni_reg_t *reg; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, base, size, 0, access_flags, - (mca_mpool_base_registration_t **) &reg); + rc = ugni_module->rcache->rcache_register (ugni_module->rcache, base, size, 0,
access_flags, + (mca_rcache_base_registration_t **) &reg); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; } @@ -318,10 +310,11 @@ mca_btl_ugni_register_mem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t * static int mca_btl_ugni_deregister_mem (mca_btl_base_module_t *btl, mca_btl_base_registration_handle_t *handle) { + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; mca_btl_ugni_reg_t *reg = (mca_btl_ugni_reg_t *)((intptr_t) handle - offsetof (mca_btl_ugni_reg_t, handle)); - (void) btl->btl_mpool->mpool_deregister (btl->btl_mpool, &reg->base); + (void) ugni_module->rcache->rcache_deregister (ugni_module->rcache, &reg->base); return OPAL_SUCCESS; } diff --git a/opal/mca/btl/ugni/btl_ugni_prepare.h b/opal/mca/btl/ugni/btl_ugni_prepare.h index 4988cf094d..093c9f6cb0 100644 --- a/opal/mca/btl/ugni/btl_ugni_prepare.h +++ b/opal/mca/btl/ugni/btl_ugni_prepare.h @@ -57,6 +57,7 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, uint32_t flags) { bool use_eager_get = (*size + reserve) > mca_btl_ugni_component.smsg_max_data; + mca_btl_ugni_module_t *ugni_module = (mca_btl_ugni_module_t *) btl; mca_btl_ugni_base_frag_t *frag = NULL; mca_btl_ugni_reg_t *registration = NULL; void *data_ptr; @@ -74,9 +75,9 @@ mca_btl_ugni_prepare_src_send_inplace (struct mca_btl_base_module_t *btl, (unsigned int)(*size + reserve))); if (OPAL_UNLIKELY(true == use_eager_get)) { - rc = btl->btl_mpool->mpool_register(btl->btl_mpool, data_ptr, *size, 0, - MCA_MPOOL_ACCESS_REMOTE_READ, - (mca_mpool_base_registration_t **)&registration); + rc = ugni_module->rcache->rcache_register (ugni_module->rcache, data_ptr, *size, 0, + MCA_RCACHE_ACCESS_REMOTE_READ, + (mca_rcache_base_registration_t **)&registration); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { mca_btl_ugni_frag_return (frag); return NULL; diff --git a/opal/mca/btl/ugni/btl_ugni_smsg.c b/opal/mca/btl/ugni/btl_ugni_smsg.c index 5d9ea1eef6..b7848bfc66 100644 --- a/opal/mca/btl/ugni/btl_ugni_smsg.c +++
b/opal/mca/btl/ugni/btl_ugni_smsg.c @@ -16,8 +16,8 @@ static void mca_btl_ugni_smsg_mbox_construct (mca_btl_ugni_smsg_mbox_t *mbox) { struct mca_btl_ugni_reg_t *ugni_reg = (struct mca_btl_ugni_reg_t *) mbox->super.registration; - struct mca_mpool_base_registration_t *base_reg = - (struct mca_mpool_base_registration_t *) ugni_reg; + mca_rcache_base_registration_t *base_reg = + (mca_rcache_base_registration_t *) ugni_reg; /* initialize mailbox attributes */ mbox->attr.smsg_attr.msg_type = GNI_SMSG_TYPE_MBOX_AUTO_RETRANSMIT; diff --git a/opal/mca/btl/vader/btl_vader.h b/opal/mca/btl/vader/btl_vader.h index 90eedba430..14e3b2dc11 100644 --- a/opal/mca/btl/vader/btl_vader.h +++ b/opal/mca/btl/vader/btl_vader.h @@ -47,8 +47,7 @@ #include "opal/class/opal_free_list.h" #include "opal/sys/atomic.h" #include "opal/mca/btl/btl.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/mca/mpool/base/base.h" +#include "opal/mca/rcache/rcache.h" #include "opal/mca/btl/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" @@ -152,6 +151,9 @@ struct mca_btl_vader_t { mca_btl_base_module_error_cb_fn_t error_cb; #if OPAL_BTL_VADER_HAVE_KNEM int knem_fd; + + /* registration cache */ + mca_rcache_base_module_t *knem_rcache; #endif }; typedef struct mca_btl_vader_t mca_btl_vader_t; diff --git a/opal/mca/btl/vader/btl_vader_component.c b/opal/mca/btl/vader/btl_vader_component.c index 9e2b884a6f..23a93a3bbf 100644 --- a/opal/mca/btl/vader/btl_vader_component.c +++ b/opal/mca/btl/vader/btl_vader_component.c @@ -573,13 +573,14 @@ void mca_btl_vader_poll_handle_frag (mca_btl_vader_hdr_t *hdr, struct mca_btl_ba segments[0].seg_len = hdr->len; if (hdr->flags & MCA_BTL_VADER_FLAG_SINGLE_COPY) { - mca_mpool_base_registration_t *xpmem_reg; + mca_rcache_base_registration_t *xpmem_reg; xpmem_reg = vader_get_registation (endpoint, hdr->sc_iov.iov_base, hdr->sc_iov.iov_len, 0, &segments[1].seg_addr.pval); + assert (NULL != xpmem_reg); - segments[1].seg_len = 
hdr->sc_iov.iov_len; + segments[1].seg_len = hdr->sc_iov.iov_len; frag.des_segment_count = 2; /* recv upcall */ diff --git a/opal/mca/btl/vader/btl_vader_endpoint.h b/opal/mca/btl/vader/btl_vader_endpoint.h index 2fd957dfbb..9a90feb2c6 100644 --- a/opal/mca/btl/vader/btl_vader_endpoint.h +++ b/opal/mca/btl/vader/btl_vader_endpoint.h @@ -28,6 +28,7 @@ #include "opal_config.h" #include "btl_vader_xpmem.h" +#include "opal/mca/rcache/base/rcache_base_vma.h" #define MCA_BTL_VADER_FBOX_ALIGNMENT 32 #define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1) @@ -74,7 +75,7 @@ typedef struct mca_btl_base_endpoint_t { union { #if OPAL_BTL_VADER_HAVE_XPMEM struct { - struct mca_rcache_base_module_t *rcache; + mca_rcache_base_vma_module_t *vma_module; xpmem_apid_t apid; /**< xpmem apid for remote peer */ } xpmem; #endif diff --git a/opal/mca/btl/vader/btl_vader_get.c b/opal/mca/btl/vader/btl_vader_get.c index ce8d7b89d8..f77a1df821 100644 --- a/opal/mca/btl/vader/btl_vader_get.c +++ b/opal/mca/btl/vader/btl_vader_get.c @@ -38,7 +38,7 @@ int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_mpool_base_registration_t *reg; + mca_rcache_base_registration_t *reg; void *rem_ptr; /* silence warning about unused arguments */ diff --git a/opal/mca/btl/vader/btl_vader_knem.c b/opal/mca/btl/vader/btl_vader_knem.c index 8a270fafa6..96a7e77527 100644 --- a/opal/mca/btl/vader/btl_vader_knem.c +++ b/opal/mca/btl/vader/btl_vader_knem.c @@ -19,12 +19,11 @@ #include #include "opal/util/show_help.h" -#include "opal/mca/mpool/grdma/mpool_grdma.h" -OBJ_CLASS_INSTANCE(mca_btl_vader_registration_handle_t, mca_mpool_base_registration_t, NULL, NULL); +OBJ_CLASS_INSTANCE(mca_btl_vader_registration_handle_t, mca_rcache_base_registration_t, NULL, NULL); static int mca_btl_vader_knem_reg (void 
*reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg) + mca_rcache_base_registration_t *reg) { mca_btl_vader_registration_handle_t *knem_reg = (mca_btl_vader_registration_handle_t *) reg; struct knem_cmd_create_region knem_cr; @@ -37,11 +36,11 @@ static int mca_btl_vader_knem_reg (void *reg_data, void *base, size_t size, knem_cr.iovec_nr = 1; knem_cr.protection = 0; - if (reg->access_flags & (MCA_MPOOL_ACCESS_LOCAL_WRITE | MCA_MPOOL_ACCESS_REMOTE_WRITE)) { + if (reg->access_flags & (MCA_RCACHE_ACCESS_LOCAL_WRITE | MCA_RCACHE_ACCESS_REMOTE_WRITE)) { knem_cr.protection |= PROT_WRITE; } - if (reg->access_flags & MCA_MPOOL_ACCESS_REMOTE_READ) { + if (reg->access_flags & MCA_RCACHE_ACCESS_REMOTE_READ) { knem_cr.protection |= PROT_READ; } @@ -57,7 +56,7 @@ static int mca_btl_vader_knem_reg (void *reg_data, void *base, size_t size, return OPAL_SUCCESS; } -static int mca_btl_vader_knem_dereg (void *reg_data, mca_mpool_base_registration_t *reg) +static int mca_btl_vader_knem_dereg (void *reg_data, mca_rcache_base_registration_t *reg) { mca_btl_vader_registration_handle_t *knem_reg = (mca_btl_vader_registration_handle_t *) reg; @@ -72,12 +71,14 @@ mca_btl_vader_register_mem_knem (struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, size_t size, uint32_t flags) { + mca_btl_vader_t *vader_module = (mca_btl_vader_t *) btl; mca_btl_vader_registration_handle_t *reg = NULL; int access_flags = flags & MCA_BTL_REG_FLAG_ACCESS_ANY; int rc; - rc = btl->btl_mpool->mpool_register (btl->btl_mpool, base, size, 0, access_flags, - (mca_mpool_base_registration_t **) ®); + rc = vader_module->knem_rcache->rcache_register (vader_module->knem_rcache, base, size, 0, + access_flags, + (mca_rcache_base_registration_t **) ®); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return NULL; } @@ -88,18 +89,19 @@ mca_btl_vader_register_mem_knem (struct mca_btl_base_module_t* btl, static int mca_btl_vader_deregister_mem_knem (struct mca_btl_base_module_t 
*btl, struct mca_btl_base_registration_handle_t *handle) { + mca_btl_vader_t *vader_module = (mca_btl_vader_t *) btl; mca_btl_vader_registration_handle_t *reg = (mca_btl_vader_registration_handle_t *)((intptr_t) handle - offsetof (mca_btl_vader_registration_handle_t, btl_handle)); - btl->btl_mpool->mpool_deregister (btl->btl_mpool, ®->base); + vader_module->knem_rcache->rcache_deregister (vader_module->knem_rcache, ®->base); return OPAL_SUCCESS; } int mca_btl_vader_knem_init (void) { - mca_mpool_base_resources_t mpool_resources = { - .pool_name = "vader", .reg_data = NULL, + mca_rcache_base_resources_t rcache_resources = { + .cache_name = "vader", .reg_data = NULL, .sizeof_reg = sizeof (mca_btl_vader_registration_handle_t), .register_mem = mca_btl_vader_knem_reg, .deregister_mem = mca_btl_vader_knem_dereg @@ -107,6 +109,7 @@ int mca_btl_vader_knem_init (void) struct knem_cmd_info knem_info; int rc; + signal (SIGSEGV, SIG_DFL); /* Open the knem device. Try to print a helpful message if we fail to open it. */ mca_btl_vader.knem_fd = open("/dev/knem", O_RDWR); @@ -130,6 +133,7 @@ int mca_btl_vader_knem_init (void) do { /* Check that the ABI if kernel module running is the same * as what we were compiled against. 
*/ + memset (&knem_info, 0, sizeof (knem_info)); rc = ioctl(mca_btl_vader.knem_fd, KNEM_CMD_GET_INFO, &knem_info); if (rc < 0) { opal_show_help("help-btl-vader.txt", "knem get ABI fail", @@ -161,9 +165,9 @@ int mca_btl_vader_knem_init (void) mca_btl_vader.super.btl_deregister_mem = mca_btl_vader_deregister_mem_knem; mca_btl_vader.super.btl_registration_handle_size = sizeof (mca_btl_base_registration_handle_t); - mca_btl_vader.super.btl_mpool = mca_mpool_base_module_create ("grdma", NULL, - &mpool_resources); - if (NULL == mca_btl_vader.super.btl_mpool) { + mca_btl_vader.knem_rcache = mca_rcache_base_module_create ("grdma", NULL, + &rcache_resources); + if (NULL == mca_btl_vader.knem_rcache) { return OPAL_ERR_OUT_OF_RESOURCE; } @@ -182,9 +186,9 @@ int mca_btl_vader_knem_fini (void) mca_btl_vader.knem_fd = -1; } - if (mca_btl_vader.super.btl_mpool) { - (void) mca_mpool_base_module_destroy (mca_btl_vader.super.btl_mpool); - mca_btl_vader.super.btl_mpool = NULL; + if (mca_btl_vader.knem_rcache) { + (void) mca_rcache_base_module_destroy (mca_btl_vader.knem_rcache); + mca_btl_vader.knem_rcache = NULL; } return OPAL_SUCCESS; diff --git a/opal/mca/btl/vader/btl_vader_knem.h b/opal/mca/btl/vader/btl_vader_knem.h index 8d3b840209..76fa6e1054 100644 --- a/opal/mca/btl/vader/btl_vader_knem.h +++ b/opal/mca/btl/vader/btl_vader_knem.h @@ -24,7 +24,7 @@ struct mca_btl_base_registration_handle_t { }; struct mca_btl_vader_registration_handle_t { - mca_mpool_base_registration_t base; + mca_rcache_base_registration_t base; mca_btl_base_registration_handle_t btl_handle; }; typedef struct mca_btl_vader_registration_handle_t mca_btl_vader_registration_handle_t; diff --git a/opal/mca/btl/vader/btl_vader_module.c b/opal/mca/btl/vader/btl_vader_module.c index 708650bb26..4ba4549c56 100644 --- a/opal/mca/btl/vader/btl_vader_module.c +++ b/opal/mca/btl/vader/btl_vader_module.c @@ -171,9 +171,9 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_ if 
(MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) { /* always use xpmem if it is available */ ep->segment_data.xpmem.apid = xpmem_get (modex->xpmem.seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666); - ep->segment_data.xpmem.rcache = mca_rcache_base_module_create("vma"); + ep->segment_data.xpmem.vma_module = mca_rcache_base_vma_module_alloc (); (void) vader_get_registation (ep, modex->xpmem.segment_base, mca_btl_vader_component.segment_size, - MCA_MPOOL_FLAGS_PERSIST, (void **) &ep->segment_base); + MCA_RCACHE_FLAGS_PERSIST, (void **) &ep->segment_base); } else { #endif /* store a copy of the segment information for detach */ @@ -434,6 +434,7 @@ static struct mca_btl_base_descriptor_t *vader_prepare_src (struct mca_btl_base_ int rc; opal_convertor_get_current_pointer (convertor, &data_ptr); + assert (NULL != data_ptr); /* in place send fragment */ if (OPAL_UNLIKELY(opal_convertor_need_buffers(convertor))) { @@ -545,16 +546,15 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep) #if OPAL_BTL_VADER_HAVE_XPMEM if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) { - if (ep->segment_data.xpmem.rcache) { + if (ep->segment_data.xpmem.vma_module) { /* clean out the registration cache */ const int nregs = 100; - mca_mpool_base_registration_t *regs[nregs]; + mca_rcache_base_registration_t *regs[nregs]; int reg_cnt; do { - reg_cnt = ep->segment_data.xpmem.rcache->rcache_find_all(ep->segment_data.xpmem.rcache, 0, (size_t)-1, - regs, nregs); - + reg_cnt = mca_rcache_base_vma_find_all (ep->segment_data.xpmem.vma_module, + 0, (size_t) -1, regs, nregs); for (int i = 0 ; i < reg_cnt ; ++i) { /* otherwise dereg will fail on assert */ regs[i]->ref_count = 0; @@ -562,7 +562,7 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep) } } while (reg_cnt == nregs); - ep->segment_data.xpmem.rcache = NULL; + ep->segment_data.xpmem.vma_module = NULL; } if (ep->segment_base) { diff --git 
a/opal/mca/btl/vader/btl_vader_put.c b/opal/mca/btl/vader/btl_vader_put.c index 3107f420b3..c3d2112412 100644 --- a/opal/mca/btl/vader/btl_vader_put.c +++ b/opal/mca/btl/vader/btl_vader_put.c @@ -40,7 +40,7 @@ int mca_btl_vader_put_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_mpool_base_registration_t *reg; + mca_rcache_base_registration_t *reg; void *rem_ptr; reg = vader_get_registation (endpoint, (void *)(intptr_t) remote_address, size, 0, &rem_ptr); diff --git a/opal/mca/btl/vader/btl_vader_xpmem.c b/opal/mca/btl/vader/btl_vader_xpmem.c index 0836b6a195..f1fcf8b8fb 100644 --- a/opal/mca/btl/vader/btl_vader_xpmem.c +++ b/opal/mca/btl/vader/btl_vader_xpmem.c @@ -34,13 +34,14 @@ int mca_btl_vader_xpmem_init (void) /* look up the remote pointer in the peer rcache and attach if * necessary */ -mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr, - size_t size, int flags, void **local_ptr) +mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr, + size_t size, int flags, void **local_ptr) { - struct mca_rcache_base_module_t *rcache = ep->segment_data.xpmem.rcache; - mca_mpool_base_registration_t *regs[10], *reg = NULL; + mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module; + mca_rcache_base_registration_t *regs[10], *reg = NULL; xpmem_addr_t xpmem_addr; uintptr_t base, bound; + uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align; int rc, i; /* protect rcache access */ @@ -49,15 +50,14 @@ mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoi /* use btl/self for self communication */ assert (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK); - base = (uintptr_t) down_align_addr(rem_ptr, 
mca_btl_vader_component.log_attach_align); - bound = (uintptr_t) up_align_addr((void *)((uintptr_t) rem_ptr + size - 1), - mca_btl_vader_component.log_attach_align) + 1; + base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t); + bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1; if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) { bound = VADER_MAX_ADDRESS; } /* several segments may match the base pointer */ - rc = rcache->rcache_find_all (rcache, (void *) base, bound - base, regs, 10); + rc = mca_rcache_base_vma_find_all (vma_module, (void *) base, bound - base, regs, 10); for (i = 0 ; i < rc ; ++i) { if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) { (void)opal_atomic_add (®s[i]->ref_count, 1); @@ -65,13 +65,13 @@ mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoi goto reg_found; } - if (regs[i]->flags & MCA_MPOOL_FLAGS_PERSIST) { + if (regs[i]->flags & MCA_RCACHE_FLAGS_PERSIST) { continue; } /* remove this pointer from the rcache and decrement its reference count (so it is detached later) */ - rc = rcache->rcache_delete (rcache, regs[i]); + rc = mca_rcache_base_vma_delete (vma_module, regs[i]); if (OPAL_UNLIKELY(0 != rc)) { /* someone beat us to it? 
*/ break; @@ -84,14 +84,14 @@ mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoi if (OPAL_LIKELY(0 == regs[i]->ref_count)) { /* this pointer is not in use */ - (void) xpmem_detach (regs[i]->alloc_base); + (void) xpmem_detach (regs[i]->rcache_context); OBJ_RELEASE(regs[i]); } break; } - reg = OBJ_NEW(mca_mpool_base_registration_t); + reg = OBJ_NEW(mca_rcache_base_registration_t); if (OPAL_LIKELY(NULL != reg)) { /* stick around for awhile */ reg->ref_count = 2; @@ -106,21 +106,21 @@ mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoi #endif xpmem_addr.offset = base; - reg->alloc_base = xpmem_attach (xpmem_addr, bound - base, NULL); - if (OPAL_UNLIKELY((void *)-1 == reg->alloc_base)) { + reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL); + if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) { OPAL_THREAD_UNLOCK(&ep->lock); OBJ_RELEASE(reg); return NULL; } - opal_memchecker_base_mem_defined (reg->alloc_base, bound - base); + opal_memchecker_base_mem_defined (reg->rcache_context, bound - base); - rcache->rcache_insert (rcache, reg, 0); + mca_rcache_base_vma_insert (vma_module, reg, 0); } reg_found: opal_atomic_wmb (); - *local_ptr = (void *) ((uintptr_t) reg->alloc_base + + *local_ptr = (void *) ((uintptr_t) reg->rcache_context + (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base)); OPAL_THREAD_UNLOCK(&ep->lock); @@ -128,20 +128,20 @@ reg_found: return reg; } -void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep) +void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep) { - struct mca_rcache_base_module_t *rcache = ep->segment_data.xpmem.rcache; + mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module; int32_t ref_count; ref_count = opal_atomic_add_32 (®->ref_count, -1); - if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_MPOOL_FLAGS_PERSIST))) { + if 
(OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) { /* protect rcache access */ OPAL_THREAD_LOCK(&ep->lock); - rcache->rcache_delete (rcache, reg); + mca_rcache_base_vma_delete (vma_module, reg); OPAL_THREAD_UNLOCK(&ep->lock); - opal_memchecker_base_mem_noaccess (reg->alloc_base, (uintptr_t)(reg->bound - reg->base)); - (void)xpmem_detach (reg->alloc_base); + opal_memchecker_base_mem_noaccess (reg->rcache_context, (uintptr_t)(reg->bound - reg->base)); + (void)xpmem_detach (reg->rcache_context); OBJ_RELEASE (reg); } } diff --git a/opal/mca/btl/vader/btl_vader_xpmem.h b/opal/mca/btl/vader/btl_vader_xpmem.h index 4a4f3371e6..a90023e937 100644 --- a/opal/mca/btl/vader/btl_vader_xpmem.h +++ b/opal/mca/btl/vader/btl_vader_xpmem.h @@ -25,6 +25,9 @@ typedef int64_t xpmem_apid_t; #endif +#include +#include + /* look up the remote pointer in the peer rcache and attach if * necessary */ @@ -33,15 +36,15 @@ int mca_btl_vader_xpmem_init (void); -mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr, - size_t size, int flags, void **local_ptr); +mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr, + size_t size, int flags, void **local_ptr); -void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint); +void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint); #else -static inline mca_mpool_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr, - size_t size, int flags, void **local_ptr) +static inline mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *endpoint, void *rem_ptr, + size_t size, int flags, void **local_ptr) { (void) endpoint; (void) rem_ptr; @@ -51,7 +54,7 @@ static inline mca_mpool_base_registration_t *vader_get_registation (struct mca_b return 
NULL; } -static inline void vader_return_registration (mca_mpool_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint) +static inline void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint) { (void) reg; (void) endpoint; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 0afe0dd94a..94886739fb 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -38,7 +38,7 @@ #include "opal/util/proc.h" #include "opal/util/argv.h" -#include "opal/mca/mpool/base/base.h" +#include "opal/mca/rcache/base/base.h" #include "opal/runtime/opal_params.h" #include "opal/mca/timer/base/base.h" #include "opal/mca/dl/base/base.h" @@ -712,7 +712,7 @@ static int mca_common_cuda_stage_three_init(void) OPAL_PROC_MY_HOSTNAME, res, mem_reg->msg); } else { opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on mpool %s: " + "CUDA: cuMemHostRegister OK on rcache %s: " "address=%p, bufsize=%d", mem_reg->msg, mem_reg->ptr, (int)mem_reg->amount); } @@ -795,7 +795,7 @@ static int mca_common_cuda_stage_three_init(void) * Cleanup all CUDA resources. * * Note: Still figuring out how to get cuMemHostUnregister called from the smcuda sm - * mpool. Looks like with the memory pool from openib (grdma), the unregistering is + * rcache. Looks like with the memory pool from openib (grdma), the unregistering is * called as the free list is destructed. Not true for the sm mpool. This means we * are currently still leaking some host memory we registered with CUDA. 
*/ @@ -949,7 +949,7 @@ void mca_common_cuda_register(void *ptr, size_t amount, char *msg) { OPAL_PROC_MY_HOSTNAME, res, msg); } else { opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostRegister OK on mpool %s: " + "CUDA: cuMemHostRegister OK on rcache %s: " "address=%p, bufsize=%d", msg, ptr, (int)amount); } @@ -984,12 +984,12 @@ void mca_common_cuda_unregister(void *ptr, char *msg) { /* If unregistering the memory fails, just continue. This is during * shutdown. Only print when running in verbose mode. */ opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, mpool=%s", + "CUDA: cuMemHostUnregister failed: ptr=%p, res=%d, rcache=%s", ptr, res, msg); } else { opal_output_verbose(20, mca_common_cuda_output, - "CUDA: cuMemHostUnregister OK on mpool %s: " + "CUDA: cuMemHostUnregister OK on rcache %s: " "address=%p", msg, ptr); } @@ -1001,8 +1001,8 @@ void mca_common_cuda_unregister(void *ptr, char *msg) { * to the remote size so it can access the memory. This is the * registration function for the sending side of a message transfer. */ -int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg) +int cuda_getmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, + mca_rcache_base_registration_t *hdrreg) { CUmemorytype memType; @@ -1011,7 +1011,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne CUdeviceptr pbase; size_t psize; - mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; + mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)newreg; memHandle = (CUipcMemHandle *)cuda_reg->data.memHandle; /* We should only be there if this is a CUDA device pointer */ @@ -1090,11 +1090,11 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne * This function is called by the local side that called the cuda_getmemhandle. 
* There is nothing to be done so just return. */ -int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg) +int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg) { opal_output_verbose(10, mca_common_cuda_output, "CUDA: cuda_ungetmemhandle (no-op): base=%p", reg->base); - CUDA_DUMP_MEMHANDLE((100, ((mca_mpool_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle")); + CUDA_DUMP_MEMHANDLE((100, ((mca_rcache_common_cuda_reg_t *)reg)->data.memHandle, "cuda_ungetmemhandle")); return OPAL_SUCCESS; } @@ -1105,12 +1105,12 @@ int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg) * remote side of a transfer. newreg contains the new handle. hddrreg contains * the memory handle that was received from the remote side. */ -int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg) +int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, + mca_rcache_base_registration_t *hdrreg) { CUresult result; CUipcMemHandle *memHandle; - mca_mpool_common_cuda_reg_t *cuda_newreg = (mca_mpool_common_cuda_reg_t*)newreg; + mca_rcache_common_cuda_reg_t *cuda_newreg = (mca_rcache_common_cuda_reg_t*)newreg; /* Save in local variable to avoid ugly casting */ memHandle = (CUipcMemHandle *)cuda_newreg->data.memHandle; @@ -1147,10 +1147,10 @@ int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *n /* * Close a memory handle that refers to remote memory. */ -int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg) +int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg) { CUresult result; - mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)reg; + mca_rcache_common_cuda_reg_t *cuda_reg = (mca_rcache_common_cuda_reg_t*)reg; /* Only attempt to close if we have valid context. 
This can change if a call * to the fini function is made and we discover context is gone. */ @@ -1213,7 +1213,7 @@ void mca_common_cuda_destruct_event(uintptr_t event) * Put remote event on stream to ensure that the the start of the * copy does not start until the completion of the event. */ -void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg) +void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg) { #if OPAL_CUDA_SYNC_MEMOPS /* No need for any of this with SYNC_MEMOPS feature */ @@ -1643,8 +1643,8 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { * Need to make sure the handle we are retrieving from the cache is still * valid. Compare the cached handle to the one received. */ -int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg, - mca_mpool_common_cuda_reg_t *old_reg) +int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, + mca_rcache_common_cuda_reg_t *old_reg) { if (0 == memcmp(new_reg->data.memHandle, old_reg->data.memHandle, sizeof(new_reg->data.memHandle))) { @@ -2008,7 +2008,7 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) * not matching the BUFFER_ID of the buffer we are checking. Return false * if the registration is still good. */ -bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg) +bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg) { int res; unsigned long long bufID; @@ -2040,7 +2040,7 @@ bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg) * Also set SYNC_MEMOPS on any GPU registration to ensure that * synchronous copies complete before the buffer is accessed. 
*/ -void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg) +void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg) { int res; unsigned long long bufID = 0; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index c0cd59c359..3ff9540529 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,20 +28,20 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 -struct mca_mpool_common_cuda_reg_data_t { +struct mca_rcache_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; uint64_t evtHandle[EVTHANDLE_SIZE]; uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; }; -typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; +typedef struct mca_rcache_common_cuda_reg_data_t mca_rcache_common_cuda_reg_data_t; -struct mca_mpool_common_cuda_reg_t { - mca_mpool_base_registration_t base; - mca_mpool_common_cuda_reg_data_t data; +struct mca_rcache_common_cuda_reg_t { + mca_rcache_base_registration_t base; + mca_rcache_common_cuda_reg_data_t data; }; -typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t; +typedef struct mca_rcache_common_cuda_reg_t mca_rcache_common_cuda_reg_t; extern bool mca_common_cuda_enabled; OPAL_DECLSPEC void mca_common_cuda_register_mca_variables(void); @@ -50,7 +50,7 @@ OPAL_DECLSPEC void mca_common_cuda_register(void *ptr, size_t amount, char *msg) OPAL_DECLSPEC void mca_common_cuda_unregister(void *ptr, char *msg); -OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_mpool_common_cuda_reg_t *rget_reg); +OPAL_DECLSPEC void mca_common_wait_stream_synchronize(mca_rcache_common_cuda_reg_t *rget_reg); OPAL_DECLSPEC int mca_common_cuda_memcpy(void *dst, void *src, size_t amount, char *msg, struct mca_btl_base_descriptor_t *, int *done); @@ -69,26 +69,26 @@ OPAL_DECLSPEC int progress_one_cuda_ipc_event(struct mca_btl_base_descriptor_t * OPAL_DECLSPEC int progress_one_cuda_dtoh_event(struct 
mca_btl_base_descriptor_t **); OPAL_DECLSPEC int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **); -OPAL_DECLSPEC int mca_common_cuda_memhandle_matches(mca_mpool_common_cuda_reg_t *new_reg, - mca_mpool_common_cuda_reg_t *old_reg); +OPAL_DECLSPEC int mca_common_cuda_memhandle_matches(mca_rcache_common_cuda_reg_t *new_reg, + mca_rcache_common_cuda_reg_t *old_reg); OPAL_DECLSPEC void mca_common_cuda_construct_event_and_handle(uintptr_t *event, void *handle); OPAL_DECLSPEC void mca_common_cuda_destruct_event(uintptr_t event); -OPAL_DECLSPEC int cuda_getmemhandle(void *base, size_t, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_mpool_base_registration_t *reg); -OPAL_DECLSPEC int cuda_openmemhandle(void *base, size_t size, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg); -OPAL_DECLSPEC int cuda_closememhandle(void *reg_data, mca_mpool_base_registration_t *reg); +OPAL_DECLSPEC int cuda_getmemhandle(void *base, size_t, mca_rcache_base_registration_t *newreg, + mca_rcache_base_registration_t *hdrreg); +OPAL_DECLSPEC int cuda_ungetmemhandle(void *reg_data, mca_rcache_base_registration_t *reg); +OPAL_DECLSPEC int cuda_openmemhandle(void *base, size_t size, mca_rcache_base_registration_t *newreg, + mca_rcache_base_registration_t *hdrreg); +OPAL_DECLSPEC int cuda_closememhandle(void *reg_data, mca_rcache_base_registration_t *reg); OPAL_DECLSPEC int mca_common_cuda_get_device(int *devicenum); OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, int dev2); OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); #if OPAL_CUDA_GDR_SUPPORT -OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); -OPAL_DECLSPEC void 
mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); +OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_rcache_base_registration_t *reg); +OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_rcache_base_registration_t *reg); #endif /* OPAL_CUDA_GDR_SUPPORT */ /** * Return: 0 if no packing is required for sending (the upper layer diff --git a/opal/mca/common/cuda/help-mpi-common-cuda.txt b/opal/mca/common/cuda/help-mpi-common-cuda.txt index 9eb01e1fb7..a1877c35d6 100644 --- a/opal/mca/common/cuda/help-mpi-common-cuda.txt +++ b/opal/mca/common/cuda/help-mpi-common-cuda.txt @@ -41,13 +41,13 @@ NOTE: You can turn off this warning by setting the MCA parameter The call to cuMemHostRegister(%p, %d, 0) failed. Host: %s cuMemHostRegister return value: %d - Memory Pool: %s + Registration cache: %s # [cuMemHostRegister failed] The call to cuMemHostRegister(%p, %d, 0) failed. Host: %s cuMemHostRegister return value: %d - Memory Pool: %s + Registration cache: %s # [cuIpcGetMemHandle failed] The call to cuIpcGetMemHandle failed. This means the GPU RDMA protocol diff --git a/opal/mca/common/sm/Makefile.am b/opal/mca/common/sm/Makefile.am index 46e8b258bb..ba57c100da 100644 --- a/opal/mca/common/sm/Makefile.am +++ b/opal/mca/common/sm/Makefile.am @@ -10,7 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010-2015 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2010-2013 Los Alamos National Security, LLC. +# Copyright (c) 2010-2015 Los Alamos National Security, LLC. # All rights reserved. # $COPYRIGHT$ # @@ -22,7 +22,7 @@ # A word of explanation... # # This library is linked against various MCA components because all -# shared-memory based components (e.g., mpool, ptl, etc.) need to +# shared-memory based components (e.g., btl/sm, btl/smcuda, etc.) need to # share some common code and data. There's two cases: # # 1. libmca_common_sm.la is a shared library. 
By linking that shared @@ -44,12 +44,14 @@ # Header files headers = \ - common_sm.h + common_sm.h \ + common_sm_mpool.h # Source files sources = \ - common_sm.c + common_sm.c \ + common_sm_mpool.c # Help file diff --git a/opal/mca/common/sm/common_sm.c b/opal/mca/common/sm/common_sm.c index 44854c679a..826e56e01a 100644 --- a/opal/mca/common/sm/common_sm.c +++ b/opal/mca/common/sm/common_sm.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +12,7 @@ * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008-2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2013 Los Alamos National Security, LLC. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved * $COPYRIGHT$ @@ -39,16 +40,13 @@ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" #endif - +#include "common_sm.h" #include "opal/constants.h" -#include "opal/mca/mpool/sm/mpool_sm.h" -OBJ_CLASS_INSTANCE( - mca_common_sm_module_t, - opal_list_item_t, - NULL, - NULL -); + +OBJ_CLASS_INSTANCE(mca_common_sm_module_t,opal_list_item_t, + NULL, NULL); + /* ////////////////////////////////////////////////////////////////////////// */ /* static utility functions */ @@ -258,13 +256,10 @@ mca_common_sm_local_proc_reorder(opal_proc_t **procs, * * @retval addr virtual address */ -void * -mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, - size_t *size, - mca_mpool_base_registration_t **registration) +void *mca_common_sm_seg_alloc (void *ctx, size_t *size) { - mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t *)mpool; - mca_common_sm_seg_header_t *seg = sm_module->sm_common_module->module_seg; + mca_common_sm_module_t *sm_module = (mca_common_sm_module_t *) ctx; + mca_common_sm_seg_header_t *seg = 
sm_module->module_seg; void *addr; opal_atomic_lock(&seg->seg_lock); @@ -275,7 +270,7 @@ mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, size_t fixup; /* add base address to segment offset */ - addr = sm_module->sm_common_module->module_data_addr + seg->seg_offset; + addr = sm_module->module_data_addr + seg->seg_offset; seg->seg_offset += *size; /* fix up seg_offset so next allocation is aligned on a @@ -286,9 +281,7 @@ mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, seg->seg_offset += sizeof(long) - fixup; } } - if (NULL != registration) { - *registration = NULL; - } + opal_atomic_unlock(&seg->seg_lock); return addr; } diff --git a/opal/mca/common/sm/common_sm.h b/opal/mca/common/sm/common_sm.h index 7c4e6310db..819b82f6ee 100644 --- a/opal/mca/common/sm/common_sm.h +++ b/opal/mca/common/sm/common_sm.h @@ -32,7 +32,7 @@ #include "opal/mca/btl/base/base.h" #include "opal/util/proc.h" #include "opal/mca/btl/base/btl_base_error.h" -#include "opal/mca/mpool/mpool.h" +#include "common_sm_mpool.h" BEGIN_C_DECLS @@ -66,6 +66,8 @@ typedef struct mca_common_sm_module_t { unsigned char *module_data_addr; /* shared memory backing facility object that encapsulates shmem info */ opal_shmem_ds_t shmem_ds; + /* memory pool interface to shared-memory region */ + mca_mpool_base_module_t *mpool; } mca_common_sm_module_t; OBJ_CLASS_DECLARATION(mca_common_sm_module_t); @@ -126,10 +128,7 @@ mca_common_sm_module_unlink(mca_common_sm_module_t *modp); /** * callback from the sm mpool */ -OPAL_DECLSPEC extern void * -mca_common_sm_seg_alloc(struct mca_mpool_base_module_t *mpool, - size_t *size, - mca_mpool_base_registration_t **registration); +OPAL_DECLSPEC extern void *mca_common_sm_seg_alloc (void *ctx, size_t *size); /** * This function will release all local resources attached to the @@ -150,6 +149,7 @@ mca_common_sm_fini(mca_common_sm_module_t *mca_common_sm_module); */ OPAL_DECLSPEC extern mca_common_sm_module_t *mca_common_sm_module; + END_C_DECLS 
#endif /* _COMMON_SM_H_ */ diff --git a/opal/mca/mpool/sm/mpool_sm_module.c b/opal/mca/common/sm/common_sm_mpool.c similarity index 52% rename from opal/mca/mpool/sm/mpool_sm_module.c rename to opal/mca/common/sm/common_sm_mpool.c index 4918b75558..be59ec1d64 100644 --- a/opal/mca/mpool/sm/mpool_sm_module.c +++ b/opal/mca/common/sm/common_sm_mpool.c @@ -23,9 +23,10 @@ #include "opal_config.h" #include -#include "opal/mca/mpool/sm/mpool_sm.h" +#include "common_sm_mpool.h" #include "opal/mca/common/sm/common_sm.h" #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/mca/allocator/base/base.h" #ifdef HAVE_UNISTD_H #include #endif @@ -39,22 +40,42 @@ static void sm_module_finalize(mca_mpool_base_module_t* module); +/* + * Returns base address of shared memory mapping. + */ +static void *mca_common_sm_mpool_base (mca_mpool_base_module_t *mpool); + +/** + * Allocate block of shared memory. + */ +static void *mca_common_sm_mpool_alloc (mca_mpool_base_module_t *mpool, + size_t size, size_t align, + uint32_t flags); + +/** + * Free block of shared memory. + */ +static void mca_common_sm_mpool_free(mca_mpool_base_module_t *mpool, + void *addr); + +/** + * Fault Tolerance Event Notification Function + * @param state Checkpoint State + * @return OPAL_SUCCESS or failure status + */ +static int mca_common_sm_mpool_ft_event (int state); + + /* * Initializes the mpool module. 
*/ -void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool) +static void mca_common_sm_mpool_module_init(mca_common_sm_mpool_module_t* mpool) { - mpool->super.mpool_component = &mca_mpool_sm_component.super; - mpool->super.mpool_base = mca_mpool_sm_base; - mpool->super.mpool_alloc = mca_mpool_sm_alloc; - mpool->super.mpool_realloc = mca_mpool_sm_realloc; - mpool->super.mpool_free = mca_mpool_sm_free; - mpool->super.mpool_find = NULL; - mpool->super.mpool_register = NULL; - mpool->super.mpool_deregister = NULL; - mpool->super.mpool_release_memory = NULL; + mpool->super.mpool_base = mca_common_sm_mpool_base; + mpool->super.mpool_alloc = mca_common_sm_mpool_alloc; + mpool->super.mpool_free = mca_common_sm_mpool_free; mpool->super.mpool_finalize = sm_module_finalize; - mpool->super.mpool_ft_event = mca_mpool_sm_ft_event; + mpool->super.mpool_ft_event = mca_common_sm_mpool_ft_event; mpool->super.flags = 0; mpool->sm_size = 0; @@ -64,12 +85,74 @@ void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool) mpool->mem_node = -1; } +mca_mpool_base_module_t *common_sm_mpool_create (mca_common_sm_mpool_resources_t *resources) +{ + mca_common_sm_mpool_module_t *mpool_module; + mca_allocator_base_component_t* allocator_component; + + /* Make a new mpool module */ + mpool_module = (mca_common_sm_mpool_module_t *) calloc (1, sizeof (*mpool_module)); + mca_common_sm_mpool_module_init(mpool_module); + + /* set sm_size */ + mpool_module->sm_size = resources->size; + + allocator_component = mca_allocator_component_lookup(resources->allocator); + + /* if specified allocator cannot be loaded - fall back to the first available allocator component, if any */ + if (NULL == allocator_component) { + if (opal_list_get_size(&opal_allocator_base_framework.framework_components) > 0) { + mca_base_component_list_item_t *item = + (mca_base_component_list_item_t *) + opal_list_get_first(&opal_allocator_base_framework.framework_components); + allocator_component = + (mca_allocator_base_component_t *)item->cli_component; + 
opal_output( + 0, "mca_common_sm_mpool_init: " + "unable to locate allocator: %s - using %s\n", + resources->allocator, + allocator_component->allocator_version.mca_component_name); + } else { + opal_output(0, "mca_common_sm_mpool_init: " + "unable to locate allocator: %s\n", + resources->allocator); + free(mpool_module); + return NULL; + } + } + + mpool_module->mem_node = resources->mem_node; + + if (NULL == (mpool_module->sm_common_module = + mca_common_sm_module_attach(&resources->bs_meta_buf, + sizeof(mca_common_sm_module_t), 8))) { + opal_output(0, "mca_common_sm_mpool_init: " + "unable to create shared memory mapping (%s)", + resources->bs_meta_buf.seg_name); + free(mpool_module); + return NULL; + } + + /* setup allocator */ + mpool_module->sm_allocator = + allocator_component->allocator_init (true, mca_common_sm_seg_alloc, + NULL, mpool_module->sm_common_module); + if (NULL == mpool_module->sm_allocator) { + opal_output(0, "mca_common_sm_mpool_init: unable to initialize allocator"); + free(mpool_module); + return NULL; + } + + return &mpool_module->super; +} + + /* * base address of shared memory mapping */ -void* mca_mpool_sm_base(mca_mpool_base_module_t* mpool) +static void *mca_common_sm_mpool_base(mca_mpool_base_module_t *mpool) { - mca_mpool_sm_module_t *sm_mpool = (mca_mpool_sm_module_t*) mpool; + mca_common_sm_mpool_module_t *sm_mpool = (mca_common_sm_mpool_module_t *) mpool; return (NULL != sm_mpool->sm_common_module) ? 
sm_mpool->sm_common_module->module_seg_addr : NULL; } @@ -77,43 +160,16 @@ void* mca_mpool_sm_base(mca_mpool_base_module_t* mpool) /** * allocate function */ -void* mca_mpool_sm_alloc( - mca_mpool_base_module_t* mpool, - size_t size, - size_t align, - uint32_t flags, - mca_mpool_base_registration_t** registration) +static void *mca_common_sm_mpool_alloc (mca_mpool_base_module_t* mpool, + size_t size, size_t align, uint32_t flags) { - mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool; + mca_common_sm_mpool_module_t* mpool_sm = (mca_common_sm_mpool_module_t*)mpool; opal_hwloc_base_memory_segment_t mseg; mseg.mbs_start_addr = - mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size, align, registration); + mpool_sm->sm_allocator->alc_alloc(mpool_sm->sm_allocator, size, align); - if(mpool_sm->mem_node >= 0) { - mseg.mbs_len = size; - opal_hwloc_base_membind(&mseg, 1, mpool_sm->mem_node); - } - - return mseg.mbs_start_addr; -} - -/** - * realloc function - */ -void* mca_mpool_sm_realloc( - mca_mpool_base_module_t* mpool, - void* addr, - size_t size, - mca_mpool_base_registration_t** registration) -{ - mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool; - opal_hwloc_base_memory_segment_t mseg; - - mseg.mbs_start_addr = - mpool_sm->sm_allocator->alc_realloc(mpool_sm->sm_allocator, addr, size, - registration); - if(mpool_sm->mem_node >= 0) { + if (mpool_sm->mem_node >= 0) { mseg.mbs_len = size; opal_hwloc_base_membind(&mseg, 1, mpool_sm->mem_node); } @@ -124,16 +180,15 @@ void* mca_mpool_sm_realloc( /** * free function */ -void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr, - mca_mpool_base_registration_t* registration) +void mca_common_sm_mpool_free(mca_mpool_base_module_t *mpool, void *addr) { - mca_mpool_sm_module_t* mpool_sm = (mca_mpool_sm_module_t*)mpool; + mca_common_sm_mpool_module_t* mpool_sm = (mca_common_sm_mpool_module_t*)mpool; mpool_sm->sm_allocator->alc_free(mpool_sm->sm_allocator, addr); } static void 
sm_module_finalize(mca_mpool_base_module_t* module) { - mca_mpool_sm_module_t *sm_module = (mca_mpool_sm_module_t*) module; + mca_common_sm_mpool_module_t *sm_module = (mca_common_sm_mpool_module_t*) module; if (NULL != sm_module->sm_common_module) { if (OPAL_SUCCESS == @@ -156,13 +211,13 @@ static void sm_module_finalize(mca_mpool_base_module_t* module) } #if OPAL_ENABLE_FT_CR == 0 -int mca_mpool_sm_ft_event(int state) { +int mca_common_sm_mpool_ft_event(int state) { return OPAL_SUCCESS; } #else -int mca_mpool_sm_ft_event(int state) { +int mca_common_sm_mpool_ft_event(int state) { mca_mpool_base_module_t *self_module = NULL; - mca_mpool_sm_module_t *self_sm_module = NULL; + mca_common_sm_mpool_module_t *self_sm_module = NULL; char * file_name = NULL; if(OPAL_CRS_CHECKPOINT == state) { @@ -181,7 +236,7 @@ int mca_mpool_sm_ft_event(int state) { if (opal_cr_continue_like_restart) { /* Find the sm module */ self_module = mca_mpool_base_module_lookup("sm"); - self_sm_module = (mca_mpool_sm_module_t*) self_module; + self_sm_module = (mca_common_sm_mpool_module_t*) self_module; /* Mark the old sm file for eventual removal via CRS */ if (NULL != self_sm_module->sm_common_module) { @@ -196,7 +251,7 @@ int mca_mpool_sm_ft_event(int state) { OPAL_CRS_RESTART_PRE == state) { /* Find the sm module */ self_module = mca_mpool_base_module_lookup("sm"); - self_sm_module = (mca_mpool_sm_module_t*) self_module; + self_sm_module = (mca_common_sm_mpool_module_t*) self_module; /* Mark the old sm file for eventual removal via CRS */ if (NULL != self_sm_module->sm_common_module) { diff --git a/opal/mca/common/sm/common_sm_mpool.h b/opal/mca/common/sm/common_sm_mpool.h new file mode 100644 index 0000000000..8d70bd51b1 --- /dev/null +++ b/opal/mca/common/sm/common_sm_mpool.h @@ -0,0 +1,62 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. 
All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2015 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_COMMON_SM_MPOOL_H +#define MCA_COMMON_SM_MPOOL_H + +#include "opal_config.h" + +#include "opal/mca/event/event.h" +#include "opal/mca/shmem/shmem.h" + +#include "opal/mca/mpool/mpool.h" +#include "opal/mca/allocator/allocator.h" + +BEGIN_C_DECLS + +struct mca_common_sm_module_t; + +typedef struct mca_common_sm_mpool_resources_t { + size_t size; + int32_t mem_node; + const char *allocator; + /* backing store metadata */ + opal_shmem_ds_t bs_meta_buf; +} mca_common_sm_mpool_resources_t; + +typedef struct mca_common_sm_mpool_module_t { + mca_mpool_base_module_t super; + long sm_size; + mca_allocator_base_module_t *sm_allocator; + struct mca_common_sm_mpool_mmap_t *sm_mmap; + struct mca_common_sm_module_t *sm_common_module; + int32_t mem_node; +} mca_common_sm_mpool_module_t; + +OPAL_DECLSPEC mca_mpool_base_module_t *common_sm_mpool_create (mca_common_sm_mpool_resources_t *); + +END_C_DECLS + +#endif diff --git a/opal/mca/mpool/base/Makefile.am b/opal/mca/mpool/base/Makefile.am index 4ab8570d94..646444e231 100644 --- a/opal/mca/mpool/base/Makefile.am +++ b/opal/mca/mpool/base/Makefile.am @@ -10,6 +10,8 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. 
+# Copyright (c) 2015 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -19,15 +21,14 @@ headers += \ base/base.h \ - base/mpool_base_mem_cb.h \ base/mpool_base_tree.h + libmca_mpool_la_SOURCES += \ base/mpool_base_frame.c \ - base/mpool_base_init.c \ base/mpool_base_lookup.c \ base/mpool_base_alloc.c \ - base/mpool_base_mem_cb.c \ - base/mpool_base_tree.c + base/mpool_base_tree.c \ + base/mpool_base_default.c dist_opaldata_DATA += \ base/help-mpool-base.txt diff --git a/opal/mca/mpool/base/base.h b/opal/mca/mpool/base/base.h index 355a64dfd1..88a99cad01 100644 --- a/opal/mca/mpool/base/base.h +++ b/opal/mca/mpool/base/base.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -10,6 +11,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -30,28 +33,10 @@ BEGIN_C_DECLS -static inline unsigned int my_log2(unsigned long val) { - unsigned int count = 0; - while(val > 0) { - val = val >> 1; - count++; - } - return count > 0 ? 
count-1: 0; -} -static inline void *down_align_addr(void* addr, unsigned int shift) { - return (void*) (((intptr_t) addr) & (~(intptr_t) 0) << shift); -} - -static inline void *up_align_addr(void*addr, unsigned int shift) { - return (void*) ((((intptr_t) addr) | ~((~(intptr_t) 0) << shift))); -} - struct mca_mpool_base_selected_module_t { opal_list_item_t super; mca_mpool_base_component_t *mpool_component; mca_mpool_base_module_t *mpool_module; - void* user_data; - struct mca_mpool_base_resources_t *mpool_resources; }; typedef struct mca_mpool_base_selected_module_t mca_mpool_base_selected_module_t; @@ -65,24 +50,16 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_base_selected_module_t); * Global functions for MCA: overall mpool open and close */ -OPAL_DECLSPEC int mca_mpool_base_init(bool enable_progress_threads, bool enable_mpi_threads); OPAL_DECLSPEC mca_mpool_base_component_t* mca_mpool_base_component_lookup(const char* name); -OPAL_DECLSPEC mca_mpool_base_module_t* mca_mpool_base_module_create( - const char* name, - void* user_data, - struct mca_mpool_base_resources_t* mpool_resources); OPAL_DECLSPEC mca_mpool_base_module_t* mca_mpool_base_module_lookup(const char* name); -OPAL_DECLSPEC int mca_mpool_base_module_destroy(mca_mpool_base_module_t *module); /* * Globals */ extern opal_list_t mca_mpool_base_modules; -OPAL_DECLSPEC extern uint32_t mca_mpool_base_page_size; -OPAL_DECLSPEC extern uint32_t mca_mpool_base_page_size_log; +extern mca_mpool_base_module_t *mca_mpool_base_default_module; +extern int mca_mpool_base_default_priority; -/* only used within base -- no need to DECLSPEC */ -extern int mca_mpool_base_used_mem_hooks; OPAL_DECLSPEC extern mca_base_framework_t opal_mpool_base_framework; diff --git a/opal/mca/mpool/base/help-mpool-base.txt b/opal/mca/mpool/base/help-mpool-base.txt index a1851708f4..17e01111fd 100644 --- a/opal/mca/mpool/base/help-mpool-base.txt +++ b/opal/mca/mpool/base/help-mpool-base.txt @@ -30,31 +30,3 @@ PID: %d %d additional 
leak%s recorded but %s not displayed here. Set the MCA parameter mpi_show_mpi_alloc_mem_leaks to a larger number to see that many leaks, or set it to a negative number to see all leaks. -# -[leave pinned failed] -A process attempted to use the "leave pinned" MPI feature, but no -memory registration hooks were found on the system at run time. This -may be the result of running on a system that does not support memory -hooks or having some other software subvert Open MPI's use of the -memory hooks. You can disable Open MPI's use of memory hooks by -setting both the mpi_leave_pinned and mpi_leave_pinned_pipeline MCA -parameters to 0. - -Open MPI will disable any transports that are attempting to use the -leave pinned functionality; your job may still run, but may fall back -to a slower network transport (such as TCP). - - Mpool name: %s - Process: %s - Local host: %s -# -[cannot deregister in-use memory] -Open MPI intercepted a call to free memory that is still being used by -an ongoing MPI communication. This usually reflects an error in the -MPI application; it may signify memory corruption. Open MPI will now -abort your job. 
- - Mpool name: %s - Local host: %s - Buffer address: %p - Buffer size: %lu diff --git a/opal/mca/mpool/base/mpool_base_alloc.c b/opal/mca/mpool/base/mpool_base_alloc.c index bcb64e3738..605ffbdf28 100644 --- a/opal/mca/mpool/base/mpool_base_alloc.c +++ b/opal/mca/mpool/base/mpool_base_alloc.c @@ -27,7 +27,6 @@ #include "opal/mca/mpool/mpool.h" #include "base.h" #include "mpool_base_tree.h" -#include "mpool_base_mem_cb.h" #include "opal/threads/mutex.h" struct opal_info_t { @@ -44,63 +43,24 @@ struct opal_info_t { }; typedef struct opal_info_t opal_info_t; -/** - * Memory Pool Registration - */ - -static void mca_mpool_base_registration_constructor( mca_mpool_base_registration_t * reg ) -{ - reg->mpool = NULL; - reg->base = NULL; - reg->bound = NULL; - reg->alloc_base = NULL; - reg->ref_count = 0; - reg->flags = 0; -} - -static void mca_mpool_base_registration_destructor( mca_mpool_base_registration_t * reg ) -{ - -} - -OBJ_CLASS_INSTANCE( - mca_mpool_base_registration_t, - opal_free_list_item_t, - mca_mpool_base_registration_constructor, - mca_mpool_base_registration_destructor); static void unregister_tree_item(mca_mpool_base_tree_item_t *mpool_tree_item) { mca_mpool_base_module_t *mpool; - mca_mpool_base_registration_t *reg; - int i; - for(i = 1; i < mpool_tree_item->count; i++) { - mpool = mpool_tree_item->mpools[i]; - reg = mpool_tree_item->regs[i]; - if(mpool && mpool->mpool_deregister) { - mpool->mpool_deregister(mpool, reg); - } - } - - mpool = mpool_tree_item->mpools[0]; - reg = mpool_tree_item->regs[0]; - mpool->mpool_free(mpool, mpool_tree_item->key, reg); + mpool = mpool_tree_item->mpool; + mpool->mpool_free(mpool, mpool_tree_item->key); } /** * Function to allocate special memory according to what the user requests in * the info object. * - * If the user passes in a valid info structure then the function will - * try to allocate the memory and register it with every mpool that there is a - * key for it in the info struct. 
If it fails at registering the memory with - * one of the requested mpools, an error will be returned. Also, if there is a - * key in info that does not match any mpool, an error will be returned. - * * If the info parameter is MPI_INFO_NULL, then this function will try to allocate - * the memory and register it with as many mpools as possible. However, - * if any of the registratons fail the mpool will simply be ignored. + * the memory with the optionally named mpool or malloc and try to register the + * pointer with as many registration caches as possible. Registration caches that + * fail to register the region will be ignored. The mpool name can optionally be + * specified in the info object. * * @param size the size of the memory area to allocate * @param info an info object which tells us what kind of memory to allocate @@ -108,177 +68,38 @@ static void unregister_tree_item(mca_mpool_base_tree_item_t *mpool_tree_item) * @retval pointer to the allocated memory * @retval NULL on failure */ -void *mca_mpool_base_alloc(size_t size, opal_info_t *info) +void *mca_mpool_base_alloc(size_t size, opal_info_t *info, const char *hints) { - opal_list_item_t * item; - int num_modules = opal_list_get_size(&mca_mpool_base_modules); - int reg_module_num = 0, i; - mca_mpool_base_selected_module_t * current; - mca_mpool_base_selected_module_t * no_reg_function = NULL; - mca_mpool_base_selected_module_t ** has_reg_function = NULL; - mca_mpool_base_registration_t * registration; - mca_mpool_base_tree_item_t* mpool_tree_item = NULL; + mca_mpool_base_tree_item_t *mpool_tree_item = NULL; mca_mpool_base_module_t *mpool; - void * mem = NULL; + void *mem = NULL; #if defined(TODO_BTL_GB) int flag = 0; - bool match_found = false; #endif /* defined(TODO_BTL_GB) */ - bool mpool_requested = false; - if(num_modules > 0) { - has_reg_function = (mca_mpool_base_selected_module_t **) - malloc(num_modules * sizeof(mca_mpool_base_module_t *)); - if(!has_reg_function) - goto out; + mpool_tree_item 
= mca_mpool_base_tree_item_get (); + if (!mpool_tree_item) { + return NULL; } - mpool_tree_item = mca_mpool_base_tree_item_get(); - - if(!mpool_tree_item) - goto out; - mpool_tree_item->num_bytes = size; mpool_tree_item->count = 0; -#if defined(TODO_BTL_GB) - if(&ompi_mpi_info_null.info == info) -#endif /* defined(TODO_BTL_GB) */ - { - for(item = opal_list_get_first(&mca_mpool_base_modules); - item != opal_list_get_end(&mca_mpool_base_modules); - item = opal_list_get_next(item)) { - current = ((mca_mpool_base_selected_module_t *) item); - if(current->mpool_module->flags & MCA_MPOOL_FLAGS_MPI_ALLOC_MEM) { - if(NULL == current->mpool_module->mpool_register){ - no_reg_function = current; - } - else { - has_reg_function[reg_module_num++] = current; - } - } - } - } -#if defined(TODO_BTL_GB) - else - { - int num_keys; - char key[MPI_MAX_INFO_KEY + 1]; - char value[MPI_MAX_INFO_VAL + 1]; - ompi_info_get_nkeys(info, &num_keys); - for(i = 0; i < num_keys; i++) - { - ompi_info_get_nthkey(info, i, key); - if ( 0 != strcmp(key, "mpool") ) { - continue; - } - mpool_requested = true; - ompi_info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); - if ( !flag ) { - continue; - } - - match_found = false; - for(item = opal_list_get_first(&mca_mpool_base_modules); - item != opal_list_get_end(&mca_mpool_base_modules); - item = opal_list_get_next(item)) - { - current = ((mca_mpool_base_selected_module_t *)item); - if(0 == strcmp(value, - current->mpool_module->mpool_component->mpool_version.mca_component_name)) - { - match_found = true; - if(NULL == current->mpool_module->mpool_register) - { - if(NULL != no_reg_function) - { - /* there was more than one requested mpool that lacks - * a registration function, so return failure */ - goto out; - } - no_reg_function = current; - } - else - { - has_reg_function[reg_module_num++] = current; - } - } - } - if(!match_found) - { - /* one of the keys given to us by the user did not match any - * mpools, so return an error */ - goto out; - } - } - } 
-#endif /* defined(TODO_BTL_GB) */ - - if(NULL == no_reg_function && 0 == reg_module_num) - { - if(!mpool_requested) - { - /* if the info argument was NULL and there were no useable mpools - * or there user provided info object but did not specifiy a "mpool" key, - * just malloc the memory and return it */ - mem = malloc(size); - goto out; - } - - /* the user passed info but we were not able to use any of the mpools - * specified */ - goto out; + mpool = mca_mpool_base_module_lookup (hints); + if (NULL != mpool) { + mem = mpool->mpool_alloc (mpool, size, 0, 0); } - for(i = -1; i < reg_module_num; i++) { - if(-1 == i) { - if(NULL != no_reg_function) - mpool = no_reg_function->mpool_module; - else - continue; - } else { - mpool = has_reg_function[i]->mpool_module; - } - - if(NULL == mem) { - mem = mpool->mpool_alloc(mpool, size, 0, MCA_MPOOL_FLAGS_PERSIST, - ®istration); - if(NULL == mem) { - if(mpool_requested) - goto out; - continue; - } - mpool_tree_item->key = mem; - mpool_tree_item->mpools[mpool_tree_item->count] = mpool; - mpool_tree_item->regs[mpool_tree_item->count++] = registration; - } else { - if(mpool->mpool_register(mpool, mem, size, MCA_MPOOL_FLAGS_PERSIST, - MCA_MPOOL_ACCESS_ANY, ®istration) != OPAL_SUCCESS) { - if(mpool_requested) { - unregister_tree_item(mpool_tree_item); - goto out; - } - continue; - } - mpool_tree_item->mpools[mpool_tree_item->count] = mpool; - mpool_tree_item->regs[mpool_tree_item->count++] = registration; - } - } - - if(NULL == mem) { + if (NULL == mem) { + /* fall back on malloc */ mem = malloc(size); - goto out; + + mca_mpool_base_tree_item_put (mpool_tree_item); + } else { + mpool_tree_item->mpool = mpool; + mca_mpool_base_tree_insert (mpool_tree_item); } - mca_mpool_base_tree_insert(mpool_tree_item); - mpool_tree_item = NULL; /* prevent it to be deleted below */ -out: - if(mpool_tree_item) - mca_mpool_base_tree_item_put(mpool_tree_item); - - if(has_reg_function) - free(has_reg_function); - return mem; } diff --git 
a/opal/mca/mpool/base/mpool_base_default.c b/opal/mca/mpool/base/mpool_base_default.c new file mode 100644 index 0000000000..3114c11cd0 --- /dev/null +++ b/opal/mca/mpool/base/mpool_base_default.c @@ -0,0 +1,85 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/constants.h" +#include "opal/util/sys_limits.h" + +static void *mca_mpool_default_alloc (mca_mpool_base_module_t *mpool, size_t size, + size_t align, uint32_t flags) +{ +#if HAVE_POSIX_MEMALIGN + void *addr = NULL; + + (void) posix_memalign (&addr, align, size); + return addr; +#else + void *addr, *ret; + + addr = malloc (size + align + sizeof (void *)); + ret = OPAL_ALIGN_PTR((intptr_t) addr + 8, align, void *); + *((void **) ret - 1) = addr; + return ret; +#endif +} + +static void *mca_mpool_default_realloc (mca_mpool_base_module_t *mpool, void *addr, size_t size) +{ +#if HAVE_POSIX_MEMALIGN + return realloc (addr, size); +#else + if (NULL != addr) { + void *base = *((void **) addr - 1); + void *ptr = realloc (base, size + (intptr_t) addr - (intptr_t) - size); + void *ret = (void *)((intptr_t) ptr + (intptr_t) addr - (intptr_t) - size); + *((void **) ret - 1) = ptr; + return ret; + } else { + return mca_mpool_default_alloc (mpool, size, 8, 0); + } +#endif +} + +static void mca_mpool_default_free (mca_mpool_base_module_t *mpool, void *addr) +{ +#if HAVE_POSIX_MEMALIGN + free (addr); +#else + if (NULL != addr) { + void *base = *((void **) addr - 1); + free (base); + } +#endif +} + +static void mca_mpool_default_finalize (struct mca_mpool_base_module_t *mpool) +{ +} + +static mca_mpool_base_module_t 
mca_mpool_malloc_module = { + .mpool_alloc = mca_mpool_default_alloc, + .mpool_realloc = mca_mpool_default_realloc, + .mpool_free = mca_mpool_default_free, + .mpool_finalize = mca_mpool_default_finalize, + .flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM, +}; + +mca_mpool_base_module_t *mca_mpool_base_default_module = &mca_mpool_malloc_module; diff --git a/opal/mca/mpool/base/mpool_base_frame.c b/opal/mca/mpool/base/mpool_base_frame.c index 9545543ce8..1c3002d587 100644 --- a/opal/mca/mpool/base/mpool_base_frame.c +++ b/opal/mca/mpool/base/mpool_base_frame.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -13,6 +14,8 @@ * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,9 +33,7 @@ #include "opal/mca/mca.h" #include "opal/mca/base/base.h" -#include "opal/memoryhooks/memory.h" #include "opal/mca/mpool/base/base.h" -#include "mpool_base_mem_cb.h" #include "opal/constants.h" #include "opal/util/sys_limits.h" @@ -48,13 +49,33 @@ * Global variables */ -/* whether we actually used the mem hooks or not */ -int mca_mpool_base_used_mem_hooks = 0; - -uint32_t mca_mpool_base_page_size = 0; -uint32_t mca_mpool_base_page_size_log = 0; - opal_list_t mca_mpool_base_modules = {{0}}; +static char *mca_mpool_base_default_hints; + +int mca_mpool_base_default_priority = 50; + +OBJ_CLASS_INSTANCE(mca_mpool_base_selected_module_t, opal_list_item_t, NULL, NULL); + +static int mca_mpool_base_register (mca_base_register_flag_t flags) +{ + mca_mpool_base_default_hints = NULL; + (void) mca_base_var_register ("opal", "mpool", "base", "default_hints", + "Hints to use when selecting the default memory pool", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, + MCA_BASE_VAR_FLAG_INTERNAL, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_mpool_base_default_hints); + + mca_mpool_base_default_priority = 50; + (void) mca_base_var_register ("opal", "mpool", "base", "default_priority", + "Priority of the default mpool module", + MCA_BASE_VAR_TYPE_INT, NULL, 0, + MCA_BASE_VAR_FLAG_INTERNAL, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_mpool_base_default_priority); + + return OPAL_SUCCESS; +} /** * Function for finding and opening either all MCA components, or the one @@ -69,15 +90,14 @@ static int mca_mpool_base_open(mca_base_open_flag_t flags) return OPAL_ERROR; } + if (mca_mpool_base_default_hints) { + mca_mpool_base_default_module = mca_mpool_base_module_lookup (mca_mpool_base_default_hints); + } + /* Initialize the list so that in mca_mpool_base_close(), we can iterate over it (even if it's empty, as in the case of opal_info) */ - OBJ_CONSTRUCT(&mca_mpool_base_modules, opal_list_t); - /* get the page size for 
this architecture*/ - mca_mpool_base_page_size = opal_getpagesize(); - mca_mpool_base_page_size_log = my_log2(mca_mpool_base_page_size); - /* setup tree for tracking MPI_Alloc_mem */ mca_mpool_base_tree_init(); @@ -88,12 +108,6 @@ static int mca_mpool_base_close(void) { opal_list_item_t *item; mca_mpool_base_selected_module_t *sm; - int32_t modules_length; - - /* Need the initial length in order to know if some of the initializations - * are done in the open function. - */ - modules_length = opal_list_get_size(&mca_mpool_base_modules); /* Finalize all the mpool components and free their list items */ @@ -115,15 +129,8 @@ static int mca_mpool_base_close(void) OMPI RTE program, or [possibly] multiple if this is opal_info) */ (void) mca_base_framework_components_close(&opal_mpool_base_framework, NULL); - /* deregister memory free callback */ - if( (modules_length > 0) && mca_mpool_base_used_mem_hooks && - 0 != (OPAL_MEMORY_FREE_SUPPORT & opal_mem_hooks_support_level())) { - opal_mem_hooks_unregister_release(mca_mpool_base_mem_cb); - } - /* All done */ - return OPAL_SUCCESS; } -MCA_BASE_FRAMEWORK_DECLARE(opal, mpool, NULL, NULL, mca_mpool_base_open, +MCA_BASE_FRAMEWORK_DECLARE(opal, mpool, "Memory pools", mca_mpool_base_register, mca_mpool_base_open, mca_mpool_base_close, mca_mpool_base_static_components, 0); diff --git a/opal/mca/mpool/base/mpool_base_init.c b/opal/mca/mpool/base/mpool_base_init.c deleted file mode 100644 index 85c01f301a..0000000000 --- a/opal/mca/mpool/base/mpool_base_init.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/mpool/base/base.h" - -OBJ_CLASS_INSTANCE(mca_mpool_base_selected_module_t, opal_list_item_t, NULL, NULL); -static bool mca_mpool_enable_progress_threads = true; -static bool mca_mpool_enable_mpi_thread_multiple = true; - -/** - * Function for weeding out mpool modules that don't want to run. - * - * Call the init function on all available components to find out if they - * want to run. Select all components that don't fail. Failing modules - * will be closed and unloaded. The selected modules will be returned - * to the caller in a opal_list_t. - */ -int mca_mpool_base_init(bool enable_progress_threads, bool enable_mpi_thread_multiple) -{ - mca_mpool_enable_progress_threads = enable_progress_threads; - mca_mpool_enable_mpi_thread_multiple = enable_mpi_thread_multiple; - return OPAL_SUCCESS; -} - diff --git a/opal/mca/mpool/base/mpool_base_lookup.c b/opal/mca/mpool/base/mpool_base_lookup.c index c36ff2b3bb..fa0e0ce34a 100644 --- a/opal/mca/mpool/base/mpool_base_lookup.c +++ b/opal/mca/mpool/base/mpool_base_lookup.c @@ -1,4 +1,4 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. * Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. All rights reserved. + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -30,126 +30,47 @@ #include "opal/mca/base/base.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" -#include "opal/runtime/opal_params.h" #include "opal/mca/mpool/mpool.h" #include "opal/mca/mpool/base/base.h" -#include "opal/memoryhooks/memory.h" -#include "mpool_base_mem_cb.h" -mca_mpool_base_component_t* mca_mpool_base_component_lookup(const char* name) +mca_mpool_base_component_t* mca_mpool_base_component_lookup(const char *name) { + mca_base_component_list_item_t *cli; + /* Traverse the list of available modules; call their init functions. */ - opal_list_item_t* item; - for (item = opal_list_get_first(&opal_mpool_base_framework.framework_components); - item != opal_list_get_end(&opal_mpool_base_framework.framework_components); - item = opal_list_get_next(item)) { - mca_base_component_list_item_t *cli = - (mca_base_component_list_item_t *) item; - mca_mpool_base_component_t* component = - (mca_mpool_base_component_t *) cli->cli_component; - if(strcmp(component->mpool_version.mca_component_name, name) == 0) { - return component; - } + OPAL_LIST_FOREACH(cli, &opal_mpool_base_framework.framework_components, mca_base_component_list_item_t) { + mca_mpool_base_component_t* component = (mca_mpool_base_component_t *) cli->cli_component; + if (strcmp(component->mpool_version.mca_component_name, name) == 0) { + return component; + } } + return NULL; } -mca_mpool_base_module_t* mca_mpool_base_module_create( - const char* name, - void* user_data, - struct mca_mpool_base_resources_t* resources) + +mca_mpool_base_module_t *mca_mpool_base_module_lookup (const char *hints) { - mca_mpool_base_component_t* component = NULL; - mca_mpool_base_module_t* module = NULL; + mca_mpool_base_module_t *best_module = mca_mpool_base_default_module; mca_base_component_list_item_t *cli; - mca_mpool_base_selected_module_t *sm; + int best_priority = mca_mpool_base_default_priority; + int rc; OPAL_LIST_FOREACH(cli, 
&opal_mpool_base_framework.framework_components, mca_base_component_list_item_t) { - component = (mca_mpool_base_component_t *) cli->cli_component; - if(0 == strcmp(component->mpool_version.mca_component_name, name)) { - module = component->mpool_init(resources); - break; - } + mca_mpool_base_component_t *component = (mca_mpool_base_component_t *) cli->cli_component; + mca_mpool_base_module_t *module; + int priority; + + rc = component->mpool_query (hints, &priority, &module); + if (OPAL_SUCCESS == rc) { + if (priority > best_priority) { + best_priority = priority; + best_module = module; + } + } } - if ( NULL == module ) { - return NULL; - } - sm = OBJ_NEW(mca_mpool_base_selected_module_t); - sm->mpool_component = component; - sm->mpool_module = module; - sm->user_data = user_data; - sm->mpool_resources = resources; - opal_list_append(&mca_mpool_base_modules, (opal_list_item_t*) sm); - /* on the very first creation of a module we init the memory - callback */ - if (opal_list_get_size(&mca_mpool_base_modules) == 1) { - /* Default to not using memory hooks */ - int use_mem_hooks = 0; - - /* Use the memory hooks if leave_pinned or - leave_pinned_pipeline is enabled (note that either of these - leave_pinned variables may have been set by a user MCA - param or elsewhere in the code base). Yes, we could have - coded this more succinctly, but this is more clear. Do not - check memory hooks if the mpool explicity asked us not to. 
*/ - if ((opal_leave_pinned > 0 || opal_leave_pinned_pipeline) && - !(module->flags & MCA_MPOOL_FLAGS_NO_HOOKS)) { - use_mem_hooks = 1; - } - - if (use_mem_hooks) { - if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == - ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & - opal_mem_hooks_support_level())) { - opal_mem_hooks_register_release(mca_mpool_base_mem_cb, NULL); - } else { - opal_show_help("help-mpool-base.txt", "leave pinned failed", - true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - opal_proc_local_get()->proc_hostname); - return NULL; - } - - /* Set this to true so that mpool_base_close knows to - cleanup */ - mca_mpool_base_used_mem_hooks = 1; - } - } - return module; -} - - -mca_mpool_base_module_t* mca_mpool_base_module_lookup(const char* name) -{ - mca_mpool_base_selected_module_t *mli; - - OPAL_LIST_FOREACH(mli, &mca_mpool_base_modules, mca_mpool_base_selected_module_t) { - if(0 == strcmp(mli->mpool_component->mpool_version.mca_component_name, - name)) { - return mli->mpool_module; - } - } - - return NULL; -} - - -int mca_mpool_base_module_destroy(mca_mpool_base_module_t *module) -{ - mca_mpool_base_selected_module_t *sm, *next; - - OPAL_LIST_FOREACH_SAFE(sm, next, &mca_mpool_base_modules, mca_mpool_base_selected_module_t) { - if (module == sm->mpool_module) { - opal_list_remove_item(&mca_mpool_base_modules, (opal_list_item_t*)sm); - if (NULL != sm->mpool_module->mpool_finalize) { - sm->mpool_module->mpool_finalize(sm->mpool_module); - } - OBJ_RELEASE(sm); - return OPAL_SUCCESS; - } - } - - return OPAL_ERR_NOT_FOUND; + return best_module; } diff --git a/opal/mca/mpool/base/mpool_base_tree.c b/opal/mca/mpool/base/mpool_base_tree.c index c59ec01c51..5f30620062 100644 --- a/opal/mca/mpool/base/mpool_base_tree.c +++ b/opal/mca/mpool/base/mpool_base_tree.c @@ -23,10 +23,6 @@ * * $HEADER$ */ -/** - * @file - * Description of the Registration Cache framework - */ #include "opal_config.h" diff --git 
a/opal/mca/mpool/base/mpool_base_tree.h b/opal/mca/mpool/base/mpool_base_tree.h index 8cdf071f85..2a31175a77 100644 --- a/opal/mca/mpool/base/mpool_base_tree.h +++ b/opal/mca/mpool/base/mpool_base_tree.h @@ -28,6 +28,7 @@ #define MCA_MPOOL_BASE_TREE_MAX 8 #include "opal/mca/mca.h" #include "opal/mca/mpool/mpool.h" +#include "opal/mca/rcache/rcache.h" BEGIN_C_DECLS @@ -46,8 +47,9 @@ struct mca_mpool_base_tree_item_t size_t num_bytes; /**< the number of bytes in this alloc, only for debugging reporting with mpi_show_mpi_alloc_mem_leaks */ - mca_mpool_base_module_t* mpools[MCA_MPOOL_BASE_TREE_MAX]; /**< the mpools */ - mca_mpool_base_registration_t* regs[MCA_MPOOL_BASE_TREE_MAX]; /**< the registrations */ + mca_mpool_base_module_t *mpool; + mca_rcache_base_module_t *rcaches[MCA_MPOOL_BASE_TREE_MAX]; /**< the registration caches */ + mca_rcache_base_registration_t *regs[MCA_MPOOL_BASE_TREE_MAX]; /**< the registrations */ uint8_t count; /**< length of the mpools/regs array */ }; typedef struct mca_mpool_base_tree_item_t mca_mpool_base_tree_item_t; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm.h b/opal/mca/mpool/gpusm/mpool_gpusm.h deleted file mode 100644 index 537c95108a..0000000000 --- a/opal/mca/mpool/gpusm/mpool_gpusm.h +++ /dev/null @@ -1,105 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. 
- * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_MPOOL_GPUSM_H -#define MCA_MPOOL_GPUSM_H - -#include "opal_config.h" -#include "opal/class/opal_list.h" -#include "opal/mca/mpool/mpool.h" - -BEGIN_C_DECLS - -#define MEMHANDLE_SIZE 8 -#define EVTHANDLE_SIZE 8 -struct mca_mpool_gpusm_registration_t { - mca_mpool_base_registration_t base; - uint64_t memHandle[MEMHANDLE_SIZE]; /* CUipcMemHandle */ - uint64_t evtHandle[EVTHANDLE_SIZE]; /* CUipcEventHandle */ - uintptr_t event; /* CUevent */ -}; -typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); - -struct mca_mpool_gpusm_component_t { - mca_mpool_base_component_t super; -}; -typedef struct mca_mpool_gpusm_component_t mca_mpool_gpusm_component_t; - -OPAL_DECLSPEC extern mca_mpool_gpusm_component_t mca_mpool_gpusm_component; - -struct mca_mpool_base_resources_t { - void *reg_data; - size_t sizeof_reg; - int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg); - int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg); -}; -typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; - -struct mca_mpool_gpusm_module_t { - mca_mpool_base_module_t super; - struct mca_mpool_base_resources_t resources; - opal_free_list_t reg_list; -}; typedef struct mca_mpool_gpusm_module_t mca_mpool_gpusm_module_t; - -/* - * Initializes the mpool module. 
- */ -void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t *mpool); - -/** - * register block of memory - */ -int mca_mpool_gpusm_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); - -/** - * deregister memory - */ -int mca_mpool_gpusm_deregister(mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg); - -/** - * find registration for a given block of memory - */ -int mca_mpool_gpusm_find(struct mca_mpool_base_module_t* mpool, void* addr, - size_t size, mca_mpool_base_registration_t **reg); - -/** - * finalize mpool - */ -void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_mpool_gpusm_ft_event(int state); - -END_C_DECLS -#endif diff --git a/opal/mca/mpool/grdma/mpool_grdma.h b/opal/mca/mpool/grdma/mpool_grdma.h deleted file mode 100644 index 1ddbd139e0..0000000000 --- a/opal/mca/mpool/grdma/mpool_grdma.h +++ /dev/null @@ -1,160 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_MPOOL_OPENIB_H -#define MCA_MPOOL_OPENIB_H - -#include "opal_config.h" -#include "opal/class/opal_list.h" -#include "opal/mca/event/event.h" -#include "opal/mca/mpool/mpool.h" -#if HAVE_SYS_MMAN_H -#include <sys/mman.h> -#endif - -BEGIN_C_DECLS - -#define MCA_MPOOL_GRDMA_NAME_MAX 256 - -struct mca_mpool_grdma_pool_t { - opal_list_item_t super; - char *pool_name; - opal_list_t lru_list; - opal_list_t gc_list; - struct mca_rcache_base_module_t *rcache; -}; -typedef struct mca_mpool_grdma_pool_t mca_mpool_grdma_pool_t; - -OBJ_CLASS_DECLARATION(mca_mpool_grdma_pool_t); - -struct mca_mpool_grdma_component_t { - mca_mpool_base_component_t super; - opal_list_t pools; - char *rcache_name; - bool print_stats; - int leave_pinned; -}; -typedef struct mca_mpool_grdma_component_t mca_mpool_grdma_component_t; - -OPAL_DECLSPEC extern mca_mpool_grdma_component_t mca_mpool_grdma_component; - -struct mca_mpool_grdma_module_t; - -struct mca_mpool_base_resources_t { - char *pool_name; - void *reg_data; - size_t sizeof_reg; - int (*register_mem)(void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg); - int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg); -}; -typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; - -struct mca_mpool_grdma_module_t { - mca_mpool_base_module_t super; - struct mca_mpool_base_resources_t resources; - mca_mpool_grdma_pool_t *pool; - opal_free_list_t reg_list; - uint32_t stat_cache_hit; - uint32_t stat_cache_miss; - uint32_t stat_evicted; - uint32_t stat_cache_found; - uint32_t stat_cache_notfound; -}; -typedef struct mca_mpool_grdma_module_t mca_mpool_grdma_module_t; - -/* - * Initializes the mpool module. - */ -void mca_mpool_grdma_module_init(mca_mpool_grdma_module_t *mpool, mca_mpool_grdma_pool_t *pool); - -/* - * Returns base address of shared memory mapping.
- */ -void *mca_mpool_grdma_base(mca_mpool_base_module_t *mpool); - -/** - * Allocate block of registered memory. - */ -void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size, - size_t align, uint32_t flags, - mca_mpool_base_registration_t** registration); - -/** - * realloc block of registered memory - */ -void* mca_mpool_grdma_realloc( mca_mpool_base_module_t *mpool, void* addr, - size_t size, mca_mpool_base_registration_t** registration); - -/** - * register block of memory - */ -int mca_mpool_grdma_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); - -/** - * deregister memory - */ -int mca_mpool_grdma_deregister(mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg); - -/** - * free memory allocated by alloc function - */ -void mca_mpool_grdma_free(mca_mpool_base_module_t *mpool, void * addr, - mca_mpool_base_registration_t *reg); - -/** - * find registration for a given block of memory - */ -int mca_mpool_grdma_find(struct mca_mpool_base_module_t* mpool, void* addr, - size_t size, mca_mpool_base_registration_t **reg); - -/** - * unregister all registration covering the block of memory - */ -int mca_mpool_grdma_release_memory(mca_mpool_base_module_t* mpool, void *base, - size_t size); - -/** - * finalize mpool - */ -void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_mpool_grdma_ft_event(int state); - -/** - * evict one unused registration from the mpool's lru. 
- * @return true on success, false on failure - */ -bool mca_mpool_grdma_evict (struct mca_mpool_base_module_t *mpool); - -END_C_DECLS -#endif diff --git a/opal/mca/mpool/grdma/mpool_grdma_module.c b/opal/mca/mpool/grdma/mpool_grdma_module.c deleted file mode 100644 index b33b769bf1..0000000000 --- a/opal/mca/mpool/grdma/mpool_grdma_module.c +++ /dev/null @@ -1,592 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2007 Mellanox Technologies. All rights reserved. - * Copyright (c) 2010 IBM Corporation. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 -#include "opal_config.h" - -#include <errno.h> -#include <string.h> -#include <stdlib.h> - -#include "opal/align.h" - -#include "opal/util/proc.h" -#if OPAL_CUDA_GDR_SUPPORT -#include "opal/mca/common/cuda/common_cuda.h" -#endif /* OPAL_CUDA_GDR_SUPPORT */ -#include "opal/mca/rcache/rcache.h" -#include "opal/mca/rcache/base/base.h" - -#include "opal/mca/mpool/base/base.h" -#include "mpool_grdma.h" - -static inline bool registration_is_cacheable(mca_mpool_base_registration_t *reg) -{ - return (mca_mpool_grdma_component.leave_pinned && - !(reg->flags & - (MCA_MPOOL_FLAGS_CACHE_BYPASS | - MCA_MPOOL_FLAGS_PERSIST | - MCA_MPOOL_FLAGS_INVALID))); -} - -#if OPAL_CUDA_GDR_SUPPORT -static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size); -#endif /* OPAL_CUDA_GDR_SUPPORT */ -static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool) -{ - memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super)); - - OBJ_CONSTRUCT(&pool->lru_list, opal_list_t); - OBJ_CONSTRUCT(&pool->gc_list, opal_list_t); - - pool->rcache = mca_rcache_base_module_create(mca_mpool_grdma_component.rcache_name); -} - -static void mca_mpool_grdma_pool_destructor (mca_mpool_grdma_pool_t *pool) -{ - OBJ_DESTRUCT(&pool->lru_list); - OBJ_DESTRUCT(&pool->gc_list); - - free (pool->pool_name); -} - -OBJ_CLASS_INSTANCE(mca_mpool_grdma_pool_t, opal_list_item_t, - mca_mpool_grdma_pool_contructor, - mca_mpool_grdma_pool_destructor); - -/* - * Initializes the mpool module. - */ -void mca_mpool_grdma_module_init(mca_mpool_grdma_module_t* mpool, mca_mpool_grdma_pool_t *pool) -{ - OBJ_RETAIN(pool); - mpool->pool = pool; - - mpool->super.mpool_component = &mca_mpool_grdma_component.super; - mpool->super.mpool_base = NULL; /* no base ..
*/ - mpool->super.mpool_alloc = mca_mpool_grdma_alloc; - mpool->super.mpool_realloc = mca_mpool_grdma_realloc; - mpool->super.mpool_free = mca_mpool_grdma_free; - mpool->super.mpool_register = mca_mpool_grdma_register; - mpool->super.mpool_find = mca_mpool_grdma_find; - mpool->super.mpool_deregister = mca_mpool_grdma_deregister; - mpool->super.mpool_release_memory = mca_mpool_grdma_release_memory; - mpool->super.mpool_finalize = mca_mpool_grdma_finalize; - mpool->super.mpool_ft_event = mca_mpool_grdma_ft_event; - mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM; - mpool->super.rcache = pool->rcache; - - mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0; - mpool->stat_cache_found = mpool->stat_cache_notfound = 0; - - OBJ_CONSTRUCT(&mpool->reg_list, opal_free_list_t); - opal_free_list_init (&mpool->reg_list, mpool->resources.sizeof_reg, - opal_cache_line_size, - OBJ_CLASS(mca_mpool_base_registration_t), - 0, opal_cache_line_size, 0, -1, 32, NULL, 0, - NULL, NULL, NULL); -} - -static inline int dereg_mem(mca_mpool_base_registration_t *reg) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) reg->mpool; - int rc; - - if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) - reg->mpool->rcache->rcache_delete(reg->mpool->rcache, reg); - - /* Drop the rcache lock before deregistring the memory */ - OPAL_THREAD_UNLOCK(&reg->mpool->rcache->lock); - rc = mpool_grdma->resources.deregister_mem(mpool_grdma->resources.reg_data, - reg); - OPAL_THREAD_LOCK(&reg->mpool->rcache->lock); - - if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { - opal_free_list_return (&mpool_grdma->reg_list, - (opal_free_list_item_t *) reg); - } - - return rc; -} - -/** - * allocate function - */ -void* mca_mpool_grdma_alloc(mca_mpool_base_module_t *mpool, size_t size, - size_t align, uint32_t flags, mca_mpool_base_registration_t **reg) -{ - void *base_addr, *addr; - - if(0 == align) - align = mca_mpool_base_page_size; - -#if OPAL_CUDA_SUPPORT - /* CUDA cannot handle registering
overlapping regions, so make - * sure each region is page sized and page aligned. */ - align = mca_mpool_base_page_size; - size = OPAL_ALIGN(size, mca_mpool_base_page_size, size_t); -#endif - -#ifdef HAVE_POSIX_MEMALIGN - if((errno = posix_memalign(&base_addr, align, size)) != 0) - return NULL; - - addr = base_addr; -#else - base_addr = malloc(size + align); - if(NULL == base_addr) - return NULL; - - addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t); -#endif - - if(OPAL_SUCCESS != mca_mpool_grdma_register(mpool, addr, size, flags, - MCA_MPOOL_ACCESS_ANY, reg)) { - free(base_addr); - return NULL; - } - (*reg)->alloc_base = (unsigned char *) base_addr; - - return addr; -} - -/* This function must be called with the rcache lock held */ -static inline void do_unregistration_gc(struct mca_mpool_base_module_t *mpool) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; - opal_list_item_t *item; - - /* Remove registration from garbage collection list - before deregistering it */ - while (NULL != - (item = opal_list_remove_first(&mpool_grdma->pool->gc_list))) { - dereg_mem((mca_mpool_base_registration_t *) item); - } -} - -static inline bool mca_mpool_grdma_evict_lru_local (mca_mpool_grdma_pool_t *pool) -{ - mca_mpool_grdma_module_t *mpool_grdma; - mca_mpool_base_registration_t *old_reg; - - old_reg = (mca_mpool_base_registration_t *) - opal_list_remove_first (&pool->lru_list); - if (NULL == old_reg) { - return false; - } - - mpool_grdma = (mca_mpool_grdma_module_t *) old_reg->mpool; - - (void) dereg_mem (old_reg); - - mpool_grdma->stat_evicted++; - - return true; -} - -enum { - MCA_MPOOL_GRDMA_MSG_EMPTY = 0, - MCA_MPOOL_GRDMA_MSG_NEED_DEREG = 1, - MCA_MPOOL_GRDMA_MSG_BUSY = 2, - MCA_MPOOL_GRDMA_MSG_COMPLETE = 3 -}; - -bool mca_mpool_grdma_evict (struct mca_mpool_base_module_t *mpool) -{ - return mca_mpool_grdma_evict_lru_local (((mca_mpool_grdma_module_t *) mpool)->pool); -} - -/* - * register memory - */ -int 
mca_mpool_grdma_register (mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, - mca_mpool_base_registration_t **reg) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; - const bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS); - const bool persist = !!(flags & MCA_MPOOL_FLAGS_PERSIST); - mca_mpool_base_registration_t *grdma_reg; - opal_free_list_item_t *item; - unsigned char *base, *bound; - int rc; - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - - *reg = NULL; - - /* if cache bypass is requested don't use the cache */ - base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log); - bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1), - mca_mpool_base_page_size_log); - if (!opal_list_is_empty (&mpool_grdma->pool->gc_list)) - do_unregistration_gc(mpool); - -#if OPAL_CUDA_GDR_SUPPORT - if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) { - size_t psize; - mca_common_cuda_get_address_range(&base, &psize, addr); - bound = base + psize - 1; - /* Check to see if this memory is in the cache and if it has been freed. If so, - * this call will boot it out of the cache. */ - check_for_cuda_freed_memory(mpool, base, psize); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - - /* look through existing regs if not persistent registration requested. 
- * Persistent registration are always registered and placed in the cache */ - if(!(bypass_cache || persist)) { - /* check to see if memory is registered */ - mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, &grdma_reg); - if (grdma_reg && !(flags & MCA_MPOOL_FLAGS_INVALID)) { - if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) { - access_flags |= grdma_reg->access_flags; - - if (0 != grdma_reg->ref_count) { - if (!(grdma_reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { - grdma_reg->mpool->rcache->rcache_delete(grdma_reg->mpool->rcache, grdma_reg); - } - - /* mark the registration to go away when it is deregistered */ - grdma_reg->flags |= MCA_MPOOL_FLAGS_INVALID | MCA_MPOOL_FLAGS_CACHE_BYPASS; - } else { - if (registration_is_cacheable (grdma_reg)) { - /* pull the item out of the lru */ - opal_list_remove_item (&mpool_grdma->pool->lru_list, (opal_list_item_t *) grdma_reg); - } - - (void) dereg_mem (grdma_reg); - } - } else { - *reg = grdma_reg; - if (0 == grdma_reg->ref_count) { - /* Leave pinned must be set for this to still be in the rcache. */ - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t *) grdma_reg); - } - - /* This segment fits fully within an existing segment. */ - mpool_grdma->stat_cache_hit++; - grdma_reg->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - return OPAL_SUCCESS; - } - } - - mpool_grdma->stat_cache_miss++; - - /* Unless explicitly requested by the caller always store the - * registration in the rcache. This will speed up the case where - * no leave pinned protocol is in use but the same segment is in - * use in multiple simultaneous transactions. We used to set bypass_cache - * here is !mca_mpool_grdma_component.leave_pinned. 
*/ - } - - item = opal_free_list_get (&mpool_grdma->reg_list); - if(NULL == item) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - grdma_reg = (mca_mpool_base_registration_t*)item; - - grdma_reg->mpool = mpool; - grdma_reg->base = base; - grdma_reg->bound = bound; - grdma_reg->flags = flags; - grdma_reg->access_flags = access_flags; -#if OPAL_CUDA_GDR_SUPPORT - if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) { - mca_common_cuda_get_buffer_id(grdma_reg); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ - - if (false == bypass_cache) { - rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0); - - if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_grdma->reg_list, item); - return rc; - } - } - - while (OPAL_ERR_OUT_OF_RESOURCE == - (rc = mpool_grdma->resources.register_mem(mpool_grdma->resources.reg_data, - base, bound - base + 1, grdma_reg))) { - /* try to remove one unused reg and retry */ - if (!mca_mpool_grdma_evict (mpool)) { - break; - } - } - - if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { - if (false == bypass_cache) { - mpool->rcache->rcache_delete(mpool->rcache, grdma_reg); - } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_grdma->reg_list, item); - return rc; - } - - *reg = grdma_reg; - (*reg)->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); - return OPAL_SUCCESS; -} - - -/** - * realloc function - */ -void* mca_mpool_grdma_realloc(mca_mpool_base_module_t *mpool, void *addr, - size_t size, mca_mpool_base_registration_t **reg) -{ - mca_mpool_base_registration_t *old_reg = *reg; - void *new_mem = mca_mpool_grdma_alloc(mpool, size, 0, old_reg->flags, reg); - memcpy(new_mem, addr, old_reg->bound - old_reg->base + 1); - mca_mpool_grdma_free(mpool, addr, old_reg); - - return new_mem; -} - -/** - * free function - */ -void 
mca_mpool_grdma_free(mca_mpool_base_module_t *mpool, void *addr, - mca_mpool_base_registration_t *registration) -{ - void *alloc_base = registration->alloc_base; - mca_mpool_grdma_deregister(mpool, registration); - free(alloc_base); -} - -int mca_mpool_grdma_find(struct mca_mpool_base_module_t *mpool, void *addr, - size_t size, mca_mpool_base_registration_t **reg) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; - unsigned char *base, *bound; - int rc; - - base = (unsigned char *) down_align_addr(addr, mca_mpool_base_page_size_log); - bound = (unsigned char *) up_align_addr((void*)((char*) addr + size - 1), - mca_mpool_base_page_size_log); - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - - rc = mpool->rcache->rcache_find(mpool->rcache, base, bound - base + 1, reg); - if(NULL != *reg && - (mca_mpool_grdma_component.leave_pinned || - ((*reg)->flags & MCA_MPOOL_FLAGS_PERSIST) || - ((*reg)->base == base && (*reg)->bound == bound))) { - assert(((void*)(*reg)->bound) >= addr); - if(0 == (*reg)->ref_count && - mca_mpool_grdma_component.leave_pinned) { - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t*)(*reg)); - } - mpool_grdma->stat_cache_found++; - (*reg)->ref_count++; - } else { - mpool_grdma->stat_cache_notfound++; - } - - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - return rc; -} - -int mca_mpool_grdma_deregister(struct mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - int rc = OPAL_SUCCESS; - assert(reg->ref_count > 0); - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - reg->ref_count--; - if(reg->ref_count > 0) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - return OPAL_SUCCESS; - } - - if (registration_is_cacheable(reg)) { - opal_list_append(&mpool_grdma->pool->lru_list, (opal_list_item_t *) reg); - } else { - rc = dereg_mem (reg); - } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have 
deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); - - return rc; -} - -#define GRDMA_MPOOL_NREGS 100 - -int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool, - void *base, size_t size) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i, rc = OPAL_SUCCESS; - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, base, size, - regs, GRDMA_MPOOL_NREGS); - - for(i = 0 ; i < reg_cnt ; ++i) { - regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID; - if (regs[i]->ref_count) { - /* memory is being freed, but there are registration in use that - * covers the memory. This can happen even in a correct program, - * but may also be an user error. We can't tell. Mark the - * registration as invalid. It will not be used any more and - * will be unregistered when ref_count will become zero */ - rc = OPAL_ERROR; /* tell caller that something was wrong */ - } else { - opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]); - opal_list_append(&mpool_grdma->pool->gc_list, (opal_list_item_t *) regs[i]); - } - } - } while(reg_cnt == GRDMA_MPOOL_NREGS); - - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - return rc; -} - -/* Make sure this registration request is not stale. In other words, ensure - * that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do - * kick out the regisrations and deregister. This function needs to be called - * with the mpool->rcache->lock held. 
*/ -#if OPAL_CUDA_GDR_SUPPORT -static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i, rc = OPAL_SUCCESS; - mca_mpool_base_registration_t *reg; - - mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg); - if (NULL == reg) { - return OPAL_SUCCESS; - } - - /* If not previously freed memory, just return 0 */ - if (!(mca_common_cuda_previously_freed_memory(reg))) { - return OPAL_SUCCESS; - } - - /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */ - - /* This memory has been freed. Find all registrations and delete */ - do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base, reg->bound - reg->base + 1, - regs, GRDMA_MPOOL_NREGS); - for(i = 0 ; i < reg_cnt ; ++i) { - regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID; - if (regs[i]->ref_count) { - opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d", - regs[i]->ref_count, regs[i]->base, regs[i]->bound, - (int) (regs[i]->bound - regs[i]->base + 1)); - /* memory is being freed, but there are registration in use that - * covers the memory. This can happen even in a correct program, - * but may also be an user error. We can't tell. Mark the - * registration as invalid. It will not be used any more and - * will be unregistered when ref_count will become zero */ - rc = OPAL_ERROR; /* tell caller that something was wrong */ - } else { - opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]); - /* Now deregister. Do not use gc_list as we need to kick this out now.
*/ - dereg_mem(regs[i]); - } - } - } while(reg_cnt == GRDMA_MPOOL_NREGS); - - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - /* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "After free");*/ - - return rc; -} -#endif /* OPAL_CUDA_GDR_SUPPORT */ - -void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool) -{ - mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool; - mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS]; - int reg_cnt, i; - - /* Statistic */ - if (true == mca_mpool_grdma_component.print_stats) { - opal_output(0, "%s grdma: stats " - "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - mpool_grdma->stat_cache_hit, mpool_grdma->stat_cache_miss, - mpool_grdma->stat_cache_found, mpool_grdma->stat_cache_notfound, - mpool_grdma->stat_evicted); - } - - OPAL_THREAD_LOCK(&mpool->rcache->lock); - - do_unregistration_gc(mpool); - - do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1, - regs, GRDMA_MPOOL_NREGS); - - for (i = 0 ; i < reg_cnt ; ++i) { - if (regs[i]->ref_count) { - regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */ - } else if (mca_mpool_grdma_component.leave_pinned) { - opal_list_remove_item(&mpool_grdma->pool->lru_list, - (opal_list_item_t *) regs[i]); - } - - (void) dereg_mem(regs[i]); - } - } while (reg_cnt == GRDMA_MPOOL_NREGS); - - OBJ_RELEASE(mpool_grdma->pool); - - OBJ_DESTRUCT(&mpool_grdma->reg_list); - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); - - /* this mpool was allocated by grdma_init in mpool_grdma_component.c */ - free(mpool); -} - -int mca_mpool_grdma_ft_event(int state) { - return OPAL_SUCCESS; -} diff --git a/opal/mca/mpool/hugepage/Makefile.am b/opal/mca/mpool/hugepage/Makefile.am new file mode 100644 index 0000000000..621574b1cb --- /dev/null +++ b/opal/mca/mpool/hugepage/Makefile.am @@ -0,0 +1,52 
@@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights +# reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +AM_CPPFLAGS = $(mpool_hugepage_CPPFLAGS) + +sources = mpool_hugepage_module.c mpool_hugepage_component.c + +if WANT_INSTALL_HEADERS +opaldir = $(opalincludedir)/$(subdir) +opal_HEADERS = mpool_hugepage.h +endif + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_opal_mpool_hugepage_DSO +component_noinst = +component_install = mca_mpool_hugepage.la +else +component_noinst = libmca_mpool_hugepage.la +component_install = +endif + +mcacomponentdir = $(opallibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_mpool_hugepage_la_SOURCES = $(sources) +mca_mpool_hugepage_la_LDFLAGS = -module -avoid-version +mca_mpool_hugepage_la_LIBADD = $(mpool_hugepage_LIBS) + +noinst_LTLIBRARIES = $(component_noinst) +libmca_mpool_hugepage_la_SOURCES = $(sources) +libmca_mpool_hugepage_la_LDFLAGS = -module -avoid-version +libmca_mpool_hugepage_la_LIBADD = $(mpool_hugepage_LIBS) diff --git a/opal/mca/mpool/hugepage/mpool_hugepage.h b/opal/mca/mpool/hugepage/mpool_hugepage.h new file mode 100644 index 0000000000..cd97711168 --- /dev/null +++ b/opal/mca/mpool/hugepage/mpool_hugepage.h @@ -0,0 +1,90 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_MPOOL_HUGEPAGE_H +#define MCA_MPOOL_HUGEPAGE_H + +#include "opal_config.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_free_list.h" +#include "opal/mca/event/event.h" +#include "opal/mca/mpool/mpool.h" +#include "opal/util/proc.h" +#include "opal/mca/allocator/allocator.h" +#include "opal/util/sys_limits.h" + +BEGIN_C_DECLS +struct mca_mpool_hugepage_module_t; +typedef struct mca_mpool_hugepage_module_t mca_mpool_hugepage_module_t; + +struct mca_mpool_hugepage_component_t { + mca_mpool_base_component_t super; + bool print_stats; + opal_list_t huge_pages; + mca_mpool_hugepage_module_t *modules; + int module_count; + unsigned long bytes_allocated; +}; +typedef struct mca_mpool_hugepage_component_t mca_mpool_hugepage_component_t; + +OPAL_DECLSPEC extern mca_mpool_hugepage_component_t mca_mpool_hugepage_component; + +struct mca_mpool_hugepage_module_t; + +struct mca_mpool_hugepage_hugepage_t { + /** opal list item superclass */ + opal_list_item_t super; + /** page size in bytes */ + unsigned long page_size; + /** path for mmapped files */ + char *path; + /** counter to help ensure unique file names for mmapped files */ + volatile int32_t count; + /** some platforms allow allocation of hugepages through mmap flags */ + int mmap_flags; +}; +typedef struct mca_mpool_hugepage_hugepage_t mca_mpool_hugepage_hugepage_t; + +OBJ_CLASS_DECLARATION(mca_mpool_hugepage_hugepage_t); + +struct mca_mpool_hugepage_module_t { + mca_mpool_base_module_t super; + mca_mpool_hugepage_hugepage_t *huge_page; + mca_allocator_base_module_t *allocator; + opal_mutex_t lock; + opal_rb_tree_t allocation_tree; +}; + +/* + * Initializes the mpool module. 
+ */ +int mca_mpool_hugepage_module_init (mca_mpool_hugepage_module_t *mpool, + mca_mpool_hugepage_hugepage_t *huge_page); + +void *mca_mpool_hugepage_seg_alloc (void *ctx, size_t *sizep); +void mca_mpool_hugepage_seg_free (void *ctx, void *addr); + +END_C_DECLS +#endif diff --git a/opal/mca/mpool/hugepage/mpool_hugepage_component.c b/opal/mca/mpool/hugepage/mpool_hugepage_component.c new file mode 100644 index 0000000000..b6e03937e4 --- /dev/null +++ b/opal/mca/mpool/hugepage/mpool_hugepage_component.c @@ -0,0 +1,366 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2016 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 +#include "opal_config.h" +#include "opal/mca/base/base.h" +#include "opal/runtime/opal_params.h" +#include "opal/mca/base/mca_base_pvar.h" +#include "opal/mca/mpool/base/base.h" +#include "opal/mca/allocator/base/base.h" +#include "mpool_hugepage.h" +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_MALLOC_H +#include +#endif +#ifdef HAVE_SYS_VFS_H +#include +#endif +#ifdef HAVE_SYS_MOUNT_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#ifdef HAVE_SYS_MMAN_H +#include +#endif + +#include + +/* + * Local functions + */ +static int mca_mpool_hugepage_open (void); +static int mca_mpool_hugepage_close (void); +static int mca_mpool_hugepage_register (void); +static int mca_mpool_hugepage_query (const char *hints, int *priority, + mca_mpool_base_module_t **module); +static void mca_mpool_hugepage_find_hugepages (void); + +static int mca_mpool_hugepage_priority; +static unsigned long mca_mpool_hugepage_page_size; + +mca_mpool_hugepage_component_t mca_mpool_hugepage_component = { + { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + .mpool_version ={ + MCA_MPOOL_BASE_VERSION_3_0_0, + + .mca_component_name = "hugepage", + MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION), + .mca_open_component = mca_mpool_hugepage_open, + .mca_close_component = mca_mpool_hugepage_close, + .mca_register_component_params = mca_mpool_hugepage_register, + }, + .mpool_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .mpool_query = mca_mpool_hugepage_query, + }, +}; + +/** + * component open/close/init function + */ + +static int mca_mpool_hugepage_register(void) +{ + mca_mpool_hugepage_priority = 50; + (void) mca_base_component_var_register (&mca_mpool_hugepage_component.super.mpool_version, + 
"priority", "Default priority of the hugepage mpool component " + "(default: 50)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_mpool_hugepage_priority); + + mca_mpool_hugepage_page_size = 1 << 21; + (void) mca_base_component_var_register (&mca_mpool_hugepage_component.super.mpool_version, + "page_size", "Default huge page size of the hugepage mpool component " + "(default: 2M)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL, + &mca_mpool_hugepage_page_size); + + mca_mpool_hugepage_component.bytes_allocated = 0; + (void) mca_base_component_pvar_register (&mca_mpool_hugepage_component.super.mpool_version, + "bytes_allocated", "Number of bytes currently allocated in the mpool " + "hugepage component", OPAL_INFO_LVL_3, MCA_BASE_PVAR_CLASS_SIZE, + MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, MCA_BASE_VAR_BIND_NO_OBJECT, + MCA_BASE_PVAR_FLAG_READONLY | MCA_BASE_PVAR_FLAG_CONTINUOUS, + NULL, NULL, NULL, &mca_mpool_hugepage_component.bytes_allocated); + + return OPAL_SUCCESS; +} + +static int mca_mpool_hugepage_open (void) +{ + mca_mpool_hugepage_module_t *hugepage_module; + mca_mpool_hugepage_hugepage_t *hp; + int module_index, rc; + + OBJ_CONSTRUCT(&mca_mpool_hugepage_component.huge_pages, opal_list_t); + mca_mpool_hugepage_find_hugepages (); + + if (0 == opal_list_get_size (&mca_mpool_hugepage_component.huge_pages)) { + return OPAL_SUCCESS; + } + + mca_mpool_hugepage_component.modules = (mca_mpool_hugepage_module_t *) + calloc (opal_list_get_size (&mca_mpool_hugepage_component.huge_pages), + sizeof (mca_mpool_hugepage_module_t)); + if (NULL == mca_mpool_hugepage_component.modules) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + module_index = 0; + OPAL_LIST_FOREACH(hp, &mca_mpool_hugepage_component.huge_pages, mca_mpool_hugepage_hugepage_t) { + hugepage_module = mca_mpool_hugepage_component.modules + module_index; + rc = mca_mpool_hugepage_module_init (hugepage_module, hp); + if (OPAL_SUCCESS != rc) { + 
continue; + } + module_index++; + } + + mca_mpool_hugepage_component.module_count = module_index; + + return OPAL_SUCCESS; +} + +static int mca_mpool_hugepage_close (void) +{ + OPAL_LIST_DESTRUCT(&mca_mpool_hugepage_component.huge_pages); + + for (int i = 0 ; i < mca_mpool_hugepage_component.module_count ; ++i) { + mca_mpool_hugepage_module_t *module = mca_mpool_hugepage_component.modules + i; + module->super.mpool_finalize (&module->super); + } + + free (mca_mpool_hugepage_component.modules); + mca_mpool_hugepage_component.modules = NULL; + + return OPAL_SUCCESS; +} + +static int page_compare (opal_list_item_t **a, opal_list_item_t **b) { + mca_mpool_hugepage_hugepage_t *pagea = (mca_mpool_hugepage_hugepage_t *) *a; + mca_mpool_hugepage_hugepage_t *pageb = (mca_mpool_hugepage_hugepage_t *) *b; + if (pagea->page_size > pageb->page_size) { + return 1; + } else if (pagea->page_size < pageb->page_size) { + return -1; + } + + return 0; +} + +static void mca_mpool_hugepage_find_hugepages (void) { + mca_mpool_hugepage_hugepage_t *hp; + FILE *fh; + char *path; + char buffer[1024]; + char *ctx, *tok; + + fh = fopen ("/proc/mounts", "r"); + if (NULL == fh) { + return; + } + + while (fgets (buffer, 1024, fh)) { + unsigned long page_size = 0; + + (void) strtok_r (buffer, " ", &ctx); + path = strtok_r (NULL, " ", &ctx); + tok = strtok_r (NULL, " ", &ctx); + + if (0 != strcmp (tok, "hugetlbfs")) { + continue; + } + + tok = strtok_r (NULL, " ", &ctx); + tok = strtok_r (tok, ",", &ctx); + + do { + if (0 == strncmp (tok, "pagesize", 8)) { + break; + } + tok = strtok_r (NULL, ",", &ctx); + } while (tok); + + if (!tok) { +#if HAVE_STATFS + struct statfs info; + + statfs (path, &info); + page_size = info.f_bsize; +#endif + } else { + sscanf (tok, "pagesize=%lu", &page_size); + } + + if (0 == page_size) { + /* could not get page size */ + continue; + } + + hp = OBJ_NEW(mca_mpool_hugepage_hugepage_t); + if (NULL == hp) { + break; + } + + hp->path = strdup (path); + hp->page_size = 
page_size; + + OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_INFO, opal_mpool_base_framework.framework_output, + "found huge page with size = %lu, path = %s, mmap flags = 0x%x", + hp->page_size, hp->path, hp->mmap_flags)); + + opal_list_append (&mca_mpool_hugepage_component.huge_pages, &hp->super); + } + + opal_list_sort (&mca_mpool_hugepage_component.huge_pages, page_compare); + + fclose (fh); +} + +static int mca_mpool_hugepage_query (const char *hints, int *priority_out, + mca_mpool_base_module_t **module) +{ + unsigned long page_size = 0; + char **hints_array; + int my_priority = mca_mpool_hugepage_priority; + int modifier; + char *tmp; + bool found = false; + + if (0 == mca_mpool_hugepage_component.module_count) { + return OPAL_ERR_NOT_AVAILABLE; + } + + if (hints) { + hints_array = opal_argv_split (hints, ','); + if (NULL == hints_array) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + for (int i = 0 ; hints_array[i] ; ++i) { + char *key = hints_array[i]; + char *value = NULL; + + if (NULL != (tmp = strchr (key, '='))) { + value = tmp + 1; + *tmp = '\0'; + } + + if (0 == strcasecmp ("mpool", key)) { + if (value && 0 == strcasecmp ("hugepage", value)) { + /* this mpool was requested by name */ + my_priority = 100; + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_mpool_base_framework.framework_output, + "hugepage mpool matches hint: %s=%s", key, value); + } else { + /* different mpool requested */ + my_priority = 0; + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_mpool_base_framework.framework_output, + "hugepage mpool does not match hint: %s=%s", key, value); + return OPAL_ERR_NOT_FOUND; + } + } + + if (0 == strcasecmp ("page_size", key)) { + page_size = strtoul (value, &tmp, 0); + if (*tmp) { + switch (*tmp) { + case 'g': + case 'G': + page_size *= 1024; + case 'm': + case 'M': + page_size *= 1024; + case 'k': + case 'K': + page_size *= 1024; + break; + default: + page_size = -1; + } + } + opal_output_verbose (MCA_BASE_VERBOSE_INFO, 
opal_mpool_base_framework.framework_output, + "hugepage mpool requested page size: %lu", page_size); + } + } + + opal_argv_free (hints_array); + } + + if (0 == page_size) { + /* use default huge page size */ + page_size = mca_mpool_hugepage_page_size; + if (my_priority < 100) { + /* take a priority hit if this mpool was not asked for by name */ + my_priority = 0; + } + opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_mpool_base_framework.framework_output, + "hugepage mpool did not match any hints: %s", hints); + } + + for (int i = 0 ; i < mca_mpool_hugepage_component.module_count ; ++i) { + mca_mpool_hugepage_module_t *hugepage_module = mca_mpool_hugepage_component.modules + i; + + if (hugepage_module->huge_page->page_size != page_size) { + continue; + } + + my_priority = (my_priority < 80) ? my_priority + 20 : 100; + + if (module) { + *module = &hugepage_module->super; + } + + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_mpool_base_framework.framework_output, + "matches page size hint. page size: %lu, path: %s, mmap flags: " + "0x%x", page_size, hugepage_module->huge_page->path, + hugepage_module->huge_page->mmap_flags); + found = true; + break; + } + + if (!found) { + opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_mpool_base_framework.framework_output, + "could not find page matching page request: %lu", page_size); + return OPAL_ERR_NOT_FOUND; + } + + if (priority_out) { + *priority_out = my_priority; + } + + return OPAL_SUCCESS; +} diff --git a/opal/mca/mpool/hugepage/mpool_hugepage_module.c b/opal/mca/mpool/hugepage/mpool_hugepage_module.c new file mode 100644 index 0000000000..634f242ffa --- /dev/null +++ b/opal/mca/mpool/hugepage/mpool_hugepage_module.c @@ -0,0 +1,255 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 +#include "opal_config.h" +#include "opal/align.h" +#include "mpool_hugepage.h" +#include +#include +#ifdef HAVE_MALLOC_H +#include +#endif +#include "opal/mca/mpool/base/base.h" +#include "opal/runtime/opal_params.h" +#include "opal/include/opal_stdint.h" +#include "opal/mca/allocator/base/base.h" + +#include +#include + + +static void *mca_mpool_hugepage_alloc (mca_mpool_base_module_t *mpool, size_t size, size_t align, + uint32_t flags); +static void *mca_mpool_hugepage_realloc (mca_mpool_base_module_t *mpool, void *addr, size_t size); +static void mca_mpool_hugepage_free (mca_mpool_base_module_t *mpool, void *addr); +static void mca_mpool_hugepage_finalize (mca_mpool_base_module_t *mpool); +static int mca_mpool_hugepage_ft_event (int state); + +static void mca_mpool_hugepage_hugepage_constructor (mca_mpool_hugepage_hugepage_t *huge_page) +{ + memset ((char *)huge_page + sizeof(huge_page->super), 0, sizeof (*huge_page) - sizeof (huge_page->super)); +} + +static void mca_mpool_hugepage_hugepage_destructor (mca_mpool_hugepage_hugepage_t *huge_page) +{ + free (huge_page->path); +} + +OBJ_CLASS_INSTANCE(mca_mpool_hugepage_hugepage_t, opal_list_item_t, + 
mca_mpool_hugepage_hugepage_constructor, + mca_mpool_hugepage_hugepage_destructor); + +static int mca_mpool_rb_hugepage_compare (void *key1, void *key2) +{ + if (key1 == key2) { + return 0; + } + + return (key1 < key2) ? -1 : 1; +} + +/* + * Initializes the mpool module. + */ +int mca_mpool_hugepage_module_init(mca_mpool_hugepage_module_t *mpool, + mca_mpool_hugepage_hugepage_t *huge_page) +{ + mca_allocator_base_component_t *allocator_component; + int rc; + + mpool->super.mpool_component = &mca_mpool_hugepage_component.super; + mpool->super.mpool_base = NULL; /* no base .. */ + mpool->super.mpool_alloc = mca_mpool_hugepage_alloc; + mpool->super.mpool_realloc = mca_mpool_hugepage_realloc; + mpool->super.mpool_free = mca_mpool_hugepage_free; + mpool->super.mpool_finalize = mca_mpool_hugepage_finalize; + mpool->super.mpool_ft_event = mca_mpool_hugepage_ft_event; + mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM; + + OBJ_CONSTRUCT(&mpool->lock, opal_mutex_t); + + mpool->huge_page = huge_page; + + /* use an allocator component to reduce waste when making small allocations */ + allocator_component = mca_allocator_component_lookup ("bucket"); + if (NULL == allocator_component) { + return OPAL_ERR_NOT_AVAILABLE; + } + + mpool->allocator = allocator_component->allocator_init (true, mca_mpool_hugepage_seg_alloc, + mca_mpool_hugepage_seg_free, mpool); + + OBJ_CONSTRUCT(&mpool->allocation_tree, opal_rb_tree_t); + rc = opal_rb_tree_init (&mpool->allocation_tree, mca_mpool_rb_hugepage_compare); + if (OPAL_SUCCESS != rc) { + OBJ_DESTRUCT(&mpool->allocation_tree); + return OPAL_ERR_NOT_AVAILABLE; + } + + return OPAL_SUCCESS; +} + +void *mca_mpool_hugepage_seg_alloc (void *ctx, size_t *sizep) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) ctx; + mca_mpool_hugepage_hugepage_t *huge_page = hugepage_module->huge_page; + size_t size = *sizep; + void *base = NULL; + char *path = NULL; + int flags = MAP_PRIVATE; + int fd = -1; + int rc; + + size = 
OPAL_ALIGN(size, huge_page->page_size, size_t); + + if (huge_page->path) { + int32_t count; + + count = opal_atomic_add_32 (&huge_page->count, 1); + + rc = asprintf (&path, "%s/hugepage.openmpi.%d.%d", huge_page->path, + getpid (), count); + if (0 > rc) { + return NULL; + } + + fd = open (path, O_RDWR | O_CREAT, 0600); + if (-1 == fd) { + free (path); + return NULL; + } + + if (0 != ftruncate (fd, size)) { + close (fd); + unlink (path); + free (path); + return NULL; + } + } else { +#if defined(MAP_ANONYMOUS) + flags |= MAP_ANONYMOUS; +#elif defined(MAP_ANON) + /* older versions of OS X do not define MAP_ANONYMOUS (10.9.x and older) */ + flags |= MAP_ANON; +#endif + } + + base = mmap (NULL, size, PROT_READ | PROT_WRITE, flags | huge_page->mmap_flags, fd, 0); + if (path) { + close (fd); + unlink (path); + free (path); + } + + if (MAP_FAILED == base) { + opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_mpool_base_framework.framework_verbose, + "could not allocate huge page(s). falling back on standard pages"); + /* fall back on regular pages */ + base = mmap (NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + + if (MAP_FAILED == base) { + return NULL; + } + + opal_mutex_lock (&hugepage_module->lock); + opal_rb_tree_insert (&hugepage_module->allocation_tree, base, (void *) (intptr_t) size); + opal_atomic_add (&mca_mpool_hugepage_component.bytes_allocated, (int64_t) size); + opal_mutex_unlock (&hugepage_module->lock); + + OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_mpool_base_framework.framework_verbose, + "allocated segment %p of size %lu bytes", base, size)); + + *sizep = size; + + return base; +} + +void mca_mpool_hugepage_seg_free (void *ctx, void *addr) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) ctx; + size_t size; + + opal_mutex_lock (&hugepage_module->lock); + + size = (size_t) (intptr_t) opal_rb_tree_find (&hugepage_module->allocation_tree, addr); + if (size > 0) { + opal_rb_tree_delete 
(&hugepage_module->allocation_tree, addr); + OPAL_OUTPUT_VERBOSE((MCA_BASE_VERBOSE_TRACE, opal_mpool_base_framework.framework_verbose, + "freeing segment %p of size %lu bytes", addr, size)); + munmap (addr, size); + opal_atomic_add (&mca_mpool_hugepage_component.bytes_allocated, -(int64_t) size); + } + + opal_mutex_unlock (&hugepage_module->lock); +} + +/** + * allocate function + */ +static void *mca_mpool_hugepage_alloc (mca_mpool_base_module_t *mpool, size_t size, + size_t align, uint32_t flags) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; + return hugepage_module->allocator->alc_alloc (hugepage_module->allocator, size, align); +} + +/** + * reallocate function + */ +static void *mca_mpool_hugepage_realloc (mca_mpool_base_module_t *mpool, void *addr, size_t size) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; + + return hugepage_module->allocator->alc_realloc (hugepage_module->allocator, addr, size); +} + +/** + * free function + */ +static void mca_mpool_hugepage_free (mca_mpool_base_module_t *mpool, void *addr) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; + + hugepage_module->allocator->alc_free (hugepage_module->allocator, addr); +} + +static void mca_mpool_hugepage_finalize (struct mca_mpool_base_module_t *mpool) +{ + mca_mpool_hugepage_module_t *hugepage_module = (mca_mpool_hugepage_module_t *) mpool; + + OBJ_DESTRUCT(&hugepage_module->lock); + + if (hugepage_module->allocator) { + (void) hugepage_module->allocator->alc_finalize (hugepage_module->allocator); + hugepage_module->allocator = NULL; + } +} + +static int mca_mpool_hugepage_ft_event (int state) { + return OPAL_SUCCESS; +} diff --git a/opal/mca/mpool/mpool.h b/opal/mca/mpool/mpool.h index a0f957438f..83ef196c72 100644 --- a/opal/mca/mpool/mpool.h +++ b/opal/mca/mpool/mpool.h @@ -29,138 +29,70 @@ #include "opal_config.h" #include "opal/mca/mca.h" #include 
"opal/class/opal_free_list.h" +#include "opal/mca/rcache/base/rcache_base_vma.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" -#define MCA_MPOOL_FLAGS_CACHE_BYPASS 0x01 -#define MCA_MPOOL_FLAGS_PERSIST 0x02 -#define MCA_MPOOL_FLAGS_MPI_ALLOC_MEM 0x04 -#define MCA_MPOOL_FLAGS_INVALID 0x08 -#define MCA_MPOOL_FLAGS_SO_MEM 0x10 -#define MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM 0x20 +#define MCA_MPOOL_ALLOC_FLAG_DEFAULT 0x00 +#define MCA_MPOOL_ALLOC_FLAG_USER 0x01 + +#define MCA_MPOOL_FLAGS_MPI_ALLOC_MEM 0x80 struct opal_info_t; - -#define MCA_MPOOL_FLAGS_CUDA_GPU_MEM 0x40 - -/* Only valid in mpool flags. Used to indicate that no external memory - * hooks (ptmalloc2, etc) are required. */ -#define MCA_MPOOL_FLAGS_NO_HOOKS 0x80 - -/* access flags */ -enum { - MCA_MPOOL_ACCESS_LOCAL_WRITE = 0x01, - MCA_MPOOL_ACCESS_REMOTE_READ = 0x02, - MCA_MPOOL_ACCESS_REMOTE_WRITE = 0x04, - MCA_MPOOL_ACCESS_REMOTE_ATOMIC = 0x08, - MCA_MPOOL_ACCESS_ANY = 0x0f, -}; - -struct mca_mpool_base_resources_t; - -struct mca_mpool_base_registration_t { - opal_free_list_item_t super; - struct mca_mpool_base_module_t *mpool; - unsigned char* base; - unsigned char* bound; - unsigned char* alloc_base; - int32_t ref_count; - uint32_t flags; - void *mpool_context; -#if OPAL_CUDA_GDR_SUPPORT - unsigned long long gpu_bufID; -#endif /* OPAL_CUDA_GDR_SUPPORT */ - int32_t access_flags; -}; - -typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t; - -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_base_registration_t); +struct mca_mpool_base_module_t; +typedef struct mca_mpool_base_module_t mca_mpool_base_module_t; /** - * component initialize + * component query function + * + * @param[in] hints memory pool hints in order of priority. this should + * be replaced by opal_info_t when the work to move + * info down to opal is complete. 
+ * @param[out] priority relative priority of this memory pool component + * @param[out] module best match module + * + * This function should parse the provided hints and return a relative priority + * of the component based on the number of hints matched. For example, if the + * hints are "page_size=2M,high-bandwidth" and a pool matches the page_size but + * not the high-bandwidth hint then the component should return a lower priority + * than if both matched but a higher priority than if a pool matches only the + * high-bandwidth hint. + * + * Memory pools should try to support at a minimum name=value but can define + * any additional keys. */ -typedef struct mca_mpool_base_module_t* (*mca_mpool_base_component_init_fn_t)( - struct mca_mpool_base_resources_t*); +typedef int (*mca_mpool_base_component_query_fn_t) (const char *hints, int *priority, + mca_mpool_base_module_t **module); /** * allocate function typedef */ -typedef void* (*mca_mpool_base_module_alloc_fn_t)( - struct mca_mpool_base_module_t* mpool, - size_t size, - size_t align, - uint32_t flags, - mca_mpool_base_registration_t** registration); +typedef void *(*mca_mpool_base_module_alloc_fn_t) (mca_mpool_base_module_t *mpool, + size_t size, size_t align, + uint32_t flags); /** - * realloc function typedef + * realloc function typedef */ -typedef void* (*mca_mpool_base_module_realloc_fn_t)( - struct mca_mpool_base_module_t* mpool, - void* addr, - size_t size, - mca_mpool_base_registration_t** registration); +typedef void *(*mca_mpool_base_module_realloc_fn_t) (mca_mpool_base_module_t *mpool, + void *addr, size_t size); /** * free function typedef */ -typedef void (*mca_mpool_base_module_free_fn_t)( - struct mca_mpool_base_module_t* mpool, - void *addr, - mca_mpool_base_registration_t* registration); - -/** - * register memory - */ -typedef int (*mca_mpool_base_module_register_fn_t)( - struct mca_mpool_base_module_t* mpool, - void * addr, - size_t size, - uint32_t flags, - int32_t access_flags, - 
mca_mpool_base_registration_t** registration); - -/** - * deregister memory - */ -typedef int (*mca_mpool_base_module_deregister_fn_t)( - struct mca_mpool_base_module_t* mpool, - mca_mpool_base_registration_t* registration); - -/** - * find registration in this memory pool - */ - -typedef int (*mca_mpool_base_module_find_fn_t) ( - struct mca_mpool_base_module_t* mpool, void* addr, size_t size, - mca_mpool_base_registration_t **reg); - -/** - * release registration - */ - -typedef int (*mca_mpool_base_module_release_fn_t) ( - struct mca_mpool_base_module_t* mpool, - mca_mpool_base_registration_t* registration); - - -/** - * release memory region - */ -typedef int (*mca_mpool_base_module_release_memory_fn_t) ( - struct mca_mpool_base_module_t* mpool, void *base, size_t size); +typedef void (*mca_mpool_base_module_free_fn_t) (mca_mpool_base_module_t *mpool, + void *addr); /** * if appropriate - returns base address of memory pool */ -typedef void* (*mca_mpool_base_module_address_fn_t)(struct mca_mpool_base_module_t* mpool); +typedef void* (*mca_mpool_base_module_address_fn_t) (mca_mpool_base_module_t *mpool); /** * finalize */ -typedef void (*mca_mpool_base_module_finalize_fn_t)(struct mca_mpool_base_module_t*); +typedef void (*mca_mpool_base_module_finalize_fn_t)(mca_mpool_base_module_t *mpool); /** @@ -176,10 +108,10 @@ typedef int (*mca_mpool_base_module_ft_event_fn_t)(int state); * and open/close/init functions. */ struct mca_mpool_base_component_2_0_0_t { - mca_base_component_t mpool_version; /**< version */ - mca_base_component_data_t mpool_data;/**< metadata */ + mca_base_component_t mpool_version; /**< version */ + mca_base_component_data_t mpool_data;/**< metadata */ - mca_mpool_base_component_init_fn_t mpool_init; /**< init function */ + mca_mpool_base_component_query_fn_t mpool_query; /**< query for matching pools */ }; /** * Convenience typedef. @@ -196,25 +128,19 @@ typedef struct mca_mpool_base_component_2_0_0_t mca_mpool_base_component_t; * details. 
 */ struct mca_mpool_base_module_t { - mca_mpool_base_component_t *mpool_component; /**< component stuct */ + mca_mpool_base_component_t *mpool_component; /**< component struct */ mca_mpool_base_module_address_fn_t mpool_base; /**< returns the base address */ mca_mpool_base_module_alloc_fn_t mpool_alloc; /**< allocate function */ mca_mpool_base_module_realloc_fn_t mpool_realloc; /**< reallocate function */ mca_mpool_base_module_free_fn_t mpool_free; /**< free function */ - mca_mpool_base_module_register_fn_t mpool_register; /**< register memory */ - mca_mpool_base_module_deregister_fn_t mpool_deregister; /**< deregister memory */ - mca_mpool_base_module_find_fn_t mpool_find; /**< find regisrations in the cache */ - mca_mpool_base_module_release_fn_t mpool_release; /**< release a registration from the cache */ - mca_mpool_base_module_release_memory_fn_t mpool_release_memory; /**< release memor region from the cache */ + mca_mpool_base_module_finalize_fn_t mpool_finalize; /**< finalize */ mca_mpool_base_module_ft_event_fn_t mpool_ft_event; /**< ft_event */ - struct mca_rcache_base_module_t *rcache; /* the rcache associated with this mpool */ uint32_t flags; /**< mpool flags */ + + size_t mpool_allocation_unit; /**< allocation unit used by this mpool */ + char *mpool_name; /**< name of this pool module */ }; -/** - * Convenience typedef - */ -typedef struct mca_mpool_base_module_t mca_mpool_base_module_t; /** @@ -237,7 +163,7 @@ typedef struct mca_mpool_base_module_t mca_mpool_base_module_t; * @retval pointer to the allocated memory * @retval NULL on failure */ -OPAL_DECLSPEC void * mca_mpool_base_alloc(size_t size, struct opal_info_t * info); +OPAL_DECLSPEC void * mca_mpool_base_alloc(size_t size, struct opal_info_t * info, const char *hints); /** * Function to free memory previously allocated by mca_mpool_base_alloc @@ -261,21 +187,11 @@ OPAL_DECLSPEC int mca_mpool_base_free(void * base); */ OPAL_DECLSPEC int mca_mpool_base_tree_node_compare(void * key1, void * key2); 
- -OPAL_DECLSPEC int mca_mpool_base_insert( - void * addr, - size_t size, - mca_mpool_base_module_t* mpool, - void* user_in, - mca_mpool_base_registration_t* registration); - -OPAL_DECLSPEC int mca_mpool_base_remove(void * base); - /** * Macro for use in components that are of type mpool */ -#define MCA_MPOOL_BASE_VERSION_2_0_0 \ - OPAL_MCA_BASE_VERSION_2_1_0("mpool", 2, 0, 0) +#define MCA_MPOOL_BASE_VERSION_3_0_0 \ + OPAL_MCA_BASE_VERSION_2_1_0("mpool", 3, 0, 0) #endif /* MCA_MPOOL_H */ diff --git a/opal/mca/mpool/sm/Makefile.am b/opal/mca/mpool/sm/Makefile.am deleted file mode 100644 index 1d196ed723..0000000000 --- a/opal/mca/mpool/sm/Makefile.am +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2013 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011 NVIDIA Corporation. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - mpool_sm.h \ - mpool_sm_module.c \ - mpool_sm_component.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_opal_mpool_sm_DSO -component_noinst = -component_install = mca_mpool_sm.la -else -component_noinst = libmca_mpool_sm.la -component_install = -endif - -# See opal/mca/common/sm/Makefile.am for an explanation of -# libmca_common_sm.la. 
- -mcacomponentdir = $(opallibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_mpool_sm_la_SOURCES = $(sources) -mca_mpool_sm_la_LDFLAGS = -module -avoid-version -mca_mpool_sm_la_LIBADD = \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/sm/lib@OPAL_LIB_PREFIX@mca_common_sm.la -if OPAL_cuda_support -mca_mpool_sm_la_LIBADD += \ - $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/lib@OPAL_LIB_PREFIX@mca_common_cuda.la -endif - -noinst_LTLIBRARIES = $(component_noinst) -libmca_mpool_sm_la_SOURCES = $(sources) -libmca_mpool_sm_la_LDFLAGS = -module -avoid-version diff --git a/opal/mca/mpool/sm/mpool_sm.h b/opal/mca/mpool/sm/mpool_sm.h deleted file mode 100644 index 23044cef36..0000000000 --- a/opal/mca/mpool/sm/mpool_sm.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2012 Los Alamos National Security, LLC. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_MPOOL_SM_H -#define MCA_MPOOL_SM_H - -#include "opal_config.h" - -#include "opal/mca/event/event.h" -#include "opal/mca/shmem/shmem.h" - -#include "opal/mca/common/sm/common_sm.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/mca/allocator/allocator.h" - -BEGIN_C_DECLS - -struct mca_mpool_sm_component_t { - mca_mpool_base_component_t super; - /* mca_allocator_base_module_t* sm_allocator; */ - char *sm_allocator_name; - int verbose; - /* struct mca_mpool_sm_mmap_t *sm_mmap; */ -}; -typedef struct mca_mpool_sm_component_t mca_mpool_sm_component_t; - -typedef struct mca_mpool_base_resources_t { - size_t size; - int32_t mem_node; - /* backing store metadata */ - opal_shmem_ds_t bs_meta_buf; -} mca_mpool_base_resources_t; - -OPAL_MODULE_DECLSPEC extern mca_mpool_sm_component_t mca_mpool_sm_component; - -typedef struct mca_mpool_sm_module_t { - mca_mpool_base_module_t super; - long sm_size; - mca_allocator_base_module_t *sm_allocator; - struct mca_mpool_sm_mmap_t *sm_mmap; - mca_common_sm_module_t *sm_common_module; - int32_t mem_node; -} mca_mpool_sm_module_t; - -/* - * Initializes the mpool module. - */ -void mca_mpool_sm_module_init(mca_mpool_sm_module_t* mpool); - - -/* - * Returns base address of shared memory mapping. - */ -void* mca_mpool_sm_base(mca_mpool_base_module_t*); - -/** - * Allocate block of shared memory. 
- */ -void* mca_mpool_sm_alloc( - mca_mpool_base_module_t* mpool, - size_t size, - size_t align, - uint32_t flags, - mca_mpool_base_registration_t** registration); - -/** - * realloc function typedef - */ -void* mca_mpool_sm_realloc( - mca_mpool_base_module_t* mpool, - void* addr, - size_t size, - mca_mpool_base_registration_t** registration); - -/** - * free function typedef - */ -void mca_mpool_sm_free( - mca_mpool_base_module_t* mpool, - void * addr, - mca_mpool_base_registration_t* registration); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_mpool_sm_ft_event(int state); - -END_C_DECLS - -#endif diff --git a/opal/mca/mpool/sm/mpool_sm_component.c b/opal/mca/mpool/sm/mpool_sm_component.c deleted file mode 100644 index bee668b400..0000000000 --- a/opal/mca/mpool/sm/mpool_sm_component.c +++ /dev/null @@ -1,210 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2010-2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2014 NVIDIA Corporation. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" -#ifdef HAVE_UNISTD_H -#include -#endif /* HAVE_UNISTD_H*/ -#include -#include -#include "opal/mca/base/base.h" - -#include "opal/mca/allocator/base/base.h" -#include "mpool_sm.h" -#include "opal/mca/common/sm/common_sm.h" - -#if OPAL_ENABLE_FT_CR == 1 -#include "opal/runtime/opal_cr.h" -#endif - -/* - * Local functions - */ -static int -mca_mpool_sm_register(void); - -static int -mca_mpool_sm_open(void); - -static int -mca_mpool_sm_close(void); - -static mca_mpool_base_module_t * -mca_mpool_sm_init(struct mca_mpool_base_resources_t* resources); - -mca_mpool_sm_component_t mca_mpool_sm_component = { - { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - .mpool_version = { - MCA_MPOOL_BASE_VERSION_2_0_0, - - .mca_component_name = "sm", - MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION), - .mca_open_component = mca_mpool_sm_open, - .mca_close_component = mca_mpool_sm_close, - .mca_register_component_params = mca_mpool_sm_register, - }, - .mpool_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - .mpool_init = mca_mpool_sm_init, - } -}; - -static long default_min = 134217728; -static unsigned long long opal_mpool_sm_min_size; -static int opal_mpool_sm_verbose; - -static int mca_mpool_sm_register(void) -{ - /* register SM component parameters */ - (void) mca_base_var_group_component_register(&mca_mpool_sm_component.super.mpool_version, - "Shared memory pool"); - - mca_mpool_sm_component.sm_allocator_name = "bucket"; - (void) mca_base_component_var_register(&mca_mpool_sm_component.super.mpool_version, - "allocator", "Name of allocator component " - "to use with sm mpool", MCA_BASE_VAR_TYPE_STRING, - NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_sm_component.sm_allocator_name); - - /* 
register as an unsigned long long to get up to 64 bits for the size */ - opal_mpool_sm_min_size = default_min; - (void) mca_base_component_var_register(&mca_mpool_sm_component.super.mpool_version, - "min_size", "Minimum size of the sm mpool shared memory file", - MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &opal_mpool_sm_min_size); - - opal_mpool_sm_verbose = 0; - (void) mca_base_component_var_register(&mca_mpool_sm_component.super.mpool_version, - "verbose", "Enable verbose output for mpool sm component", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &opal_mpool_sm_verbose); - - return OPAL_SUCCESS; -} - -/** - * component open/close/init function - */ -static int mca_mpool_sm_open(void) -{ - if (opal_mpool_sm_verbose != 0) { - mca_mpool_sm_component.verbose = opal_output_open(NULL); - } else { - mca_mpool_sm_component.verbose = -1; - } - - return OPAL_SUCCESS; -} - -static int mca_mpool_sm_close( void ) -{ - return OPAL_SUCCESS; -} - -static mca_mpool_base_module_t * -mca_mpool_sm_init(struct mca_mpool_base_resources_t *resources) -{ - mca_mpool_sm_module_t *mpool_module; - mca_allocator_base_component_t* allocator_component; - - /* Make a new mpool module */ - mpool_module = - (mca_mpool_sm_module_t *)malloc(sizeof(mca_mpool_sm_module_t)); - mca_mpool_sm_module_init(mpool_module); - - /* set sm_size */ - mpool_module->sm_size = resources->size; - - /* clip at the min size */ - if (mpool_module->sm_size < (long) opal_mpool_sm_min_size) { - mpool_module->sm_size = (long) opal_mpool_sm_min_size; - } - - allocator_component = mca_allocator_component_lookup( - mca_mpool_sm_component.sm_allocator_name); - - /* if specified allocator cannot be loaded - look for an alternative */ - if (NULL == allocator_component) { - if (opal_list_get_size(&opal_allocator_base_framework.framework_components) == 0) { - mca_base_component_list_item_t *item = - 
(mca_base_component_list_item_t *) - opal_list_get_first(&opal_allocator_base_framework.framework_components); - allocator_component = - (mca_allocator_base_component_t *)item->cli_component; - opal_output( - 0, "mca_mpool_sm_init: " - "unable to locate allocator: %s - using %s\n", - mca_mpool_sm_component.sm_allocator_name, - allocator_component->allocator_version.mca_component_name); - } else { - opal_output(0, "mca_mpool_sm_init: " - "unable to locate allocator: %s\n", - mca_mpool_sm_component.sm_allocator_name); - free(mpool_module); - return NULL; - } - } - - mpool_module->mem_node = resources->mem_node; - - opal_output(mca_mpool_sm_component.verbose, - "mca_mpool_sm_init: shared memory size used: (%ld)", - mpool_module->sm_size); - - if (NULL == (mpool_module->sm_common_module = - mca_common_sm_module_attach(&resources->bs_meta_buf, - sizeof(mca_common_sm_module_t), 8))) { - opal_output(mca_mpool_sm_component.verbose, "mca_mpool_sm_init: " - "unable to create shared memory mapping (%s)", - resources->bs_meta_buf.seg_name); - free(mpool_module); - return NULL; - } - - /* setup allocator */ - mpool_module->sm_allocator = - allocator_component->allocator_init(true, - mca_common_sm_seg_alloc, - NULL, &(mpool_module->super)); - if (NULL == mpool_module->sm_allocator) { - opal_output(0, "mca_mpool_sm_init: unable to initialize allocator"); - free(mpool_module); - return NULL; - } - - return &mpool_module->super; -} - diff --git a/opal/mca/mpool/udreg/mpool_udreg.h b/opal/mca/mpool/udreg/mpool_udreg.h deleted file mode 100644 index 50d8d37e36..0000000000 --- a/opal/mca/mpool/udreg/mpool_udreg.h +++ /dev/null @@ -1,174 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. 
All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - */ -#ifndef MCA_MPOOL_UDREG_H -#define MCA_MPOOL_UDREG_H - -#include "opal_config.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_free_list.h" -#include "opal/mca/event/event.h" -#include "opal/mca/mpool/mpool.h" -#include "opal/util/proc.h" -#if HAVE_SYS_MMAN_H -#include -#endif - -BEGIN_C_DECLS - -struct mca_mpool_udreg_component_t { - mca_mpool_base_component_t super; - bool print_stats; - int leave_pinned; - opal_list_t huge_pages; - bool use_huge_pages; -}; -typedef struct mca_mpool_udreg_component_t mca_mpool_udreg_component_t; - -OPAL_DECLSPEC extern mca_mpool_udreg_component_t mca_mpool_udreg_component; - -struct mca_mpool_udreg_module_t; - -struct mca_mpool_base_resources_t { - /* the start of this mpool should match grdma */ - char *pool_name; - void *reg_data; - size_t sizeof_reg; - int (*register_mem)(void *reg_data, void *base, size_t size, - mca_mpool_base_registration_t *reg); - int (*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg); - - /* udreg specific resources */ - bool use_kernel_cache; - bool use_evict_w_unreg; - int max_entries; - size_t page_size; -}; -typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; - -struct mca_mpool_udreg_hugepage_t { - opal_list_item_t super; - unsigned long page_size; - char *path; - opal_list_t allocations; - int cnt; -}; -typedef struct mca_mpool_udreg_hugepage_t mca_mpool_udreg_hugepage_t; - -OBJ_CLASS_DECLARATION(mca_mpool_udreg_hugepage_t); - -struct 
mca_mpool_udreg_hugepage_alloc_t { - opal_list_item_t super; - int fd; - char *path; - void *ptr; - size_t size; - mca_mpool_udreg_hugepage_t *huge_table; -}; -typedef struct mca_mpool_udreg_hugepage_alloc_t mca_mpool_udreg_hugepage_alloc_t; - -OBJ_CLASS_DECLARATION(mca_mpool_udreg_hugepage_pool_item_t); - -struct mca_mpool_udreg_module_t { - mca_mpool_base_module_t super; - struct mca_mpool_base_resources_t resources; - opal_free_list_t reg_list; - mca_mpool_udreg_hugepage_t *huge_page; - opal_mutex_t lock; - void *udreg_handle; - /** used to communicate the access flags to the underlying registration - * function */ - int requested_access_flags; -}; -typedef struct mca_mpool_udreg_module_t mca_mpool_udreg_module_t; - - -/* - * Initializes the mpool module. - */ -int mca_mpool_udreg_module_init(mca_mpool_udreg_module_t *mpool); - -/* - * Returns base address of shared memory mapping. - */ -void *mca_mpool_udreg_base(mca_mpool_base_module_t *mpool); - -/** - * Allocate block of registered memory. 
- */ -void* mca_mpool_udreg_alloc(mca_mpool_base_module_t *mpool, size_t size, - size_t align, uint32_t flags, - mca_mpool_base_registration_t** registration); - -/** - * realloc block of registered memory - */ -void* mca_mpool_udreg_realloc( mca_mpool_base_module_t *mpool, void* addr, - size_t size, mca_mpool_base_registration_t** registration); - -/** - * register block of memory - */ -int mca_mpool_udreg_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); - -/** - * deregister memory - */ -int mca_mpool_udreg_deregister(mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg); - -/** - * free memory allocated by alloc function - */ -void mca_mpool_udreg_free(mca_mpool_base_module_t *mpool, void * addr, - mca_mpool_base_registration_t *reg); - -/** - * find registration for a given block of memory - */ -int mca_mpool_udreg_find(struct mca_mpool_base_module_t* mpool, void* addr, - size_t size, mca_mpool_base_registration_t **reg); - -/** - * finalize mpool - */ -void mca_mpool_udreg_finalize(struct mca_mpool_base_module_t *mpool); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_mpool_udreg_ft_event(int state); - -/** - * evict one unused registration from the mpool's lru. - * @return true on success, false on failure - */ -bool mca_mpool_udreg_evict (struct mca_mpool_base_module_t *mpool); - -END_C_DECLS -#endif diff --git a/opal/mca/mpool/udreg/mpool_udreg_component.c b/opal/mca/mpool/udreg/mpool_udreg_component.c deleted file mode 100644 index 64e7da92e0..0000000000 --- a/opal/mca/mpool/udreg/mpool_udreg_component.c +++ /dev/null @@ -1,206 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. 
- * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 -#include "opal_config.h" -#include "opal/mca/base/base.h" -#include "opal/runtime/opal_params.h" -#include "mpool_udreg.h" -#ifdef HAVE_UNISTD_H -#include -#endif -#ifdef HAVE_MALLOC_H -#include -#endif - -#include - -/* - * Local functions - */ -static int udreg_open(void); -static int udreg_close(void); -static int udreg_register(void); -static mca_mpool_base_module_t* udreg_init( - struct mca_mpool_base_resources_t* resources); - -mca_mpool_udreg_component_t mca_mpool_udreg_component = { - { - /* First, the mca_base_component_t struct containing meta - information about the component itself */ - - .mpool_version ={ - MCA_MPOOL_BASE_VERSION_2_0_0, - - .mca_component_name = "udreg", - MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION), - .mca_open_component = udreg_open, - .mca_close_component = udreg_close, - .mca_register_component_params = udreg_register, - }, - .mpool_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - - .mpool_init = udreg_init - } -}; - -/** - * component open/close/init function - */ -static int udreg_open(void) -{ - OBJ_CONSTRUCT(&mca_mpool_udreg_component.huge_pages, opal_list_t); - - return OPAL_SUCCESS; -} - - -static int udreg_register(void) -{ - 
mca_mpool_udreg_component.print_stats = false; - (void) mca_base_component_var_register(&mca_mpool_udreg_component.super.mpool_version, - "print_stats", "print pool usage statistics at the end of the run", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_udreg_component.print_stats); - - return OPAL_SUCCESS; -} - - -static int udreg_close(void) -{ - opal_list_item_t *item; - - while (NULL != (item = opal_list_remove_first (&mca_mpool_udreg_component.huge_pages))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&mca_mpool_udreg_component.huge_pages); - - return OPAL_SUCCESS; -} - -static int page_compare (opal_list_item_t **a, - opal_list_item_t **b) { - mca_mpool_udreg_hugepage_t *pagea = (mca_mpool_udreg_hugepage_t *) *a; - mca_mpool_udreg_hugepage_t *pageb = (mca_mpool_udreg_hugepage_t *) *b; - if (pagea->page_size > pageb->page_size) { - return 1; - } else if (pagea->page_size < pageb->page_size) { - return -1; - } - - return 0; -} - -static void udreg_find_hugepages (void) { - FILE *fh; - char *path; - char buffer[1024]; - char *ctx, *tok; - - fh = fopen ("/proc/mounts", "r"); - if (NULL == fh) { - return; - } - - while (fgets (buffer, 1024, fh)) { - mca_mpool_udreg_hugepage_t *pool; - - (void) strtok_r (buffer, " ", &ctx); - path = strtok_r (NULL, " ", &ctx); - tok = strtok_r (NULL, " ", &ctx); - - if (0 != strcmp (tok, "hugetlbfs")) { - continue; - } - - pool = OBJ_NEW(mca_mpool_udreg_hugepage_t); - if (NULL == pool) { - break; - } - - pool->path = strdup (path); - - tok = strtok_r (NULL, " ", &ctx); - tok = strtok_r (tok, ",", &ctx); - - do { - if (0 == strncmp (tok, "pagesize", 8)) { - break; - } - tok = strtok_r (NULL, ",", &ctx); - } while (tok); - sscanf (tok, "pagesize=%lu", &pool->page_size); - - opal_list_append (&mca_mpool_udreg_component.huge_pages, &pool->super); - } - - fclose (fh); - - opal_list_sort (&mca_mpool_udreg_component.huge_pages, page_compare); - - mca_mpool_udreg_component.use_huge_pages 
= - !!(opal_list_get_size (&mca_mpool_udreg_component.huge_pages)); -} - - - -static mca_mpool_base_module_t * -udreg_init(struct mca_mpool_base_resources_t *resources) -{ - mca_mpool_udreg_module_t* mpool_module; - static int inited = false; - int rc; - - /* Set this here (vs in component.c) because - opal_leave_pinned* may have been set after MCA params were - read (e.g., by the openib btl) */ - mca_mpool_udreg_component.leave_pinned = (int) - (1 == opal_leave_pinned || opal_leave_pinned_pipeline); - - if (!inited) { - inited = true; - udreg_find_hugepages (); - } - - mpool_module = - (mca_mpool_udreg_module_t *) malloc (sizeof (mca_mpool_udreg_module_t)); - - memmove (&mpool_module->resources, resources, sizeof (*resources)); - - rc = mca_mpool_udreg_module_init(mpool_module); - if (OPAL_SUCCESS != rc) { - free (mpool_module); - return NULL; - } - - return &mpool_module->super; -} diff --git a/opal/mca/mpool/udreg/mpool_udreg_module.c b/opal/mca/mpool/udreg/mpool_udreg_module.c deleted file mode 100644 index 66243300ff..0000000000 --- a/opal/mca/mpool/udreg/mpool_udreg_module.c +++ /dev/null @@ -1,546 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2007 Mellanox Technologies. All rights reserved. - * Copyright (c) 2010 IBM Corporation. All rights reserved. 
- * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights - * reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 -#include "opal_config.h" -#include "opal/align.h" -#include "mpool_udreg.h" -#include -#include -#ifdef HAVE_MALLOC_H -#include -#endif -#include "opal/mca/mpool/base/base.h" -#include "opal/runtime/opal_params.h" -#include "opal/include/opal_stdint.h" - -#include - -#include - -#include - -static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_context); -static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context); - -static void mca_mpool_udreg_hugepage_constructor (mca_mpool_udreg_hugepage_t *huge_page) -{ - memset ((char *)huge_page + sizeof(huge_page->super), 0, sizeof (*huge_page) - sizeof (huge_page->super)); - OBJ_CONSTRUCT(&huge_page->allocations, opal_list_t); -} - -static void mca_mpool_udreg_hugepage_destructor (mca_mpool_udreg_hugepage_t *huge_page) -{ - opal_list_item_t *item; - - if (huge_page->path) { - free (huge_page->path); - } - - while (NULL != (item = opal_list_remove_first (&huge_page->allocations))) { - OBJ_RELEASE(item); - } - - OBJ_DESTRUCT(&huge_page->allocations); -} - -OBJ_CLASS_INSTANCE(mca_mpool_udreg_hugepage_t, opal_list_item_t, - mca_mpool_udreg_hugepage_constructor, - mca_mpool_udreg_hugepage_destructor); - -static void mca_mpool_udreg_hugepage_alloc_constructor (mca_mpool_udreg_hugepage_alloc_t *alloc) -{ - memset ((char *)alloc + sizeof(alloc->super), 0, sizeof (*alloc) - sizeof (alloc->super)); - alloc->fd = -1; -} - -static void mca_mpool_udreg_hugepage_alloc_destructor (mca_mpool_udreg_hugepage_alloc_t *alloc) -{ - if (NULL != alloc->ptr) { - munmap (alloc->ptr, alloc->size); - } - - if (NULL == alloc->path) { - return; - } - - free (alloc->path); -} - -OBJ_CLASS_INSTANCE(mca_mpool_udreg_hugepage_alloc_t, opal_list_item_t, - mca_mpool_udreg_hugepage_alloc_constructor, - 
mca_mpool_udreg_hugepage_alloc_destructor); - - -static mca_mpool_udreg_hugepage_t *udreg_find_matching_pagesize (size_t size) { - mca_mpool_udreg_hugepage_t *huge_table; - opal_list_item_t *item; - - for (item = opal_list_get_first (&mca_mpool_udreg_component.huge_pages) ; - item != opal_list_get_end (&mca_mpool_udreg_component.huge_pages) ; - item = opal_list_get_next (item)) { - huge_table = (mca_mpool_udreg_hugepage_t *) item; - - if (huge_table->page_size == size) { - return huge_table; - } - } - - return NULL; -} - - -/* - * Initializes the mpool module. - */ -int mca_mpool_udreg_module_init(mca_mpool_udreg_module_t* mpool) -{ - struct udreg_cache_attr cache_attr; - int urc; - - mpool->super.mpool_component = &mca_mpool_udreg_component.super; - mpool->super.mpool_base = NULL; /* no base .. */ - mpool->super.mpool_alloc = mca_mpool_udreg_alloc; - mpool->super.mpool_realloc = mca_mpool_udreg_realloc; - mpool->super.mpool_free = mca_mpool_udreg_free; - mpool->super.mpool_register = mca_mpool_udreg_register; - mpool->super.mpool_find = mca_mpool_udreg_find; - mpool->super.mpool_deregister = mca_mpool_udreg_deregister; - /* This module relies on udreg for notification of memory release */ - mpool->super.mpool_release_memory = NULL; - mpool->super.mpool_finalize = mca_mpool_udreg_finalize; - mpool->super.mpool_ft_event = mca_mpool_udreg_ft_event; - mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM | MCA_MPOOL_FLAGS_NO_HOOKS; - - if (4096 < mpool->resources.page_size) { - mpool->huge_page = udreg_find_matching_pagesize (mpool->resources.page_size); - } else { - mpool->huge_page = NULL; - } - - cache_attr.modes = 0; - - /* Create udreg cache */ - if (mpool->resources.use_kernel_cache) { - cache_attr.modes |= UDREG_CC_MODE_USE_KERNEL_CACHE; - } - - if (mpool->resources.use_evict_w_unreg) { - cache_attr.modes |= UDREG_CC_MODE_USE_EVICT_W_UNREG; - } - - if (mca_mpool_udreg_component.leave_pinned) { - cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG; - } - - 
OBJ_CONSTRUCT(&mpool->lock,opal_mutex_t); - - strncpy (cache_attr.cache_name, mpool->resources.pool_name, UDREG_MAX_CACHENAME_LEN); - cache_attr.max_entries = mpool->resources.max_entries; - cache_attr.debug_mode = 0; - cache_attr.debug_rank = 0; - cache_attr.reg_context = mpool; - cache_attr.dreg_context = mpool; - cache_attr.destructor_context = mpool; - cache_attr.device_reg_func = mca_mpool_udreg_reg_func; - cache_attr.device_dereg_func = mca_mpool_udreg_dereg_func; - cache_attr.destructor_callback = NULL; - - /* attempt to create the udreg cache. this will fail if one already exists */ - (void) UDREG_CacheCreate (&cache_attr); - - urc = UDREG_CacheAccess (mpool->resources.pool_name, (udreg_cache_handle_t *) &mpool->udreg_handle); - if (UDREG_RC_SUCCESS != urc) { - return OPAL_ERROR; - } - - OBJ_CONSTRUCT(&mpool->reg_list, opal_free_list_t); - opal_free_list_init (&mpool->reg_list, mpool->resources.sizeof_reg, - opal_cache_line_size, - OBJ_CLASS(mca_mpool_base_registration_t), - 0, opal_cache_line_size, 0, -1, 32, NULL, 0, - NULL, NULL, NULL); - - return OPAL_SUCCESS; -} - -/* udreg callback functions */ -static void *mca_mpool_udreg_reg_func (void *addr, uint64_t len, void *reg_context) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) reg_context; - mca_mpool_base_registration_t *udreg_reg; - opal_free_list_item_t *item; - int rc; - - item = opal_free_list_get (&mpool_udreg->reg_list); - if (NULL == item) { - return NULL; - } - udreg_reg = (mca_mpool_base_registration_t *) item; - - udreg_reg->mpool = reg_context; - udreg_reg->base = addr; - udreg_reg->bound = (void *)((uintptr_t) addr + len); - /* pull the access flags out of the mpool module */ - udreg_reg->access_flags = mpool_udreg->requested_access_flags; - - rc = mpool_udreg->resources.register_mem(mpool_udreg->resources.reg_data, - addr, len, udreg_reg); - if (OPAL_SUCCESS != rc) { - opal_free_list_return (&mpool_udreg->reg_list, item); - udreg_reg = NULL; - } - - return 
udreg_reg; -} - -static uint32_t mca_mpool_udreg_dereg_func (void *device_data, void *dreg_context) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) dreg_context; - mca_mpool_base_registration_t *udreg_reg = (mca_mpool_base_registration_t *) device_data; - int rc; - - if (udreg_reg->ref_count) { - /* there are still users of this registration. leave it alone */ - return 0; - } - - rc = mpool_udreg->resources.deregister_mem(mpool_udreg->resources.reg_data, udreg_reg); - - if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { - opal_free_list_return (&mpool_udreg->reg_list, - (opal_free_list_item_t *) udreg_reg); - } - /* might be worth printing out a warning if an error occurs here */ - - return 0; -} - -/* */ - -static int mca_mpool_udreg_alloc_huge (mca_mpool_udreg_module_t *mpool, size_t size, - void **addr, void **base_addr) { - mca_mpool_udreg_hugepage_alloc_t *alloc; - int rc; - - alloc = OBJ_NEW(mca_mpool_udreg_hugepage_alloc_t); - alloc->size = size; - - rc = asprintf (&alloc->path, "%s/hugepage.openmpi.%d.%d", mpool->huge_page->path, - getpid (), mpool->huge_page->cnt++); - if (0 > rc) { - OBJ_RELEASE(alloc); - return -1; - } - - alloc->fd = open (alloc->path, O_RDWR | O_CREAT, 0600); - if (-1 == alloc->fd) { - OBJ_RELEASE(alloc); - return -1; - } - - if (0 != ftruncate (alloc->fd, size)) { - close (alloc->fd); - unlink (alloc->path); - OBJ_RELEASE(alloc); - return -1; - } - - alloc->ptr = mmap (NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, - alloc->fd, 0); - if (NULL == alloc->ptr) { - OBJ_RELEASE(alloc); - return -1; - } - - close (alloc->fd); - unlink (alloc->path); - - alloc->huge_table = mpool->huge_page; - - opal_list_append (&mpool->huge_page->allocations, &alloc->super); - - *addr = alloc->ptr; - *base_addr = alloc; - - return 0; -} - - -static void mca_mpool_udreg_free_huge (mca_mpool_udreg_hugepage_alloc_t *alloc) { - opal_list_remove_item (&alloc->huge_table->allocations, &alloc->super); - OBJ_RELEASE(alloc); -} - -/** - * allocate 
function - */ -void* mca_mpool_udreg_alloc(mca_mpool_base_module_t *mpool, size_t size, - size_t align, uint32_t flags, mca_mpool_base_registration_t **reg) -{ - mca_mpool_udreg_module_t *udreg_module = (mca_mpool_udreg_module_t *) mpool; - void *base_addr, *addr; - - if(0 == align) - align = mca_mpool_base_page_size; - -#if OPAL_CUDA_SUPPORT - /* CUDA cannot handle registering overlapping regions, so make - * sure each region is page sized and page aligned. */ - align = mca_mpool_base_page_size; - size = OPAL_ALIGN(size, mca_mpool_base_page_size, size_t); -#endif - - addr = base_addr = NULL; - - if (NULL != udreg_module->huge_page) { - size = OPAL_ALIGN(size, udreg_module->huge_page->page_size, size_t); - mca_mpool_udreg_alloc_huge (udreg_module, size, &addr, &base_addr); - } else { -#ifdef HAVE_POSIX_MEMALIGN - if((errno = posix_memalign(&base_addr, align, size)) != 0) - return NULL; - - addr = base_addr; -#else - base_addr = malloc(size + align); - if(NULL == base_addr) - return NULL; - - addr = (void*)OPAL_ALIGN((uintptr_t)base_addr, align, uintptr_t); -#endif - } - - if (OPAL_SUCCESS != mca_mpool_udreg_register(mpool, addr, size, flags, MCA_MPOOL_ACCESS_ANY, reg)) { - if (udreg_module->huge_page) { - mca_mpool_udreg_free_huge ((mca_mpool_udreg_hugepage_alloc_t *) base_addr); - } else { - free(base_addr); - } - - return NULL; - } - - (*reg)->alloc_base = (unsigned char *) base_addr; - - return addr; -} - -bool mca_mpool_udreg_evict (struct mca_mpool_base_module_t *mpool) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool; - udreg_return_t urc; - - urc = UDREG_Evict (mpool_udreg->udreg_handle); - return (UDREG_RC_SUCCESS == urc); -} - -/* - * register memory - */ -int mca_mpool_udreg_register(mca_mpool_base_module_t *mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, - mca_mpool_base_registration_t **reg) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool; - 
mca_mpool_base_registration_t *udreg_reg, *old_reg; - bool bypass_cache = !!(flags & MCA_MPOOL_FLAGS_CACHE_BYPASS); - udreg_entry_t *udreg_entry; - udreg_return_t urc; - - *reg = NULL; - - OPAL_THREAD_LOCK(&mpool_udreg->lock); - - /* we hold the lock so no other thread can modify these flags until the registration is complete */ - mpool_udreg->requested_access_flags = access_flags; - - if (false == bypass_cache) { - /* Get a udreg entry for this region */ - do { - while (UDREG_RC_SUCCESS != - (urc = UDREG_Register (mpool_udreg->udreg_handle, addr, size, &udreg_entry))) { - /* try to remove one unused reg and retry */ - if (!mca_mpool_udreg_evict (mpool)) { - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - udreg_reg = (mca_mpool_base_registration_t *) udreg_entry->device_data; - - if ((udreg_reg->access_flags & access_flags) == access_flags) { - /* sufficient access */ - break; - } - - old_reg = udreg_reg; - - /* to not confuse udreg make sure the new registration covers the same address - * range as the old one. 
*/ - addr = old_reg->base; - size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base); - - /* make the new access flags more permissive */ - mpool_udreg->requested_access_flags = access_flags | old_reg->access_flags; - - /* get a new registration */ - udreg_reg = mca_mpool_udreg_reg_func (addr, size, mpool); - if (NULL == udreg_reg) { - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* update the device data with the new registration */ - udreg_entry->device_data = udreg_reg; - - /* ensure that mca_mpool_udreg_deregister does not call into udreg since - * we are forcefully evicting the registration here */ - old_reg->flags |= MCA_MPOOL_FLAGS_CACHE_BYPASS | MCA_MPOOL_FLAGS_INVALID; - - mca_mpool_udreg_dereg_func (old_reg, mpool); - } while (0); - - udreg_reg->mpool_context = udreg_entry; - } else { - /* if cache bypass is requested don't use the udreg cache */ - while (NULL == (udreg_reg = mca_mpool_udreg_reg_func (addr, size, mpool))) { - /* try to remove one unused reg and retry */ - if (!mca_mpool_udreg_evict (mpool)) { - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - udreg_reg->mpool_context = NULL; - } - - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - - udreg_reg->flags = flags; - - *reg = udreg_reg; - udreg_reg->ref_count++; - - return OPAL_SUCCESS; -} - - -/** - * realloc function - */ -void* mca_mpool_udreg_realloc(mca_mpool_base_module_t *mpool, void *addr, - size_t size, mca_mpool_base_registration_t **reg) -{ - mca_mpool_base_registration_t *old_reg = *reg; - void *new_mem = mca_mpool_udreg_alloc(mpool, size, 0, old_reg->flags, reg); - memcpy(new_mem, addr, old_reg->bound - old_reg->base + 1); - mca_mpool_udreg_free(mpool, addr, old_reg); - - return new_mem; -} - -/** - * free function - */ -void mca_mpool_udreg_free(mca_mpool_base_module_t *mpool, void *addr, - mca_mpool_base_registration_t *registration) -{ - mca_mpool_udreg_module_t *udreg_module = 
(mca_mpool_udreg_module_t *) mpool; - mca_mpool_udreg_deregister(mpool, registration); - - if (udreg_module->huge_page) { - mca_mpool_udreg_free_huge ((mca_mpool_udreg_hugepage_alloc_t *) registration->alloc_base); - } else { - free (registration->alloc_base); - } -} - -int mca_mpool_udreg_find(struct mca_mpool_base_module_t *mpool, void *addr, - size_t size, mca_mpool_base_registration_t **reg) -{ - *reg = NULL; - return OPAL_ERR_NOT_FOUND; -} - -int mca_mpool_udreg_deregister(struct mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t *) mpool; - - assert(reg->ref_count > 0); - - --reg->ref_count; - - if (!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) { - OPAL_THREAD_LOCK(&mpool_udreg->lock); - UDREG_DecrRefcount (mpool_udreg->udreg_handle, reg->mpool_context); - OPAL_THREAD_UNLOCK(&mpool_udreg->lock); - } else { - mca_mpool_udreg_dereg_func (reg, mpool); - } - - return OPAL_SUCCESS; -} - -void mca_mpool_udreg_finalize(struct mca_mpool_base_module_t *mpool) -{ - mca_mpool_udreg_module_t *mpool_udreg = (mca_mpool_udreg_module_t*)mpool; - - /* Statistic */ - if (true == mca_mpool_udreg_component.print_stats) { - uint64_t hit = 0, miss = 0, evicted = 0; - - (void) UDREG_GetStat (mpool_udreg->udreg_handle, - UDREG_STAT_CACHE_HIT, &hit); - - (void) UDREG_GetStat (mpool_udreg->udreg_handle, - UDREG_STAT_CACHE_MISS, &miss); - - (void) UDREG_GetStat (mpool_udreg->udreg_handle, - UDREG_STAT_CACHE_EVICTED, &evicted); - - opal_output(0, "%s udreg: stats (hit/miss/evicted): %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n", - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), hit, miss, evicted); - } - - UDREG_CacheRelease (mpool_udreg->udreg_handle); - OBJ_DESTRUCT(&mpool_udreg->reg_list); - OBJ_DESTRUCT(&mpool_udreg->lock); -} - -int mca_mpool_udreg_ft_event(int state) { - return OPAL_SUCCESS; -} - - - - - - - - - - diff --git a/opal/mca/mpool/udreg/owner.txt b/opal/mca/mpool/udreg/owner.txt deleted file mode 
100644 index 52961b5d12..0000000000 --- a/opal/mca/mpool/udreg/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: LANL -status: maintenance diff --git a/opal/mca/rcache/base/Makefile.am b/opal/mca/rcache/base/Makefile.am index f000f5cc2a..7b7782a1fc 100644 --- a/opal/mca/rcache/base/Makefile.am +++ b/opal/mca/rcache/base/Makefile.am @@ -9,8 +9,8 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2013 Los Alamos National Security, LLC. -# All rights reserved +# Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights +# reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -19,8 +19,15 @@ # headers += \ - base/base.h + base/base.h \ + base/rcache_base_vma.h \ + base/rcache_base_vma_tree.h \ + base/rcache_base_mem_cb.h + libmca_rcache_la_SOURCES += \ base/rcache_base_frame.c \ - base/rcache_base_create.c + base/rcache_base_create.c \ + base/rcache_base_vma.c \ + base/rcache_base_vma_tree.c \ + base/rcache_base_mem_cb.c diff --git a/opal/mca/rcache/base/base.h b/opal/mca/rcache/base/base.h index 439f78535b..d4bfeb27ee 100644 --- a/opal/mca/rcache/base/base.h +++ b/opal/mca/rcache/base/base.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,8 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2012-2013 Los Alamos National Security, LLC. - * All rights reserved + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +35,8 @@ BEGIN_C_DECLS /* * create a module by name */ -OPAL_DECLSPEC mca_rcache_base_module_t* mca_rcache_base_module_create(const char* name); +OPAL_DECLSPEC mca_rcache_base_module_t *mca_rcache_base_module_create (const char *name, void *user_data, + mca_rcache_base_resources_t *rcache_resources); /* * MCA framework @@ -45,13 +47,18 @@ struct mca_rcache_base_selected_module_t { opal_list_item_t super; mca_rcache_base_component_t *rcache_component; mca_rcache_base_module_t *rcache_module; + void *user_data; }; typedef struct mca_rcache_base_selected_module_t mca_rcache_base_selected_module_t; OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_rcache_base_selected_module_t); -OPAL_DECLSPEC mca_rcache_base_component_t* mca_rcache_base_component_lookup(const char* name); -OPAL_DECLSPEC mca_rcache_base_module_t* mca_rcache_base_module_lookup(const char* name); +OPAL_DECLSPEC mca_rcache_base_component_t *mca_rcache_base_component_lookup(const char *name); +OPAL_DECLSPEC mca_rcache_base_module_t *mca_rcache_base_module_lookup (const char *name); +OPAL_DECLSPEC int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module); + +/* only used within base -- no need to DECLSPEC */ +extern int mca_rcache_base_used_mem_hooks; /* * Globals diff --git a/opal/mca/rcache/base/rcache_base_create.c b/opal/mca/rcache/base/rcache_base_create.c index b2ae62b25f..1219677d66 100644 --- a/opal/mca/rcache/base/rcache_base_create.c +++ b/opal/mca/rcache/base/rcache_base_create.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -25,33 +28,105 @@ #include "opal/mca/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" +#include "opal/mca/rcache/base/rcache_base_mem_cb.h" +#include "opal/util/show_help.h" +#include "opal/util/proc.h" +#include "opal/runtime/opal_params.h" +#include "opal/memoryhooks/memory.h" - -mca_rcache_base_module_t* mca_rcache_base_module_create(const char* name) +/** + * Create a module of the rcache component called name and record it, together + * with user_data, on mca_rcache_base_modules. + * + * @param name name of the rcache component to instantiate + * @param user_data opaque pointer stored in the list entry of the new module + * @param resources handed unchanged to the component's rcache_init + * + * @return the new module; NULL if no component matches, rcache_init returns + * NULL, or the leave-pinned memory hook check below fails + */ +mca_rcache_base_module_t* mca_rcache_base_module_create (const char* name, void *user_data, + struct mca_rcache_base_resources_t* resources) { - mca_base_component_list_item_t* cli; mca_rcache_base_component_t* component = NULL; mca_rcache_base_module_t* module = NULL; + mca_base_component_list_item_t *cli; mca_rcache_base_selected_module_t *sm; - bool found = false; OPAL_LIST_FOREACH(cli, &opal_rcache_base_framework.framework_components, mca_base_component_list_item_t) { component = (mca_rcache_base_component_t *) cli->cli_component; if(0 == strcmp(component->rcache_version.mca_component_name, name)) { - found = true; + module = component->rcache_init (resources); break; } } - if (!found) { + if ( NULL == module ) { return NULL; } - module = component->rcache_init(); + sm = OBJ_NEW(mca_rcache_base_selected_module_t); sm->rcache_component = component; sm->rcache_module = module; + sm->user_data = user_data; opal_list_append(&mca_rcache_base_modules, (opal_list_item_t*) sm); + + /* on the very first creation of a module we init the memory + callback */ + if (!mca_rcache_base_used_mem_hooks) { + /* Use the memory hooks if leave_pinned or + * leave_pinned_pipeline is enabled (note that either of these + * leave_pinned variables may have been set by a user MCA + * param or elsewhere in the code base). Yes, we could have + * coded this more succinctly, but this is more clear. Do not + * check memory hooks if the rcache does not provide a + * range invalidation function.
*/ + if ((opal_leave_pinned > 0 || opal_leave_pinned_pipeline) && + module->rcache_invalidate_range) { + if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) == + ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) & + opal_mem_hooks_support_level())) { + opal_mem_hooks_register_release(mca_rcache_base_mem_cb, NULL); + } else { + opal_show_help("help-rcache-base.txt", "leave pinned failed", + true, name, OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), + opal_proc_local_get()->proc_hostname); + /* the caller only gets NULL back, so take the module off the list + * and finalize it here rather than leaving it orphaned until close */ + (void) mca_rcache_base_module_destroy (module); + return NULL; + } + + /* Set this to true so that rcache_base_close knows to + cleanup */ + mca_rcache_base_used_mem_hooks = 1; + } + } + return module; } +/** + * Remove module from mca_rcache_base_modules, call its rcache_finalize (if + * non-NULL) and release the list entry. + * + * @return OPAL_SUCCESS, or OPAL_ERR_NOT_FOUND if module is not on the list + */ +int mca_rcache_base_module_destroy(mca_rcache_base_module_t *module) +{ + mca_rcache_base_selected_module_t *sm, *next; + + OPAL_LIST_FOREACH_SAFE(sm, next, &mca_rcache_base_modules, mca_rcache_base_selected_module_t) { + if (module == sm->rcache_module) { + opal_list_remove_item(&mca_rcache_base_modules, (opal_list_item_t*)sm); + if (NULL != sm->rcache_module->rcache_finalize) { + sm->rcache_module->rcache_finalize(sm->rcache_module); + } + OBJ_RELEASE(sm); + return OPAL_SUCCESS; + } + } + + return OPAL_ERR_NOT_FOUND; +} diff --git a/opal/mca/rcache/base/rcache_base_frame.c b/opal/mca/rcache/base/rcache_base_frame.c index 72334c2ded..84b2a726d3 100644 --- a/opal/mca/rcache/base/rcache_base_frame.c +++ b/opal/mca/rcache/base/rcache_base_frame.c @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -28,7 +29,9 @@ #include "opal/mca/base/base.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" +#include "opal/memoryhooks/memory.h" #include "opal/constants.h" +#include "rcache_base_mem_cb.h" /* * The following file was created by configure.
It contains extern @@ -38,6 +41,24 @@ #include "opal/mca/rcache/base/static-components.h" +int mca_rcache_base_used_mem_hooks; + +/** + * Memory Pool Registration + */ + +static void mca_rcache_base_registration_constructor( mca_rcache_base_registration_t * reg ) +{ + reg->rcache = NULL; + reg->base = NULL; + reg->bound = NULL; + reg->ref_count = 0; + reg->flags = 0; +} + +OBJ_CLASS_INSTANCE(mca_rcache_base_registration_t, opal_free_list_item_t, + mca_rcache_base_registration_constructor, NULL); + /* * Global variables @@ -49,28 +70,35 @@ OBJ_CLASS_INSTANCE(mca_rcache_base_selected_module_t, opal_list_item_t, NULL, NU static int mca_rcache_base_close(void) { - opal_list_item_t *item; - mca_rcache_base_selected_module_t *sm; + opal_list_item_t *item; + mca_rcache_base_selected_module_t *sm; - /* Finalize all the rcache components and free their list items */ + /* Finalize all the rcache components and free their list items */ - for (item = opal_list_remove_first(&mca_rcache_base_modules); - NULL != item; - item = opal_list_remove_first(&mca_rcache_base_modules)) { - sm = (mca_rcache_base_selected_module_t *) item; + while (NULL != (item = opal_list_remove_first(&mca_rcache_base_modules))) { + sm = (mca_rcache_base_selected_module_t *) item; - /* Blatently ignore the return code (what would we do to recover, - anyway? This component is going away, so errors don't matter - anymore). Note that it's legal for the module to have NULL for - the finalize function. */ + /* Blatantly ignore the return code (what would we do to recover, + anyway? This component is going away, so errors don't matter + anymore). Note that it's legal for the module to have NULL for + the finalize function.
*/ - if (NULL != sm->rcache_module->rcache_finalize) { - sm->rcache_module->rcache_finalize(sm->rcache_module); + if (NULL != sm->rcache_module->rcache_finalize) { + sm->rcache_module->rcache_finalize(sm->rcache_module); + } + OBJ_RELEASE(sm); } - OBJ_RELEASE(sm); - } - /* Close all remaining available components */ + /* deregister memory free callback */ + if (mca_rcache_base_used_mem_hooks) { + opal_mem_hooks_unregister_release(mca_rcache_base_mem_cb); + /* clear the flag so that a later open/create cycle registers the + * release callback again instead of assuming it is still installed */ + mca_rcache_base_used_mem_hooks = 0; + } + /* All done */ + + /* Close all remaining available components */ return mca_base_framework_components_close(&opal_rcache_base_framework, NULL); } @@ -80,16 +108,16 @@ static int mca_rcache_base_close(void) */ static int mca_rcache_base_open(mca_base_open_flag_t flags) { - /* Initialize the list so that in mca_rcache_base_close(), we can - iterate over it (even if it's empty, as in the case of the opal_info-tool) */ + /* Initialize the list so that in mca_rcache_base_close(), we can + iterate over it (even if it's empty, as in the case of the opal_info-tool) */ - OBJ_CONSTRUCT(&mca_rcache_base_modules, opal_list_t); + OBJ_CONSTRUCT(&mca_rcache_base_modules, opal_list_t); /* Open up all available components */ return mca_base_framework_components_open(&opal_rcache_base_framework, flags); } -MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Rcache", NULL, +MCA_BASE_FRAMEWORK_DECLARE(opal, rcache, "OPAL Registration Cache", NULL, mca_rcache_base_open, mca_rcache_base_close, mca_rcache_base_static_components, 0); diff --git a/opal/mca/mpool/base/mpool_base_mem_cb.c b/opal/mca/rcache/base/rcache_base_mem_cb.c similarity index 65% rename from opal/mca/mpool/base/mpool_base_mem_cb.c rename to opal/mca/rcache/base/rcache_base_mem_cb.c index 354e376ad8..7f177a3e2f 100644 --- a/opal/mca/mpool/base/mpool_base_mem_cb.c +++ b/opal/mca/rcache/base/rcache_base_mem_cb.c @@ -1,4 +1,4 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The
Trustees of Indiana University and Indiana * University Research and Technology @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ * @@ -32,8 +32,8 @@ #include "opal/util/proc.h" #include "opal/runtime/opal_params.h" -#include "opal/mca/mpool/base/mpool_base_mem_cb.h" -#include "opal/mca/mpool/base/base.h" +#include "opal/mca/rcache/base/rcache_base_mem_cb.h" +#include "opal/mca/rcache/base/base.h" #include "opal/mca/mca.h" #include "opal/memoryhooks/memory.h" @@ -47,43 +47,34 @@ static char msg[512]; * from_alloc==true, then you cannot call malloc (or any of its * friends)! */ -void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata, - bool from_alloc) +void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_alloc) { - mca_mpool_base_selected_module_t* current; + mca_rcache_base_selected_module_t* current; int rc; - opal_list_item_t* item; /* Only do anything meaningful if the OPAL layer is up and running and size != 0 */ - if ((from_alloc && (!opal_initialized)) || - size == 0) { + if ((from_alloc && (!opal_initialized)) || size == 0) { return; } - for(item = opal_list_get_first(&mca_mpool_base_modules); - item != opal_list_get_end(&mca_mpool_base_modules); - item = opal_list_get_next(item)) { - - current = (mca_mpool_base_selected_module_t*) item; - - if(current->mpool_module->mpool_release_memory != NULL) { - rc = current->mpool_module->mpool_release_memory(current->mpool_module, - base, size); - + OPAL_LIST_FOREACH(current, &mca_rcache_base_modules, mca_rcache_base_selected_module_t) { + if (current->rcache_module->rcache_invalidate_range != NULL) { + rc = current->rcache_module->rcache_invalidate_range (current->rcache_module, + base, size); if (rc != 
OPAL_SUCCESS) { if (from_alloc) { int len; - len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in use by an ongoing MPI communication (buffer %p, size %lu). MPI job will now abort.\n", - opal_proc_local_get()->proc_hostname, - getpid(), - base, (unsigned long) size); + len = snprintf(msg, sizeof(msg), "[%s:%d] Attempt to free memory that is still in " + "use by an ongoing MPI communication (buffer %p, size %lu). MPI job " + "will now abort.\n", opal_proc_local_get()->proc_hostname, + getpid(), base, (unsigned long) size); msg[sizeof(msg) - 1] = '\0'; write(2, msg, len); } else { - opal_show_help("help-mpool-base.txt", + opal_show_help("help-rcache-base.txt", "cannot deregister in-use memory", true, - current->mpool_component->mpool_version.mca_component_name, + current->rcache_component->rcache_version.mca_component_name, opal_proc_local_get()->proc_hostname, base, (unsigned long) size); } diff --git a/opal/mca/mpool/base/mpool_base_mem_cb.h b/opal/mca/rcache/base/rcache_base_mem_cb.h similarity index 70% rename from opal/mca/mpool/base/mpool_base_mem_cb.h rename to opal/mca/rcache/base/rcache_base_mem_cb.h index 7484731014..3192aeecb4 100644 --- a/opal/mca/mpool/base/mpool_base_mem_cb.h +++ b/opal/mca/rcache/base/rcache_base_mem_cb.h @@ -1,3 +1,4 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology @@ -9,6 +10,8 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -18,8 +21,8 @@ /** * @file */ -#ifndef MCA_MPOOL_BASE_MEM_CB_H -#define MCA_MPOOL_BASE_MEM_CB_H +#ifndef MCA_RCACHE_BASE_MEM_CB_H +#define MCA_RCACHE_BASE_MEM_CB_H #include "opal_config.h" @@ -28,12 +31,8 @@ BEGIN_C_DECLS /* * memory hook callback, called when memory is free'd out from under us */ -void mca_mpool_base_mem_cb(void* base, size_t size, void* cbdata, - bool from_alloc); +void mca_rcache_base_mem_cb (void* base, size_t size, void* cbdata, bool from_alloc); END_C_DECLS -#endif /* MCA_MPOOL_BASE_MEM_CB_H */ - - - +#endif /* MCA_RCACHE_BASE_MEM_CB_H */ diff --git a/opal/mca/rcache/base/rcache_base_vma.c b/opal/mca/rcache/base/rcache_base_vma.c new file mode 100644 index 0000000000..39f534b07b --- /dev/null +++ b/opal/mca/rcache/base/rcache_base_vma.c @@ -0,0 +1,151 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include MCA_memory_IMPLEMENTATION_HEADER +#include "opal/mca/memory/memory.h" +#include "opal/mca/rcache/rcache.h" +#include "rcache_base_vma.h" +#include "rcache_base_vma_tree.h" + +/** + * Initialize the rcache + */ + +static void mca_rcache_base_vma_module_construct (mca_rcache_base_vma_module_t *vma_module) { + OBJ_CONSTRUCT(&vma_module->vma_lock, opal_recursive_mutex_t); + (void) mca_rcache_base_vma_tree_init (vma_module); +} + +static void mca_rcache_base_vma_module_destruct (mca_rcache_base_vma_module_t *vma_module) { + OBJ_DESTRUCT(&vma_module->vma_lock); + mca_rcache_base_vma_tree_finalize (vma_module); +} + +OBJ_CLASS_INSTANCE(mca_rcache_base_vma_module_t, opal_object_t, + mca_rcache_base_vma_module_construct, + mca_rcache_base_vma_module_destruct); + +mca_rcache_base_vma_module_t *mca_rcache_base_vma_module_alloc (void) +{ + return OBJ_NEW(mca_rcache_base_vma_module_t); +} + +int mca_rcache_base_vma_find (mca_rcache_base_vma_module_t *vma_module, void *addr, + size_t size, mca_rcache_base_registration_t **reg) +{ + int rc; + unsigned char *bound_addr; + + if (size == 0) { + return OPAL_ERROR; + } + + bound_addr = (unsigned char *) ((intptr_t) addr + size - 1); + + /* Check to ensure that the cache is valid */ + if (OPAL_UNLIKELY(opal_memory_changed() && + NULL != opal_memory->memoryc_process && + OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { + return rc; + } + + *reg = mca_rcache_base_vma_tree_find (vma_module, (unsigned char *) addr, bound_addr); + + return OPAL_SUCCESS; +} + +int mca_rcache_base_vma_find_all (mca_rcache_base_vma_module_t *vma_module, void *addr, + size_t size, mca_rcache_base_registration_t **regs, + int reg_cnt) +{ + int rc; + unsigned char *bound_addr; + + if(size == 0) { + return OPAL_ERROR; + } + + bound_addr = (unsigned char *) ((intptr_t) addr + size - 1); + + /* Check to ensure that the cache is valid */ 
+ if (OPAL_UNLIKELY(opal_memory_changed() && + NULL != opal_memory->memoryc_process && + OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { + return rc; + } + + return mca_rcache_base_vma_tree_find_all (vma_module, (unsigned char *) addr, + bound_addr, regs, reg_cnt); +} + +int mca_rcache_base_vma_insert (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t *reg, size_t limit) +{ + size_t reg_size = reg->bound - reg->base + 1; + int rc; + + if (limit != 0 && reg_size > limit) { + /* return out of resources if request is bigger than cache size + * return temp out of resources otherwise */ + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* Check to ensure that the cache is valid */ + if (OPAL_UNLIKELY(opal_memory_changed() && + NULL != opal_memory->memoryc_process && + OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { + return rc; + } + + rc = mca_rcache_base_vma_tree_insert (vma_module, reg, limit); + if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { + /* If we successfully registered, then tell the memory manager + to start monitoring this region */ + opal_memory->memoryc_register (reg->base, (uint64_t) reg_size, + (uint64_t) (uintptr_t) reg); + } + + return rc; +} + +int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t *reg) +{ + /* Tell the memory manager that we no longer care about this + region; bound is inclusive, so use the same length as the insert path */ + opal_memory->memoryc_deregister (reg->base, + (uint64_t) (reg->bound - reg->base + 1), + (uint64_t) (uintptr_t) reg); + return mca_rcache_base_vma_tree_delete (vma_module, reg); +} + +void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, size_t size, char *msg) +{ + mca_rcache_base_vma_tree_dump_range (vma_module, base, size, msg); +} diff --git a/opal/mca/rcache/base/rcache_base_vma.h b/opal/mca/rcache/base/rcache_base_vma.h new file mode 100644 index 0000000000..7cbb010365 --- /dev/null +++ b/opal/mca/rcache/base/rcache_base_vma.h @@ -0,0 +1,74
@@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * Registration cache VMA lookup + */ + +#ifndef MCA_RCACHE_BASE_VMA_H +#define MCA_RCACHE_BASE_VMA_H + +#include "opal_config.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_rb_tree.h" + +BEGIN_C_DECLS + +struct mca_rcache_base_registration_t; + +struct mca_rcache_base_vma_module_t { + opal_object_t super; + opal_rb_tree_t rb_tree; + opal_list_t vma_list; + size_t reg_cur_cache_size; + opal_mutex_t vma_lock; +}; +typedef struct mca_rcache_base_vma_module_t mca_rcache_base_vma_module_t; + +OBJ_CLASS_DECLARATION(mca_rcache_base_vma_module_t); + +mca_rcache_base_vma_module_t *mca_rcache_base_vma_module_alloc (void); + +int mca_rcache_base_vma_find (mca_rcache_base_vma_module_t *vma_module, void *addr, + size_t size, struct mca_rcache_base_registration_t **reg); + +int mca_rcache_base_vma_find_all (mca_rcache_base_vma_module_t *vma_module, void *addr, + size_t size, struct mca_rcache_base_registration_t **regs, + int reg_cnt); + +int mca_rcache_base_vma_insert (mca_rcache_base_vma_module_t *vma_module, + struct mca_rcache_base_registration_t *registration, + size_t limit); + +int 
mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module, + struct mca_rcache_base_registration_t *registration); + +void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, size_t size, char *msg); + +END_C_DECLS + +#endif /* MCA_RCACHE_BASE_VMA_H */ diff --git a/opal/mca/rcache/base/rcache_base_vma_tree.c b/opal/mca/rcache/base/rcache_base_vma_tree.c new file mode 100644 index 0000000000..25c6eecb43 --- /dev/null +++ b/opal/mca/rcache/base/rcache_base_vma_tree.c @@ -0,0 +1,565 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2015 Research Organization for Information Science + * and Technology (RIST). All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal/util/output.h" +#include "rcache_base_vma_tree.h" + +OBJ_CLASS_INSTANCE(mca_rcache_base_vma_reg_list_item_t, opal_list_item_t, NULL, NULL); + +static void mca_rcache_base_vma_item_construct (mca_rcache_base_vma_item_t *vma_item) +{ + OBJ_CONSTRUCT(&vma_item->reg_list, opal_list_t); +} + +static void mca_rcache_base_vma_item_destruct (mca_rcache_base_vma_item_t *vma_item) +{ + OPAL_LIST_DESTRUCT(&vma_item->reg_list); +} + +OBJ_CLASS_INSTANCE(mca_rcache_base_vma_item_t, opal_list_item_t, + mca_rcache_base_vma_item_construct, + mca_rcache_base_vma_item_destruct); + + +/** + * Function for the red black tree to compare 2 keys + * + * @param key1 a pointer to the 1st key + * @param key2 a pointer to the second key + * + * @retval -1 if key1 is below key2 + * @retval 1 if key 1 is above key2 + * @retval 0 if the keys are the same + */ + +static int mca_rcache_base_vma_tree_node_compare(void *key1, void *key2) +{ + mca_rcache_base_vma_item_t *vma1 = (mca_rcache_base_vma_item_t *) key1, + *vma2 = (mca_rcache_base_vma_item_t *) key2; + + if (vma1->start < vma2->start) { + return -1; + } + + if (vma1->start > vma2->start) { + return 1; + } + + return 0; +} + +static int mca_rcache_base_vma_tree_node_compare_search(void *key1, void *key2) +{ + mca_rcache_base_vma_item_t *vma = (mca_rcache_base_vma_item_t *) key2; + uintptr_t addr = (uintptr_t) key1; + + if (vma->end < addr) { + return 1; + } + + if (vma->start <= addr) { + return 0; + } + + return -1; +} + +static int mca_rcache_base_vma_tree_node_compare_closest(void *key1, void *key2) +{ + mca_rcache_base_vma_item_t *vma = (mca_rcache_base_vma_item_t *) key2, *prev_vma; + uintptr_t addr = (uintptr_t) key1; + + if (vma->end < addr) { + return 1; + } + + if (vma->start <= addr) { + return 0; + } + + prev_vma = (mca_rcache_base_vma_item_t *) opal_list_get_prev (&vma->super); + if (prev_vma == (mca_rcache_base_vma_item_t *) 
opal_list_get_end (&vma->vma_module->vma_list) + || prev_vma->end < addr) { + return 0; + } + + return -1; +} + +static inline +mca_rcache_base_vma_item_t *mca_rcache_base_vma_new (mca_rcache_base_vma_module_t *vma_module, + uintptr_t start, uintptr_t end) +{ + mca_rcache_base_vma_item_t *vma_item = OBJ_NEW(mca_rcache_base_vma_item_t); + + if (NULL == vma_item) { + return NULL; + } + + vma_item->start = start; + vma_item->end = end; + vma_item->vma_module = vma_module; + + (void) opal_rb_tree_insert (&vma_module->rb_tree, vma_item, vma_item); + + return vma_item; +} + +static inline int mca_rcache_base_vma_compare_regs (mca_rcache_base_registration_t *reg1, + mca_rcache_base_registration_t *reg2) +{ + /* persistent registrations are on top */ + if ((reg1->flags & MCA_RCACHE_FLAGS_PERSIST) && + !(reg2->flags & MCA_RCACHE_FLAGS_PERSIST)) { + return 1; + } + + if (!(reg1->flags & MCA_RCACHE_FLAGS_PERSIST) && + (reg2->flags & MCA_RCACHE_FLAGS_PERSIST)) { + return -1; + } + + if (reg1->bound != reg2->bound) { + return ((uintptr_t) reg1->bound > (uintptr_t) reg2->bound) ? 1 : -1; /* compare, do not subtract: the difference may not fit in an int */ + } + + /* tie breaker: order by registration address (only the sign of the result is used) */ + return ((uintptr_t) reg1 > (uintptr_t) reg2) - ((uintptr_t) reg1 < (uintptr_t) reg2); +} + +static inline int mca_rcache_base_vma_add_reg (mca_rcache_base_vma_item_t *vma_item, + struct mca_rcache_base_registration_t *reg) +{ + mca_rcache_base_vma_reg_list_item_t *item, *entry; + + entry = OBJ_NEW(mca_rcache_base_vma_reg_list_item_t); + + if (!entry) { + return -1; + } + + entry->reg = reg; + + OPAL_LIST_FOREACH(item, &vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) { + if (mca_rcache_base_vma_compare_regs(item->reg, reg) > 0) { + continue; + } + + opal_list_insert_pos (&vma_item->reg_list, &item->super, &entry->super); + return 0; + } + + opal_list_append (&vma_item->reg_list, &entry->super); + + return 0; +} + +static inline void mca_rcache_base_vma_remove_reg (mca_rcache_base_vma_item_t *vma_item, + struct mca_rcache_base_registration_t *reg) +{ + mca_rcache_base_vma_reg_list_item_t *item; + + OPAL_LIST_FOREACH(item,
&vma_item->reg_list, mca_rcache_base_vma_reg_list_item_t) { + if(item->reg == reg) { + opal_list_remove_item(&vma_item->reg_list, &item->super); + OBJ_RELEASE(item); + break; + } + } +} + +static inline int mca_rcache_base_vma_copy_reg_list (mca_rcache_base_vma_item_t *to, + mca_rcache_base_vma_item_t *from) +{ + + mca_rcache_base_vma_reg_list_item_t *item_f, *item_t; + OPAL_LIST_FOREACH(item_f, &from->reg_list, mca_rcache_base_vma_reg_list_item_t) { + item_t = OBJ_NEW(mca_rcache_base_vma_reg_list_item_t); + + if (NULL == item_t) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + item_t->reg = item_f->reg; + + opal_list_append (&to->reg_list, &item_t->super); + } + + return OPAL_SUCCESS; +} + +/* returns 1 iff two lists contain the same entries */ +static inline int mca_rcache_base_vma_compare_reg_lists (mca_rcache_base_vma_item_t *vma1, + mca_rcache_base_vma_item_t *vma2) +{ + mca_rcache_base_vma_reg_list_item_t *i1, *i2; + + if (!vma1 || !vma2 || opal_list_get_size (&vma1->reg_list) != opal_list_get_size (&vma2->reg_list)) { + return 0; + } + + i2 = (mca_rcache_base_vma_reg_list_item_t *) opal_list_get_first(&vma2->reg_list); + + OPAL_LIST_FOREACH(i1, &vma1->reg_list, mca_rcache_base_vma_reg_list_item_t) { + if ((void *) i2 == (void *) opal_list_get_end (&vma2->reg_list) || i1->reg != i2->reg) { + return 0; + } + + i2 = (mca_rcache_base_vma_reg_list_item_t *) opal_list_get_next (&i2->super); + } + + return 1; +} + +int mca_rcache_base_vma_tree_init (mca_rcache_base_vma_module_t *vma_module) +{ + OBJ_CONSTRUCT(&vma_module->rb_tree, opal_rb_tree_t); + OBJ_CONSTRUCT(&vma_module->vma_list, opal_list_t); + vma_module->reg_cur_cache_size = 0; + return opal_rb_tree_init (&vma_module->rb_tree, mca_rcache_base_vma_tree_node_compare); +} + +void mca_rcache_base_vma_tree_finalize (mca_rcache_base_vma_module_t *vma_module) +{ + opal_rb_tree_init(&vma_module->rb_tree, mca_rcache_base_vma_tree_node_compare); + OBJ_DESTRUCT(&vma_module->vma_list); + 
OBJ_DESTRUCT(&vma_module->rb_tree); +} + +mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, unsigned char *bound) +{ + mca_rcache_base_vma_item_t *vma; + mca_rcache_base_vma_reg_list_item_t *item; + + vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base, + mca_rcache_base_vma_tree_node_compare_search); + if (!vma) { + return NULL; + } + + OPAL_LIST_FOREACH(item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) { + if(item->reg->flags & MCA_RCACHE_FLAGS_INVALID) { + continue; + } + + if(item->reg->bound >= bound) + return item->reg; + if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST)) + break; + } + + return NULL; +} + +static inline bool is_reg_in_array (mca_rcache_base_registration_t **regs, + int cnt, mca_rcache_base_registration_t *p) +{ + for (int i = 0 ; i < cnt ; ++i) { + if (regs[i] == p) { + return true; + } + } + + return false; +} + +int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module, unsigned char *base, + unsigned char *bound, mca_rcache_base_registration_t **regs, + int reg_cnt) +{ + int cnt = 0; + + if(opal_list_get_size(&vma_module->vma_list) == 0) + return cnt; + + do { + mca_rcache_base_vma_item_t *vma; + mca_rcache_base_vma_reg_list_item_t *vma_item; + vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base, + mca_rcache_base_vma_tree_node_compare_closest); + + if (NULL == vma) { + /* base is bigger than any registered memory */ + break; + } + + if (base < (unsigned char *) vma->start) { + base = (unsigned char *) vma->start; + continue; + } + + OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) { + if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) || + is_reg_in_array (regs, cnt, vma_item->reg)) { + continue; + } + regs[cnt++] = vma_item->reg; + if (cnt == reg_cnt) { + return cnt; /* no space left in the provided array */ + } + } + + base 
= (unsigned char *)vma->end + 1; + } while(bound >= base); + + return cnt; +} + +static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit) +{ + return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit); +} + +static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module, + size_t nbytes) +{ + vma_module->reg_cur_cache_size += nbytes; +} + +int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t *reg, size_t limit) +{ + mca_rcache_base_vma_item_t *i; + uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound; + + i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, + (void *) begin, mca_rcache_base_vma_tree_node_compare_closest); + + if (!i) { + i = (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list); + } + + while (begin <= end) { + mca_rcache_base_vma_item_t *vma = NULL; + + if (opal_list_get_end (&vma_module->vma_list) == &i->super) { + if (mca_rcache_base_vma_can_insert (vma_module, end - begin + 1, limit)) { + vma = mca_rcache_base_vma_new(vma_module, begin, end); + } + + if (!vma) { + goto remove; + } + + mca_rcache_base_vma_update_byte_count (vma_module, end - begin + 1); + + opal_list_append(&vma_module->vma_list, &vma->super); + begin = vma->end + 1; + mca_rcache_base_vma_add_reg (vma, reg); + return OPAL_SUCCESS; + } + + if (i->start > begin) { + uintptr_t tend = (i->start <= end) ? 
(i->start - 1) : end; + if (mca_rcache_base_vma_can_insert(vma_module, tend - begin + 1, limit)) { + vma = mca_rcache_base_vma_new(vma_module, begin, tend); + } + + if (!vma) { + goto remove; + } + + mca_rcache_base_vma_update_byte_count (vma_module, tend - begin + 1); + + /* insert before */ + opal_list_insert_pos(&vma_module->vma_list, &i->super, &vma->super); + i = vma; + begin = vma->end + 1; + mca_rcache_base_vma_add_reg (vma, reg); + } else if(i->start == begin) { + if (i->end > end) { + vma = mca_rcache_base_vma_new (vma_module, end + 1, i->end); + if (!vma) { + goto remove; + } + + i->end = end; + + mca_rcache_base_vma_copy_reg_list (vma, i); + + /* add after */ + opal_list_insert_pos (&vma_module->vma_list, + opal_list_get_next (&i->super), + &vma->super); + mca_rcache_base_vma_add_reg (i, reg); + begin = end + 1; + } else { + mca_rcache_base_vma_add_reg(i, reg); + begin = i->end + 1; + } + } else { + vma = mca_rcache_base_vma_new (vma_module, begin, i->end); + + if (!vma) { + goto remove; + } + + i->end = begin - 1; + + mca_rcache_base_vma_copy_reg_list (vma, i); + + /* add after */ + opal_list_insert_pos (&vma_module->vma_list, + opal_list_get_next (&i->super), + &vma->super); + } + + i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super); + } + + return OPAL_SUCCESS; + +remove: + mca_rcache_base_vma_tree_delete (vma_module, reg); + return OPAL_ERR_TEMP_OUT_OF_RESOURCE; +} + +/** + * Function to remove a previously inserted registration from the tree without freeing it + * + * @param reg the registration to remove (looked up by reg->base) + * + * @retval OPAL_SUCCESS + * @retval OPAL_ERROR if no vma in the tree contains reg->base + */ +int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t *reg) +{ + mca_rcache_base_vma_item_t *vma; + + vma = (mca_rcache_base_vma_item_t *) + opal_rb_tree_find_with (&vma_module->rb_tree, reg->base, + mca_rcache_base_vma_tree_node_compare_search); + + if (!vma) { + return OPAL_ERROR; + }
+ + while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list) + && vma->start <= (uintptr_t) reg->bound) { + mca_rcache_base_vma_remove_reg(vma, reg); + + if(opal_list_is_empty(&vma->reg_list)) { + mca_rcache_base_vma_item_t *next = + (mca_rcache_base_vma_item_t *) opal_list_get_next (&vma->super); + opal_rb_tree_delete (&vma_module->rb_tree, vma); + mca_rcache_base_vma_update_byte_count (vma_module, + vma->start - vma->end - 1); + opal_list_remove_item (&vma_module->vma_list, &vma->super); + OBJ_RELEASE(vma); + vma = next; + } else { + int merged; + + do { + mca_rcache_base_vma_item_t *prev = NULL, *next = NULL; + if (opal_list_get_first (&vma_module->vma_list) != &vma->super) { + prev = (mca_rcache_base_vma_item_t *) opal_list_get_prev(vma); + } + + merged = 0; + + if (prev && vma->start == prev->end + 1 && + mca_rcache_base_vma_compare_reg_lists(vma, prev)) { + prev->end = vma->end; + opal_list_remove_item(&vma_module->vma_list, &vma->super); + opal_rb_tree_delete(&vma_module->rb_tree, vma); + OBJ_RELEASE(vma); + vma = prev; + merged = 1; + } + + if (opal_list_get_last (&vma_module->vma_list) != &vma->super) { + next = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma); + } + + if (next && vma->end + 1 == next->start && + mca_rcache_base_vma_compare_reg_lists (vma, next)) { + vma->end = next->end; + opal_list_remove_item(&vma_module->vma_list, &next->super); + opal_rb_tree_delete(&vma_module->rb_tree, next); + OBJ_RELEASE(next); + merged = 1; + } + } while (merged); + + vma = (mca_rcache_base_vma_item_t *) opal_list_get_next (vma); + } + } + + return 0; +} + +/* Dump out rcache entries within a range of memory. Useful for debugging. 
*/ +void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, size_t size, char *msg) +{ + unsigned char * bound = base + size -1; + mca_rcache_base_registration_t *reg; + + if (NULL == msg) { + msg = ""; + } + + opal_output(0, "Dumping rcache entries: %s", msg); + + if(opal_list_is_empty(&vma_module->vma_list)) { + opal_output(0, " rcache is empty"); + return; + } + + do { + mca_rcache_base_vma_item_t *vma; + mca_rcache_base_vma_reg_list_item_t *vma_item; + vma = (mca_rcache_base_vma_item_t *) + opal_rb_tree_find_with (&vma_module->rb_tree, base, + mca_rcache_base_vma_tree_node_compare_closest); + + if (NULL == vma) { + /* base is bigger than any registered memory */ + break; + } + + if (base < (unsigned char *) vma->start) { + base = (unsigned char *) vma->start; + continue; + } + + opal_output(0, " vma: base=%p, bound=%p, size=%lu, number of registrations=%d", + (void *)vma->start, (void *)vma->end, (unsigned long) (vma->end - vma->start + 1), + (int) opal_list_get_size(&vma->reg_list)); + OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) { + reg = vma_item->reg; + opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x", + reg->base, reg->bound, reg->ref_count, reg->flags); + } + base = (unsigned char *)vma->end + 1; + } while (bound >= base); +} diff --git a/opal/mca/rcache/base/rcache_base_vma_tree.h b/opal/mca/rcache/base/rcache_base_vma_tree.h new file mode 100644 index 0000000000..de7b3648a7 --- /dev/null +++ b/opal/mca/rcache/base/rcache_base_vma_tree.h @@ -0,0 +1,109 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2007 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved.
+ * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2009 IBM Corporation. All rights reserved. + * + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * Registration cache VMA tree implementation + */ +#ifndef MCA_RCACHE_BASE_VMA_TREE_H +#define MCA_RCACHE_BASE_VMA_TREE_H + +#include "opal_config.h" + +#include "opal/mca/rcache/rcache.h" +#include "rcache_base_vma.h" + +/* + * Data structures for the tree of allocated memory + */ + +struct mca_rcache_base_vma_reg_list_item_t +{ + opal_list_item_t super; + mca_rcache_base_registration_t *reg; +}; +typedef struct mca_rcache_base_vma_reg_list_item_t mca_rcache_base_vma_reg_list_item_t; +OBJ_CLASS_DECLARATION(mca_rcache_base_vma_reg_list_item_t); + +/** + * The item in the vma_tree itself + */ +struct mca_rcache_base_vma_item_t +{ + opal_list_item_t super; /**< the parent class */ + uintptr_t start; /**< the first address of the memory range */ + uintptr_t end; /**< the last address of the memory range (inclusive) */ + opal_list_t reg_list; /**< list of regs on this vma */ + mca_rcache_base_vma_module_t *vma_module; /**< pointer to the vma module this item belongs to */ +}; +typedef struct mca_rcache_base_vma_item_t mca_rcache_base_vma_item_t; + +OBJ_CLASS_DECLARATION(mca_rcache_base_vma_item_t); + + +/* + * initialize the vma tree + */ +int mca_rcache_base_vma_tree_init (mca_rcache_base_vma_module_t *vma_module); + +/* + * clean up the vma tree + */ +void mca_rcache_base_vma_tree_finalize(mca_rcache_base_vma_module_t *vma_module); + +/** + * Returns a valid registration covering [base, bound], or NULL if none is found + */ +mca_rcache_base_registration_t
*mca_rcache_base_vma_tree_find (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, + unsigned char *bound); +/** + * Stores up to reg_cnt valid registrations overlapping [base, bound] in regs and returns how many were stored + */ +int mca_rcache_base_vma_tree_find_all ( + mca_rcache_base_vma_module_t *vma_module, unsigned char *base, + unsigned char *bound, mca_rcache_base_registration_t **regs, + int reg_cnt); + +/* + * insert an item in the vma tree + */ +int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t* reg, size_t limit); + +/* + * remove an item from the vma tree + */ +int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module, + mca_rcache_base_registration_t *reg); + +/* + * Dump out the contents of the rcache for debugging. + */ +void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_module, + unsigned char *base, size_t size, char *msg); + + +#endif /* MCA_RCACHE_BASE_VMA_TREE_H */ diff --git a/opal/mca/mpool/gpusm/Makefile.am b/opal/mca/rcache/gpusm/Makefile.am similarity index 66% rename from opal/mca/mpool/gpusm/Makefile.am rename to opal/mca/rcache/gpusm/Makefile.am index ce0f64f390..f2a0bdb050 100644 --- a/opal/mca/mpool/gpusm/Makefile.am +++ b/opal/mca/rcache/gpusm/Makefile.am @@ -11,6 +11,8 @@ # All rights reserved. # Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2015 Los Alamos National Security, LLC. All rights +# reserved.
# $COPYRIGHT$ # # Additional copyrights may follow @@ -18,40 +20,40 @@ # $HEADER$ # -AM_CPPFLAGS = $(mpool_gpusm_CPPFLAGS) +AM_CPPFLAGS = $(rcache_gpusm_CPPFLAGS) sources = \ - mpool_gpusm_module.c \ - mpool_gpusm_component.c + rcache_gpusm_module.c \ + rcache_gpusm_component.c if WANT_INSTALL_HEADERS opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = mpool_gpusm.h +opal_HEADERS = rcache_gpusm.h endif # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). -if MCA_BUILD_opal_mpool_gpusm_DSO +if MCA_BUILD_opal_rcache_gpusm_DSO component_noinst = -component_install = mca_mpool_gpusm.la +component_install = mca_rcache_gpusm.la else -component_noinst = libmca_mpool_gpusm.la +component_noinst = libmca_rcache_gpusm.la component_install = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_mpool_gpusm_la_SOURCES = $(sources) -mca_mpool_gpusm_la_LDFLAGS = -module -avoid-version -mca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS) +mca_rcache_gpusm_la_SOURCES = $(sources) +mca_rcache_gpusm_la_LDFLAGS = -module -avoid-version +mca_rcache_gpusm_la_LIBADD = $(rcache_gpusm_LIBS) if OPAL_cuda_support -mca_mpool_gpusm_la_LIBADD += \ +mca_rcache_gpusm_la_LIBADD += \ $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/lib@OPAL_LIB_PREFIX@mca_common_cuda.la endif noinst_LTLIBRARIES = $(component_noinst) -libmca_mpool_gpusm_la_SOURCES = $(sources) -libmca_mpool_gpusm_la_LDFLAGS = -module -avoid-version -libmca_mpool_gpusm_la_LIBADD = $(mpool_gpusm_LIBS) +libmca_rcache_gpusm_la_SOURCES = $(sources) +libmca_rcache_gpusm_la_LDFLAGS = -module -avoid-version +libmca_rcache_gpusm_la_LIBADD = $(rcache_gpusm_LIBS) diff --git a/opal/mca/mpool/gpusm/configure.m4 b/opal/mca/rcache/gpusm/configure.m4 similarity index 71% rename from opal/mca/mpool/gpusm/configure.m4 rename to opal/mca/rcache/gpusm/configure.m4 index 52f0b765c2..2b792d7cc8 100644 --- a/opal/mca/mpool/gpusm/configure.m4 +++ 
b/opal/mca/rcache/gpusm/configure.m4 @@ -1,6 +1,8 @@ # -*- shell-script -*- # # Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2015 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -14,8 +16,8 @@ # the configure sequence by the opal_configure_options.m4 code. # -AC_DEFUN([MCA_opal_mpool_gpusm_CONFIG],[ - AC_CONFIG_FILES([opal/mca/mpool/gpusm/Makefile]) +AC_DEFUN([MCA_opal_rcache_gpusm_CONFIG],[ + AC_CONFIG_FILES([opal/mca/rcache/gpusm/Makefile]) # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], diff --git a/opal/mca/mpool/gpusm/owner.txt b/opal/mca/rcache/gpusm/owner.txt similarity index 100% rename from opal/mca/mpool/gpusm/owner.txt rename to opal/mca/rcache/gpusm/owner.txt diff --git a/opal/mca/rcache/gpusm/rcache_gpusm.h b/opal/mca/rcache/gpusm/rcache_gpusm.h new file mode 100644 index 0000000000..00733f3c00 --- /dev/null +++ b/opal/mca/rcache/gpusm/rcache_gpusm.h @@ -0,0 +1,88 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2015 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_RCACHE_GPUSM_H +#define MCA_RCACHE_GPUSM_H + +#include "opal_config.h" +#include "opal/class/opal_list.h" +#include "opal/mca/rcache/rcache.h" + +BEGIN_C_DECLS + +#define MEMHANDLE_SIZE 8 +#define EVTHANDLE_SIZE 8 +struct mca_rcache_gpusm_registration_t { + mca_rcache_base_registration_t base; + uint64_t memHandle[MEMHANDLE_SIZE]; /* CUipcMemHandle */ + uint64_t evtHandle[EVTHANDLE_SIZE]; /* CUipcEventHandle */ + uintptr_t event; /* CUevent */ +}; +typedef struct mca_rcache_gpusm_registration_t mca_rcache_gpusm_registration_t; +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_rcache_gpusm_registration_t); + +struct mca_rcache_gpusm_component_t { + mca_rcache_base_component_t super; +}; +typedef struct mca_rcache_gpusm_component_t mca_rcache_gpusm_component_t; + +OPAL_DECLSPEC extern mca_rcache_gpusm_component_t mca_rcache_gpusm_component; + +struct mca_rcache_gpusm_module_t { + mca_rcache_base_module_t super; + opal_free_list_t reg_list; +}; typedef struct mca_rcache_gpusm_module_t mca_rcache_gpusm_module_t; + +/* + * Initializes the rcache module. 
+ */ +void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t *rcache); + +/** + * register block of memory + */ +int mca_rcache_gpusm_register(mca_rcache_base_module_t* rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, mca_rcache_base_registration_t **reg); + +/** + * deregister memory + */ +int mca_rcache_gpusm_deregister(mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg); + +/** + * find registration for a given block of memory + */ +int mca_rcache_gpusm_find(struct mca_rcache_base_module_t* rcache, void* addr, + size_t size, mca_rcache_base_registration_t **reg); + +/** + * finalize rcache + */ +void mca_rcache_gpusm_finalize(struct mca_rcache_base_module_t *rcache); + +END_C_DECLS +#endif diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_component.c b/opal/mca/rcache/gpusm/rcache_gpusm_component.c similarity index 76% rename from opal/mca/mpool/gpusm/mpool_gpusm_component.c rename to opal/mca/rcache/gpusm/rcache_gpusm_component.c index 9a444e1466..fed00b62cf 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_component.c +++ b/opal/mca/rcache/gpusm/rcache_gpusm_component.c @@ -26,7 +26,7 @@ #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 #include "opal_config.h" #include "opal/mca/base/base.h" -#include "mpool_gpusm.h" +#include "rcache_gpusm.h" #ifdef HAVE_UNISTD_H #include #endif @@ -40,15 +40,15 @@ static int gpusm_open(void); static int gpusm_close(void); static int gpusm_register(void); -static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t* resources); +static mca_rcache_base_module_t* gpusm_init(struct mca_rcache_base_resources_t* resources); -mca_mpool_gpusm_component_t mca_mpool_gpusm_component = { +mca_rcache_gpusm_component_t mca_rcache_gpusm_component = { { /* First, the mca_base_component_t struct containing meta information about the component itself */ - .mpool_version = { - MCA_MPOOL_BASE_VERSION_2_0_0, + .rcache_version = { + MCA_RCACHE_BASE_VERSION_3_0_0, .mca_component_name = "gpusm", 
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, @@ -57,12 +57,12 @@ mca_mpool_gpusm_component_t mca_mpool_gpusm_component = { .mca_close_component = gpusm_close, .mca_register_component_params = gpusm_register, }, - .mpool_data = { + .rcache_data = { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, - .mpool_init = gpusm_init, + .rcache_init = gpusm_init, } }; @@ -88,16 +88,18 @@ static int gpusm_close(void) } -static mca_mpool_base_module_t* gpusm_init(struct mca_mpool_base_resources_t *resources) +static mca_rcache_base_module_t* gpusm_init(struct mca_rcache_base_resources_t *resources) { - mca_mpool_gpusm_module_t* mpool_module; + mca_rcache_gpusm_module_t* rcache_module; - mpool_module = - (mca_mpool_gpusm_module_t*)malloc(sizeof(mca_mpool_gpusm_module_t)); + (void) resources; - mpool_module->resources = *resources; + rcache_module = (mca_rcache_gpusm_module_t *) calloc (1, sizeof (*rcache_module)); + if (NULL == rcache_module) { + return NULL; + } - mca_mpool_gpusm_module_init(mpool_module); + mca_rcache_gpusm_module_init(rcache_module); - return &mpool_module->super; + return &rcache_module->super; } diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_module.c b/opal/mca/rcache/gpusm/rcache_gpusm_module.c similarity index 59% rename from opal/mca/mpool/gpusm/mpool_gpusm_module.c rename to opal/mca/rcache/gpusm/rcache_gpusm_module.c index 98740bbdcd..caf8913a93 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_module.c +++ b/opal/mca/rcache/gpusm/rcache_gpusm_module.c @@ -38,15 +38,15 @@ */ #include "opal_config.h" -#include "opal/mca/mpool/base/base.h" -#include "opal/mca/mpool/gpusm/mpool_gpusm.h" +#include "opal/mca/rcache/base/base.h" +#include "opal/mca/rcache/gpusm/rcache_gpusm.h" #include "opal/mca/common/cuda/common_cuda.h" /** * Called when the registration free list is created. An event is created * for each entry. 
*/ -static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item ) +static void mca_rcache_gpusm_registration_constructor( mca_rcache_gpusm_registration_t *item ) { mca_common_cuda_construct_event_and_handle(&item->event, (void *)&item->evtHandle); @@ -55,47 +55,34 @@ static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registrati /** * Called when the program is exiting. This destroys the events. */ -static void mca_mpool_gpusm_registration_destructor( mca_mpool_gpusm_registration_t *item ) +static void mca_rcache_gpusm_registration_destructor( mca_rcache_gpusm_registration_t *item ) { mca_common_cuda_destruct_event(item->event); } -OBJ_CLASS_INSTANCE(mca_mpool_gpusm_registration_t, mca_mpool_base_registration_t, - mca_mpool_gpusm_registration_constructor, - mca_mpool_gpusm_registration_destructor); +OBJ_CLASS_INSTANCE(mca_rcache_gpusm_registration_t, mca_rcache_base_registration_t, + mca_rcache_gpusm_registration_constructor, + mca_rcache_gpusm_registration_destructor); /* - * Initializes the mpool module. + * Initializes the rcache module. 
*/ -void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t* mpool) +void mca_rcache_gpusm_module_init(mca_rcache_gpusm_module_t* rcache) { - mpool->super.mpool_component = &mca_mpool_gpusm_component.super; - mpool->super.mpool_base = NULL; - mpool->super.mpool_alloc = NULL; - mpool->super.mpool_realloc = NULL; - mpool->super.mpool_free = NULL; - mpool->super.mpool_register = mca_mpool_gpusm_register; - mpool->super.mpool_find = mca_mpool_gpusm_find; - mpool->super.mpool_deregister = mca_mpool_gpusm_deregister; - mpool->super.mpool_release_memory = NULL; - mpool->super.mpool_finalize = mca_mpool_gpusm_finalize; - mpool->super.mpool_ft_event = mca_mpool_gpusm_ft_event; - mpool->super.rcache = NULL; - mpool->super.flags = 0; + rcache->super.rcache_component = &mca_rcache_gpusm_component.super; + rcache->super.rcache_register = mca_rcache_gpusm_register; + rcache->super.rcache_find = mca_rcache_gpusm_find; + rcache->super.rcache_deregister = mca_rcache_gpusm_deregister; + rcache->super.rcache_finalize = mca_rcache_gpusm_finalize; - mpool->resources.reg_data = NULL; - mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t); - mpool->resources.register_mem = cuda_getmemhandle; - mpool->resources.deregister_mem = cuda_ungetmemhandle; - - OBJ_CONSTRUCT(&mpool->reg_list, opal_free_list_t); + OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); /* Start with 0 entries in the free list since CUDA may not have * been initialized when this free list is created and there is * some CUDA specific activities that need to be done. 
*/ - opal_free_list_init (&mpool->reg_list, mpool->resources.sizeof_reg, + opal_free_list_init (&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), opal_cache_line_size, - OBJ_CLASS(mca_mpool_gpusm_registration_t), + OBJ_CLASS(mca_rcache_gpusm_registration_t), 0,opal_cache_line_size, 0, -1, 64, NULL, 0, NULL, NULL, NULL); @@ -105,11 +92,11 @@ void mca_mpool_gpusm_module_init(mca_mpool_gpusm_module_t* mpool) * Just go ahead and get a new registration. The find and register * functions are the same thing for this memory pool. */ -int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr, +int mca_rcache_gpusm_find(mca_rcache_base_module_t *rcache, void *addr, size_t size, - mca_mpool_base_registration_t **reg) + mca_rcache_base_registration_t **reg) { - return mca_mpool_gpusm_register(mpool, addr, size, 0, 0, reg); + return mca_rcache_gpusm_register(rcache, addr, size, 0, 0, reg); } /* @@ -118,12 +105,12 @@ int mca_mpool_gpusm_find(mca_mpool_base_module_t *mpool, void *addr, * buffer. There is no need to deregister the memory handle so the * deregister function is a no-op. 
*/ -int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr, +int mca_rcache_gpusm_register(mca_rcache_base_module_t *rcache, void *addr, size_t size, uint32_t flags, int32_t access_flags, - mca_mpool_base_registration_t **reg) + mca_rcache_base_registration_t **reg) { - mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t*)mpool; - mca_mpool_base_registration_t *gpusm_reg; + mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t*)rcache; + mca_rcache_base_registration_t *gpusm_reg; opal_free_list_item_t *item; unsigned char *base, *bound; int rc; @@ -137,22 +124,22 @@ int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr, base = addr; bound = (unsigned char *)addr + size - 1; - item = opal_free_list_get (&mpool_gpusm->reg_list); + item = opal_free_list_get (&rcache_gpusm->reg_list); if(NULL == item) { return OPAL_ERR_OUT_OF_RESOURCE; } - gpusm_reg = (mca_mpool_base_registration_t*)item; + gpusm_reg = (mca_rcache_base_registration_t*)item; - gpusm_reg->mpool = mpool; + gpusm_reg->rcache = rcache; gpusm_reg->base = base; gpusm_reg->bound = bound; gpusm_reg->flags = flags; gpusm_reg->access_flags = access_flags; - rc = mpool_gpusm->resources.register_mem(base, size, gpusm_reg, NULL); + rc = cuda_getmemhandle (base, size, gpusm_reg, NULL); if(rc != OPAL_SUCCESS) { - opal_free_list_return (&mpool_gpusm->reg_list, item); + opal_free_list_return (&rcache_gpusm->reg_list, item); return rc; } @@ -165,36 +152,32 @@ int mca_mpool_gpusm_register(mca_mpool_base_module_t *mpool, void *addr, /* * Return the registration to the free list. 
*/ -int mca_mpool_gpusm_deregister(struct mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg) +int mca_rcache_gpusm_deregister(struct mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg) { int rc; - mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool; + mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t *)rcache; - rc = mpool_gpusm->resources.deregister_mem(mpool, reg); - opal_free_list_return (&mpool_gpusm->reg_list, (opal_free_list_item_t *) reg); + rc = cuda_ungetmemhandle (NULL, reg); + opal_free_list_return (&rcache_gpusm->reg_list, (opal_free_list_item_t *) reg); return OPAL_SUCCESS; } /** * Free up the resources. */ -void mca_mpool_gpusm_finalize(struct mca_mpool_base_module_t *mpool) +void mca_rcache_gpusm_finalize(struct mca_rcache_base_module_t *rcache) { opal_free_list_item_t *item; - mca_mpool_gpusm_module_t *mpool_gpusm = (mca_mpool_gpusm_module_t *)mpool; + mca_rcache_gpusm_module_t *rcache_gpusm = (mca_rcache_gpusm_module_t *)rcache; /* Need to run the destructor on each item in the free list explicitly. * The destruction of the free list only runs the destructor on the * main free list, not each item. */ - while (NULL != (item = (opal_free_list_item_t *)opal_lifo_pop(&(mpool_gpusm->reg_list.super)))) { + while (NULL != (item = (opal_free_list_item_t *)opal_lifo_pop(&(rcache_gpusm->reg_list.super)))) { OBJ_DESTRUCT(item); } - OBJ_DESTRUCT(&mpool_gpusm->reg_list); + OBJ_DESTRUCT(&rcache_gpusm->reg_list); return; } - -int mca_mpool_gpusm_ft_event(int state) { - return OPAL_SUCCESS; -} diff --git a/opal/mca/mpool/grdma/Makefile.am b/opal/mca/rcache/grdma/Makefile.am similarity index 68% rename from opal/mca/mpool/grdma/Makefile.am rename to opal/mca/rcache/grdma/Makefile.am index 7f29b2eab7..b10cf7ada7 100644 --- a/opal/mca/mpool/grdma/Makefile.am +++ b/opal/mca/rcache/grdma/Makefile.am @@ -10,7 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. 
# All rights reserved. # Copyright (c) 2010-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2012 Los Alamos National Security, LLC. All rights +# Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights # reserved. # $COPYRIGHT$ # @@ -19,40 +19,40 @@ # $HEADER$ # -AM_CPPFLAGS = $(mpool_grdma_CPPFLAGS) +AM_CPPFLAGS = $(rcache_grdma_CPPFLAGS) sources = \ - mpool_grdma_module.c \ - mpool_grdma_component.c + rcache_grdma_module.c \ + rcache_grdma_component.c if WANT_INSTALL_HEADERS opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = mpool_grdma.h +opal_HEADERS = rcache_grdma.h endif # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). -if MCA_BUILD_opal_mpool_grdma_DSO +if MCA_BUILD_opal_rcache_grdma_DSO component_noinst = -component_install = mca_mpool_grdma.la +component_install = mca_rcache_grdma.la else -component_noinst = libmca_mpool_grdma.la +component_noinst = libmca_rcache_grdma.la component_install = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_mpool_grdma_la_SOURCES = $(sources) -mca_mpool_grdma_la_LDFLAGS = -module -avoid-version -mca_mpool_grdma_la_LIBADD = $(mpool_grdma_LIBS) +mca_rcache_grdma_la_SOURCES = $(sources) +mca_rcache_grdma_la_LDFLAGS = -module -avoid-version +mca_rcache_grdma_la_LIBADD = $(rcache_grdma_LIBS) if OPAL_cuda_support -mca_mpool_grdma_la_LIBADD += \ +mca_rcache_grdma_la_LIBADD += \ $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/lib@OPAL_LIB_PREFIX@mca_common_cuda.la endif noinst_LTLIBRARIES = $(component_noinst) -libmca_mpool_grdma_la_SOURCES = $(sources) -libmca_mpool_grdma_la_LDFLAGS = -module -avoid-version -libmca_mpool_grdma_la_LIBADD = $(mpool_grdma_LIBS) +libmca_rcache_grdma_la_SOURCES = $(sources) +libmca_rcache_grdma_la_LDFLAGS = -module -avoid-version +libmca_rcache_grdma_la_LIBADD = $(rcache_grdma_LIBS) diff --git a/opal/mca/mpool/grdma/owner.txt 
b/opal/mca/rcache/grdma/owner.txt similarity index 100% rename from opal/mca/mpool/grdma/owner.txt rename to opal/mca/rcache/grdma/owner.txt diff --git a/opal/mca/rcache/grdma/rcache_grdma.h b/opal/mca/rcache/grdma/rcache_grdma.h new file mode 100644 index 0000000000..ec80f6c29c --- /dev/null +++ b/opal/mca/rcache/grdma/rcache_grdma.h @@ -0,0 +1,82 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + */ +#ifndef MCA_RCACHE_GRDMA_H +#define MCA_RCACHE_GRDMA_H + +#include "opal_config.h" +#include "opal/class/opal_list.h" +#include "opal/mca/event/event.h" +#include "opal/mca/rcache/rcache.h" +#if HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif + +BEGIN_C_DECLS + +struct mca_rcache_grdma_cache_t { + opal_list_item_t super; + char *cache_name; + opal_list_t lru_list; + opal_list_t gc_list; + mca_rcache_base_vma_module_t *vma_module; +}; +typedef struct mca_rcache_grdma_cache_t mca_rcache_grdma_cache_t; + +OBJ_CLASS_DECLARATION(mca_rcache_grdma_cache_t); + +struct mca_rcache_grdma_component_t { + mca_rcache_base_component_t super; + opal_list_t caches; + char *rcache_name; + bool print_stats; + int leave_pinned; +}; +typedef struct mca_rcache_grdma_component_t mca_rcache_grdma_component_t; + +OPAL_DECLSPEC extern mca_rcache_grdma_component_t mca_rcache_grdma_component; + +struct mca_rcache_grdma_module_t; + +struct mca_rcache_grdma_module_t { + mca_rcache_base_module_t super; + struct mca_rcache_base_resources_t resources; + mca_rcache_grdma_cache_t *cache; + opal_free_list_t reg_list; + uint32_t stat_cache_hit; + uint32_t stat_cache_miss; + uint32_t stat_evicted; + uint32_t stat_cache_found; + uint32_t stat_cache_notfound; +}; +typedef struct mca_rcache_grdma_module_t mca_rcache_grdma_module_t; + +/* + * Initializes the rcache module.
+ */ +void mca_rcache_grdma_module_init(mca_rcache_grdma_module_t *rcache, mca_rcache_grdma_cache_t *cache); + +END_C_DECLS +#endif diff --git a/opal/mca/mpool/grdma/mpool_grdma_component.c b/opal/mca/rcache/grdma/rcache_grdma_component.c similarity index 54% rename from opal/mca/mpool/grdma/mpool_grdma_component.c rename to opal/mca/rcache/grdma/rcache_grdma_component.c index 2782faa422..08760effaf 100644 --- a/opal/mca/mpool/grdma/mpool_grdma_component.c +++ b/opal/mca/rcache/grdma/rcache_grdma_component.c @@ -26,7 +26,7 @@ #include "opal_config.h" #include "opal/mca/base/base.h" #include "opal/runtime/opal_params.h" -#include "mpool_grdma.h" +#include "rcache_grdma.h" #ifdef HAVE_UNISTD_H #include <unistd.h> #endif @@ -39,16 +39,16 @@ static int grdma_open(void); static int grdma_close(void); static int grdma_register(void); -static mca_mpool_base_module_t* grdma_init( - struct mca_mpool_base_resources_t* resources); +static mca_rcache_base_module_t* grdma_init( + struct mca_rcache_base_resources_t* resources); -mca_mpool_grdma_component_t mca_mpool_grdma_component = { +mca_rcache_grdma_component_t mca_rcache_grdma_component = { { /* First, the mca_base_component_t struct containing meta information about the component itself */ - .mpool_version = { - MCA_MPOOL_BASE_VERSION_2_0_0, + .rcache_version = { + MCA_RCACHE_BASE_VERSION_3_0_0, .mca_component_name = "grdma", MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, @@ -57,12 +57,12 @@ mca_mpool_grdma_component_t mca_mpool_grdma_component = { .mca_close_component = grdma_close, .mca_register_component_params = grdma_register, }, - .mpool_data = { + .rcache_data = { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, - .mpool_init = grdma_init, + .rcache_init = grdma_init, } }; @@ -71,7 +71,7 @@ mca_mpool_grdma_component_t mca_mpool_grdma_component = { */ static int grdma_open(void) { - OBJ_CONSTRUCT(&mca_mpool_grdma_component.pools, opal_list_t); +
OBJ_CONSTRUCT(&mca_rcache_grdma_component.caches, opal_list_t); return OPAL_SUCCESS; } @@ -79,22 +79,13 @@ static int grdma_open(void) static int grdma_register(void) { - mca_mpool_grdma_component.rcache_name = "vma"; - (void) mca_base_component_var_register(&mca_mpool_grdma_component.super.mpool_version, - "rcache_name", - "The name of the registration cache the mpool should use", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_grdma_component.rcache_name); - - mca_mpool_grdma_component.print_stats = false; - (void) mca_base_component_var_register(&mca_mpool_grdma_component.super.mpool_version, - "print_stats", "print pool usage statistics at the end of the run", + mca_rcache_grdma_component.print_stats = false; + (void) mca_base_component_var_register(&mca_rcache_grdma_component.super.rcache_version, + "print_stats", "print registration cache usage statistics at the end of the run", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_grdma_component.print_stats); + &mca_rcache_grdma_component.print_stats); return OPAL_SUCCESS; } @@ -102,56 +93,48 @@ static int grdma_register(void) static int grdma_close(void) { - OBJ_DESTRUCT(&mca_mpool_grdma_component.pools); - + OPAL_LIST_DESTRUCT(&mca_rcache_grdma_component.caches); return OPAL_SUCCESS; } -static mca_mpool_base_module_t * -grdma_init(struct mca_mpool_base_resources_t *resources) +static mca_rcache_base_module_t * +grdma_init(struct mca_rcache_base_resources_t *resources) { - mca_mpool_grdma_module_t* mpool_module; - mca_mpool_grdma_pool_t *pool = NULL; - opal_list_item_t *item; + mca_rcache_grdma_module_t *rcache_module; + mca_rcache_grdma_cache_t *cache = NULL, *item, *next; /* Set this here (vs in component.c) because opal_leave_pinned* may have been set after MCA params were read (e.g., by the openib btl) */ - mca_mpool_grdma_component.leave_pinned = (int) + mca_rcache_grdma_component.leave_pinned = (int) (1 == 
opal_leave_pinned || opal_leave_pinned_pipeline); /* find the specified pool */ - for (item = opal_list_get_first (&mca_mpool_grdma_component.pools) ; - item != opal_list_get_end (&mca_mpool_grdma_component.pools) ; - item = opal_list_get_next (item)) { - pool = (mca_mpool_grdma_pool_t *) item; - - if (0 == strcmp (pool->pool_name, resources->pool_name)) { + OPAL_LIST_FOREACH_SAFE(item, next, &mca_rcache_grdma_component.caches, mca_rcache_grdma_cache_t) { + if (0 == strcmp (item->cache_name, resources->cache_name)) { /* match on the entry being visited; cache is still NULL here */ + cache = item; break; } - - pool = NULL; } - if (NULL == pool) { - /* create new pool */ - pool = OBJ_NEW(mca_mpool_grdma_pool_t); - if (NULL == pool) { + if (NULL == cache) { + /* create new cache */ + cache = OBJ_NEW(mca_rcache_grdma_cache_t); + if (NULL == cache) { return NULL; } - pool->pool_name = strdup (resources->pool_name); + cache->cache_name = strdup (resources->cache_name); - opal_list_append (&mca_mpool_grdma_component.pools, &pool->super); + opal_list_append (&mca_rcache_grdma_component.caches, &cache->super); } - mpool_module = - (mca_mpool_grdma_module_t *) malloc (sizeof (mca_mpool_grdma_module_t)); + rcache_module = (mca_rcache_grdma_module_t *) malloc (sizeof (*rcache_module)); - mpool_module->resources = *resources; + rcache_module->resources = *resources; - mca_mpool_grdma_module_init(mpool_module, pool); + mca_rcache_grdma_module_init (rcache_module, cache); - return &mpool_module->super; + return &rcache_module->super; } diff --git a/opal/mca/rcache/grdma/rcache_grdma_module.c b/opal/mca/rcache/grdma/rcache_grdma_module.c new file mode 100644 index 0000000000..d0789b60fe --- /dev/null +++ b/opal/mca/rcache/grdma/rcache_grdma_module.c @@ -0,0 +1,513 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved.
+ * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 +#include "opal_config.h" + +#include +#include +#include + +#include "opal/align.h" + +#include "opal/util/proc.h" +#if OPAL_CUDA_GDR_SUPPORT +#include "opal/mca/common/cuda/common_cuda.h" +#endif /* OPAL_CUDA_GDR_SUPPORT */ +#include "opal/mca/rcache/rcache.h" +#include "opal/mca/rcache/base/base.h" + +#include "opal/util/sys_limits.h" +#include "opal/align.h" +#include "rcache_grdma.h" + + +static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, + mca_rcache_base_registration_t **reg); +static int mca_rcache_grdma_deregister (mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg); +static int mca_rcache_grdma_find (mca_rcache_base_module_t *rcache, void *addr, + size_t size, mca_rcache_base_registration_t **reg); +static int mca_rcache_grdma_invalidate_range (mca_rcache_base_module_t *rcache, void *base, + size_t size); +static void mca_rcache_grdma_finalize (mca_rcache_base_module_t *rcache); +static bool mca_rcache_grdma_evict (mca_rcache_base_module_t *rcache); + +static inline bool 
registration_is_cacheable(mca_rcache_base_registration_t *reg) +{ + return (mca_rcache_grdma_component.leave_pinned && + !(reg->flags & + (MCA_RCACHE_FLAGS_CACHE_BYPASS | + MCA_RCACHE_FLAGS_PERSIST | + MCA_RCACHE_FLAGS_INVALID))); +} + +#if OPAL_CUDA_GDR_SUPPORT +static int check_for_cuda_freed_memory(mca_rcache_base_module_t *rcache, void *addr, size_t size); +#endif /* OPAL_CUDA_GDR_SUPPORT */ +static void mca_rcache_grdma_cache_contructor (mca_rcache_grdma_cache_t *cache) +{ + memset ((void *)((uintptr_t)cache + sizeof (cache->super)), 0, sizeof (*cache) - sizeof (cache->super)); + + OBJ_CONSTRUCT(&cache->lru_list, opal_list_t); + OBJ_CONSTRUCT(&cache->gc_list, opal_list_t); + + cache->vma_module = mca_rcache_base_vma_module_alloc (); +} + +static void mca_rcache_grdma_cache_destructor (mca_rcache_grdma_cache_t *cache) +{ + OBJ_DESTRUCT(&cache->lru_list); + OBJ_DESTRUCT(&cache->gc_list); + + free (cache->cache_name); +} + +OBJ_CLASS_INSTANCE(mca_rcache_grdma_cache_t, opal_list_item_t, + mca_rcache_grdma_cache_contructor, + mca_rcache_grdma_cache_destructor); + +/* + * Initializes the rcache module. 
+ */ +void mca_rcache_grdma_module_init(mca_rcache_grdma_module_t* rcache, mca_rcache_grdma_cache_t *cache) +{ + OBJ_RETAIN(cache); + rcache->cache = cache; + + rcache->super.rcache_component = &mca_rcache_grdma_component.super; + rcache->super.rcache_register = mca_rcache_grdma_register; + rcache->super.rcache_find = mca_rcache_grdma_find; + rcache->super.rcache_deregister = mca_rcache_grdma_deregister; + rcache->super.rcache_invalidate_range = mca_rcache_grdma_invalidate_range; + rcache->super.rcache_finalize = mca_rcache_grdma_finalize; + rcache->super.rcache_evict = mca_rcache_grdma_evict; + + rcache->stat_cache_hit = rcache->stat_cache_miss = rcache->stat_evicted = 0; + rcache->stat_cache_found = rcache->stat_cache_notfound = 0; + + OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); + opal_free_list_init (&rcache->reg_list, rcache->resources.sizeof_reg, + opal_cache_line_size, + OBJ_CLASS(mca_rcache_base_registration_t), + 0, opal_cache_line_size, 0, -1, 32, NULL, 0, + NULL, NULL, NULL); +} + +static inline int dereg_mem(mca_rcache_base_registration_t *reg) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) reg->rcache; + int rc; + + if(!(reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) { + mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, reg); + } + + rc = rcache_grdma->resources.deregister_mem (rcache_grdma->resources.reg_data, reg); + if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { + opal_free_list_return (&rcache_grdma->reg_list, + (opal_free_list_item_t *) reg); + } + + return rc; +} + +/* This function must be called with the rcache lock held */ +static inline void do_unregistration_gc (mca_rcache_base_module_t *rcache) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache; + opal_list_item_t *item; + + /* Remove registration from garbage collection list + before deregistering it */ + while (NULL != + (item = opal_list_remove_first(&rcache_grdma->cache->gc_list))) { + 
dereg_mem((mca_rcache_base_registration_t *) item); + } +} + +static inline bool mca_rcache_grdma_evict_lru_local (mca_rcache_grdma_cache_t *cache) +{ + mca_rcache_grdma_module_t *rcache_grdma; + mca_rcache_base_registration_t *old_reg; + + old_reg = (mca_rcache_base_registration_t *) + opal_list_remove_first (&cache->lru_list); + if (NULL == old_reg) { + return false; + } + + rcache_grdma = (mca_rcache_grdma_module_t *) old_reg->rcache; + + (void) dereg_mem (old_reg); + + rcache_grdma->stat_evicted++; + + return true; +} + +static bool mca_rcache_grdma_evict (mca_rcache_base_module_t *rcache) +{ + return mca_rcache_grdma_evict_lru_local (((mca_rcache_grdma_module_t *) rcache)->cache); +} + +/* + * register memory + */ +static int mca_rcache_grdma_register (mca_rcache_base_module_t *rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, + mca_rcache_base_registration_t **reg) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache; + const bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS); + const bool persist = !!(flags & MCA_RCACHE_FLAGS_PERSIST); + mca_rcache_base_registration_t *grdma_reg; + opal_free_list_item_t *item; + unsigned char *base, *bound; + unsigned int page_size = opal_getpagesize (); + int rc; + + OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock); + + *reg = NULL; + + /* if cache bypass is requested don't use the cache */ + base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *); + bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1; + if (!opal_list_is_empty (&rcache_grdma->cache->gc_list)) + do_unregistration_gc(rcache); + +#if OPAL_CUDA_GDR_SUPPORT + if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) { + size_t psize; + mca_common_cuda_get_address_range(&base, &psize, addr); + bound = base + psize - 1; + /* Check to see if this memory is in the cache and if it has been freed. If so, + * this call will boot it out of the cache. 
*/ check_for_cuda_freed_memory(rcache, base, psize); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ + + /* look through existing regs if not persistent registration requested. + * Persistent registrations are always registered and placed in the cache */ + if(!(bypass_cache || persist)) { + /* check to see if memory is registered */ + mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, base, bound - base + 1, &grdma_reg); + if (grdma_reg && !(flags & MCA_RCACHE_FLAGS_INVALID)) { + if (OPAL_UNLIKELY((access_flags & grdma_reg->access_flags) != access_flags)) { + access_flags |= grdma_reg->access_flags; + + if (0 != grdma_reg->ref_count) { + if (!(grdma_reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) { + mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg); + } + + /* mark the registration to go away when it is deregistered */ + grdma_reg->flags |= MCA_RCACHE_FLAGS_INVALID | MCA_RCACHE_FLAGS_CACHE_BYPASS; + } else { + if (registration_is_cacheable (grdma_reg)) { + /* pull the item out of the lru */ + opal_list_remove_item (&rcache_grdma->cache->lru_list, (opal_list_item_t *) grdma_reg); + } + + (void) dereg_mem (grdma_reg); + } + } else { + *reg = grdma_reg; + if (0 == grdma_reg->ref_count) { + /* Leave pinned must be set for this to still be in the rcache. */ + opal_list_remove_item(&rcache_grdma->cache->lru_list, + (opal_list_item_t *) grdma_reg); + } + + /* This segment fits fully within an existing segment. */ + rcache_grdma->stat_cache_hit++; + grdma_reg->ref_count++; + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + return OPAL_SUCCESS; + } + } + + rcache_grdma->stat_cache_miss++; + + /* Unless explicitly requested by the caller always store the + * registration in the rcache. This will speed up the case where + * no leave pinned protocol is in use but the same segment is in + * use in multiple simultaneous transactions. We used to set bypass_cache + * here if !mca_rcache_grdma_component.leave_pinned.
*/ + } + + item = opal_free_list_get (&rcache_grdma->reg_list); + if(NULL == item) { + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + grdma_reg = (mca_rcache_base_registration_t*)item; + + grdma_reg->rcache = rcache; + grdma_reg->base = base; + grdma_reg->bound = bound; + grdma_reg->flags = flags; + grdma_reg->access_flags = access_flags; +#if OPAL_CUDA_GDR_SUPPORT + if (flags & MCA_RCACHE_FLAGS_CUDA_GPU_MEM) { + mca_common_cuda_get_buffer_id(grdma_reg); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ + + if (false == bypass_cache) { + rc = mca_rcache_base_vma_insert (rcache_grdma->cache->vma_module, grdma_reg, 0); + + if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + opal_free_list_return (&rcache_grdma->reg_list, item); + return rc; + } + } + + while (OPAL_ERR_OUT_OF_RESOURCE == + (rc = rcache_grdma->resources.register_mem(rcache_grdma->resources.reg_data, + base, bound - base + 1, grdma_reg))) { + /* try to remove one unused reg and retry */ + if (!mca_rcache_grdma_evict (rcache)) { + break; + } + } + + if (OPAL_UNLIKELY(rc != OPAL_SUCCESS)) { + if (false == bypass_cache) { + mca_rcache_base_vma_delete (rcache_grdma->cache->vma_module, grdma_reg); + } + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + opal_free_list_return (&rcache_grdma->reg_list, item); + return rc; + } + + *reg = grdma_reg; + (*reg)->ref_count++; + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + + return OPAL_SUCCESS; +} + +static int mca_rcache_grdma_find (mca_rcache_base_module_t *rcache, void *addr, + size_t size, mca_rcache_base_registration_t **reg) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache; + unsigned long page_size = opal_getpagesize (); + unsigned char *base, *bound; + int rc; + + base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *); + bound = OPAL_ALIGN_PTR((intptr_t) addr + size - 1, page_size, 
unsigned char *); + + OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock); + + rc = mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, base, bound - base + 1, reg); + if(NULL != *reg && + (mca_rcache_grdma_component.leave_pinned || + ((*reg)->flags & MCA_RCACHE_FLAGS_PERSIST) || + ((*reg)->base == base && (*reg)->bound == bound))) { + assert(((void*)(*reg)->bound) >= addr); + if(0 == (*reg)->ref_count && + mca_rcache_grdma_component.leave_pinned) { + opal_list_remove_item(&rcache_grdma->cache->lru_list, + (opal_list_item_t*)(*reg)); + } + rcache_grdma->stat_cache_found++; + (*reg)->ref_count++; + } else { + rcache_grdma->stat_cache_notfound++; + } + + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + + return rc; +} + +static int mca_rcache_grdma_deregister (mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache; + int rc = OPAL_SUCCESS; + assert(reg->ref_count > 0); + + OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock); + reg->ref_count--; + if(reg->ref_count > 0) { + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + return OPAL_SUCCESS; + } + + if (registration_is_cacheable(reg)) { + opal_list_append(&rcache_grdma->cache->lru_list, (opal_list_item_t *) reg); + } else { + rc = dereg_mem (reg); + } + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + + return rc; +} + +#define GRDMA_RCACHE_NREGS 100 + +static int mca_rcache_grdma_invalidate_range (mca_rcache_base_module_t *rcache, + void *base, size_t size) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache; + mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS]; + int reg_cnt, i, rc = OPAL_SUCCESS; + + OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock); + do { + reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, base, + size, regs, GRDMA_RCACHE_NREGS); + + for(i = 0 ; i < 
reg_cnt ; ++i) { + regs[i]->flags |= MCA_RCACHE_FLAGS_INVALID; + if (regs[i]->ref_count) { + /* memory is being freed, but there are registration in use that + * covers the memory. This can happen even in a correct program, + * but may also be an user error. We can't tell. Mark the + * registration as invalid. It will not be used any more and + * will be unregistered when ref_count will become zero */ + rc = OPAL_ERROR; /* tell caller that something was wrong */ + } else { + opal_list_remove_item(&rcache_grdma->cache->lru_list,(opal_list_item_t *) regs[i]); + opal_list_append(&rcache_grdma->cache->gc_list, (opal_list_item_t *) regs[i]); + } + } + } while (reg_cnt == GRDMA_RCACHE_NREGS); + + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + + return rc; +} + +/* Make sure this registration request is not stale. In other words, ensure + * that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do + * kick out the registrations and deregister. This function needs to be called + * with the rcache->vma_module->vma_lock held. */ +#if OPAL_CUDA_GDR_SUPPORT +static int check_for_cuda_freed_memory (mca_rcache_base_module_t *rcache, void *addr, size_t size) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache; + mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS]; + int reg_cnt, i, rc = OPAL_SUCCESS; + mca_rcache_base_registration_t *reg; + + mca_rcache_base_vma_find (rcache_grdma->cache->vma_module, addr, size, &reg); + if (NULL == reg) { + return OPAL_SUCCESS; + } + + /* If not previously freed memory, just return 0 */ + if (!(mca_common_cuda_previously_freed_memory(reg))) { + return OPAL_SUCCESS; + } + + /* rcache->vma_module->rcache_dump_range(rcache->rcache, 0, (size_t)-1, "Before free"); */ + + /* This memory has been freed.
Find all registrations and delete */ + do { + reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, reg->base, + reg->bound - reg->base + 1, regs, + GRDMA_RCACHE_NREGS); + for(i = 0 ; i < reg_cnt ; ++i) { + regs[i]->flags |= MCA_RCACHE_FLAGS_INVALID; + if (regs[i]->ref_count) { + opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d", + regs[i]->ref_count, regs[i]->base, regs[i]->bound, + (int) (regs[i]->bound - regs[i]->base + 1)); + /* memory is being freed, but there are registration in use that + * covers the memory. This can happen even in a correct program, + * but may also be an user error. We can't tell. Mark the + * registration as invalid. It will not be used any more and + * will be unregistered when ref_count will become zero */ + rc = OPAL_ERROR; /* tell caller that something was wrong */ + } else { + opal_list_remove_item(&rcache_grdma->cache->lru_list,(opal_list_item_t *) regs[i]); + /* Now deregister. Do not use gc_list as we need to kick this out now. 
*/ + dereg_mem(regs[i]); + } + } + } while(reg_cnt == GRDMA_RCACHE_NREGS); + + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + /* rcache->rcache->rcache_dump_range(rcache->rcache, 0, (size_t)-1, "After free");*/ + + return rc; +} +#endif /* OPAL_CUDA_GDR_SUPPORT */ + +static void mca_rcache_grdma_finalize (mca_rcache_base_module_t *rcache) +{ + mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t*)rcache; + mca_rcache_base_registration_t *regs[GRDMA_RCACHE_NREGS]; + int reg_cnt, i; + + /* Statistic */ + if (true == mca_rcache_grdma_component.print_stats) { + opal_output(0, "%s grdma: stats " + "(hit/miss/found/not found/evicted): %d/%d/%d/%d/%d\n", + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), + rcache_grdma->stat_cache_hit, rcache_grdma->stat_cache_miss, + rcache_grdma->stat_cache_found, rcache_grdma->stat_cache_notfound, + rcache_grdma->stat_evicted); + } + + OPAL_THREAD_LOCK(&rcache_grdma->cache->vma_module->vma_lock); + + do_unregistration_gc(rcache); + + do { + reg_cnt = mca_rcache_base_vma_find_all (rcache_grdma->cache->vma_module, 0, (size_t)-1, + regs, GRDMA_RCACHE_NREGS); + + for (i = 0 ; i < reg_cnt ; ++i) { + if (regs[i]->ref_count) { + regs[i]->ref_count = 0; /* otherwise dereg will fail on assert */ + } else if (mca_rcache_grdma_component.leave_pinned) { + opal_list_remove_item(&rcache_grdma->cache->lru_list, + (opal_list_item_t *) regs[i]); + } + + (void) dereg_mem(regs[i]); + } + } while (reg_cnt == GRDMA_RCACHE_NREGS); + + OBJ_RELEASE(rcache_grdma->cache); + + OBJ_DESTRUCT(&rcache_grdma->reg_list); + OPAL_THREAD_UNLOCK(&rcache_grdma->cache->vma_module->vma_lock); + + OBJ_RELEASE(rcache_grdma->cache->vma_module); + + /* this rcache was allocated by grdma_init in rcache_grdma_component.c */ + free(rcache); +} diff --git a/opal/mca/rcache/rcache.h b/opal/mca/rcache/rcache.h index 729b900536..ed8eabc5d2 100644 --- a/opal/mca/rcache/rcache.h +++ b/opal/mca/rcache/rcache.h @@ -30,40 +30,133 @@ #include "opal/threads/mutex.h" +/* 
forward-declaration of rcache module structure */ +struct mca_rcache_base_module_t; +typedef struct mca_rcache_base_module_t mca_rcache_base_module_t; + +enum { + /** bypass the cache when registering */ + MCA_RCACHE_FLAGS_CACHE_BYPASS = 0x0001, + /** persistent registration */ + MCA_RCACHE_FLAGS_PERSIST = 0x0002, + /** registration requires strong ordering (disables relaxed ordering) */ + MCA_RCACHE_FLAGS_SO_MEM = 0x0004, + /** address range is cuda buffer */ + MCA_RCACHE_FLAGS_CUDA_GPU_MEM = 0x0008, + /** register with common cuda */ + MCA_RCACHE_FLAGS_CUDA_REGISTER_MEM = 0x0010, + /** invalid registration (not valid for passing to rcache register) */ + MCA_RCACHE_FLAGS_INVALID = 0x0080, + /** reserved for register function */ + MCA_RCACHE_FLAGS_RESV0 = 0x1000, + /** reserved for register function */ + MCA_RCACHE_FLAGS_RESV1 = 0x2000, + /** reserved for register function */ + MCA_RCACHE_FLAGS_RESV2 = 0x4000, + /** reserved for register function */ + MCA_RCACHE_FLAGS_RESV3 = 0x8000, +}; + +/** access flags */ +enum { + /** register for local write */ + MCA_RCACHE_ACCESS_LOCAL_WRITE = 0x01, + /** register for remote read */ + MCA_RCACHE_ACCESS_REMOTE_READ = 0x02, + /** register for remote write */ + MCA_RCACHE_ACCESS_REMOTE_WRITE = 0x04, + /** register for local/remote atomic operations */ + MCA_RCACHE_ACCESS_REMOTE_ATOMIC = 0x08, + /** register for any access */ + MCA_RCACHE_ACCESS_ANY = 0x0f, +}; + +/** base class for all rcache registrations */ +struct mca_rcache_base_registration_t { + /** allow registrations to be allocated from an opal_free_list_t */ + opal_free_list_item_t super; + /** rcache this registration belongs to */ + mca_rcache_base_module_t *rcache; + /** base of registered region */ + unsigned char *base; + /** bound of registered region */ + unsigned char *bound; + /** artifact of old mpool/rcache architecture.
used by cuda code */ + unsigned char *alloc_base; + /** number of outstanding references */ + int32_t ref_count; + /** registration flags */ + uint32_t flags; + /** internal rcache context */ + void *rcache_context; +#if OPAL_CUDA_GDR_SUPPORT + /** CUDA gpu buffer identifier */ + unsigned long long gpu_bufID; +#endif /* OPAL_CUDA_GDR_SUPPORT */ + /** registration access flags */ + int32_t access_flags; + unsigned char padding[64]; +}; + +typedef struct mca_rcache_base_registration_t mca_rcache_base_registration_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_rcache_base_registration_t); + +struct mca_rcache_base_resources_t { + char *cache_name; + void *reg_data; + size_t sizeof_reg; + int (*register_mem) (void *reg_data, void *base, size_t size, + mca_rcache_base_registration_t *reg); + int (*deregister_mem) (void *reg_data, mca_rcache_base_registration_t *reg); +}; +typedef struct mca_rcache_base_resources_t mca_rcache_base_resources_t; + + /** * component initialize */ -typedef struct mca_rcache_base_module_t* (*mca_rcache_base_component_init_fn_t)(void); +typedef struct mca_rcache_base_module_t *(*mca_rcache_base_component_init_fn_t)(mca_rcache_base_resources_t *); +/** + * register memory + */ +typedef int (*mca_rcache_base_module_register_fn_t) (mca_rcache_base_module_t *rcache, + void *addr, size_t size, uint32_t flags, + int32_t access_flags, + mca_rcache_base_registration_t **reg); -typedef int (*mca_rcache_base_module_find_fn_t) ( - struct mca_rcache_base_module_t* rcache, void* addr, size_t size, - mca_mpool_base_registration_t **reg); +/** + * deregister memory + */ +typedef int (*mca_rcache_base_module_deregister_fn_t) (mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg); -typedef int (*mca_rcache_base_module_find_all_fn_t)( - struct mca_rcache_base_module_t* rcache, void* addr, size_t size, - mca_mpool_base_registration_t **regs, int reg_cnt); +/** + * find registration in this memory pool + */ -typedef int 
(*mca_rcache_base_module_insert_fn_t)( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, size_t limit); +typedef int (*mca_rcache_base_module_find_fn_t) (mca_rcache_base_module_t *rcache, void *addr, + size_t size, mca_rcache_base_registration_t **reg); -typedef int (*mca_rcache_base_module_delete_fn_t)( - struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration); +/** + * release memory region + */ +typedef int (*mca_rcache_base_module_invalidate_range_fn_t) (mca_rcache_base_module_t *rcache, + void *addr, size_t size); -/* Do not call the clean function with the rcache lock held */ -typedef int (*mca_rcache_base_module_clean_fn_t)( - struct mca_rcache_base_module_t* rcache); - -typedef void (*mca_rcache_base_module_dump_range_fn_t)( - struct mca_rcache_base_module_t* rcache, unsigned char* addr, size_t size, char *msg); +/** + * evict one stale registration + * + * @returns true if successful + * @returns false if no registration could be evicted + */ +typedef bool (*mca_rcache_base_module_evict_fn_t) (mca_rcache_base_module_t *rcache); /** * finalize */ -typedef void (*mca_rcache_base_module_finalize_fn_t)( - struct mca_rcache_base_module_t*); +typedef void (*mca_rcache_base_module_finalize_fn_t)(mca_rcache_base_module_t *rcache); /** * rcache component descriptor. 
Contains component version information and @@ -86,22 +179,21 @@ typedef struct mca_rcache_base_component_2_0_0_t mca_rcache_base_component_t; */ struct mca_rcache_base_module_t { mca_rcache_base_component_t *rcache_component; /**< component struct */ + + mca_rcache_base_module_register_fn_t rcache_register; + mca_rcache_base_module_deregister_fn_t rcache_deregister; mca_rcache_base_module_find_fn_t rcache_find; - mca_rcache_base_module_find_all_fn_t rcache_find_all; - mca_rcache_base_module_insert_fn_t rcache_insert; - mca_rcache_base_module_delete_fn_t rcache_delete; - mca_rcache_base_module_clean_fn_t rcache_clean; + mca_rcache_base_module_invalidate_range_fn_t rcache_invalidate_range; mca_rcache_base_module_finalize_fn_t rcache_finalize; - mca_rcache_base_module_dump_range_fn_t rcache_dump_range; + mca_rcache_base_module_evict_fn_t rcache_evict; opal_mutex_t lock; }; -typedef struct mca_rcache_base_module_t mca_rcache_base_module_t; /** * Macro for use in components that are of type rcache */ -#define MCA_RCACHE_BASE_VERSION_2_0_0 \ - OPAL_MCA_BASE_VERSION_2_1_0("rcache", 2, 0, 0) +#define MCA_RCACHE_BASE_VERSION_3_0_0 \ + OPAL_MCA_BASE_VERSION_2_1_0("rcache", 3, 0, 0) #endif /* MCA_RCACHE_H */ diff --git a/opal/mca/mpool/rgpusm/Makefile.am b/opal/mca/rcache/rgpusm/Makefile.am similarity index 69% rename from opal/mca/mpool/rgpusm/Makefile.am rename to opal/mca/rcache/rgpusm/Makefile.am index eecc5e941e..24881e56d4 100644 --- a/opal/mca/mpool/rgpusm/Makefile.am +++ b/opal/mca/rcache/rgpusm/Makefile.am @@ -18,40 +18,40 @@ # $HEADER$ # -AM_CPPFLAGS = $(mpool_rgpusm_CPPFLAGS) +AM_CPPFLAGS = $(rcache_rgpusm_CPPFLAGS) sources = \ - mpool_rgpusm_module.c \ - mpool_rgpusm_component.c + rcache_rgpusm_module.c \ + rcache_rgpusm_component.c if WANT_INSTALL_HEADERS opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = mpool_rgpusm.h +opal_HEADERS = rcache_rgpusm.h endif # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or 
libmca__.la # (for static builds). -if MCA_BUILD_opal_mpool_rgpusm_DSO +if MCA_BUILD_opal_rcache_rgpusm_DSO component_noinst = -component_install = mca_mpool_rgpusm.la +component_install = mca_rcache_rgpusm.la else -component_noinst = libmca_mpool_rgpusm.la +component_noinst = libmca_rcache_rgpusm.la component_install = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_mpool_rgpusm_la_SOURCES = $(sources) -mca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version -mca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS) +mca_rcache_rgpusm_la_SOURCES = $(sources) +mca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version +mca_rcache_rgpusm_la_LIBADD = $(rcache_rgpusm_LIBS) if OPAL_cuda_support -mca_mpool_rgpusm_la_LIBADD += \ +mca_rcache_rgpusm_la_LIBADD += \ $(OPAL_TOP_BUILDDIR)/opal/mca/common/cuda/lib@OPAL_LIB_PREFIX@mca_common_cuda.la endif noinst_LTLIBRARIES = $(component_noinst) -libmca_mpool_rgpusm_la_SOURCES = $(sources) -libmca_mpool_rgpusm_la_LDFLAGS = -module -avoid-version -libmca_mpool_rgpusm_la_LIBADD = $(mpool_rgpusm_LIBS) +libmca_rcache_rgpusm_la_SOURCES = $(sources) +libmca_rcache_rgpusm_la_LDFLAGS = -module -avoid-version +libmca_rcache_rgpusm_la_LIBADD = $(rcache_rgpusm_LIBS) diff --git a/opal/mca/mpool/rgpusm/configure.m4 b/opal/mca/rcache/rgpusm/configure.m4 similarity index 71% rename from opal/mca/mpool/rgpusm/configure.m4 rename to opal/mca/rcache/rgpusm/configure.m4 index 8518cfc3b7..a9bce3c39d 100644 --- a/opal/mca/mpool/rgpusm/configure.m4 +++ b/opal/mca/rcache/rgpusm/configure.m4 @@ -1,6 +1,8 @@ # -*- shell-script -*- # # Copyright (c) 2012-2015 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2015 Los Alamos National Security, LLC. All rights +# reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -14,8 +16,8 @@ # the configure sequence by the opal_configure_options.m4 code. 
# -AC_DEFUN([MCA_opal_mpool_rgpusm_CONFIG],[ - AC_CONFIG_FILES([opal/mca/mpool/rgpusm/Makefile]) +AC_DEFUN([MCA_opal_rcache_rgpusm_CONFIG],[ + AC_CONFIG_FILES([opal/mca/rcache/rgpusm/Makefile]) # Use CUDA_SUPPORT which was filled in by the opal configure code. AS_IF([test "x$CUDA_SUPPORT" = "x1"], diff --git a/opal/mca/mpool/rgpusm/owner.txt b/opal/mca/rcache/rgpusm/owner.txt similarity index 100% rename from opal/mca/mpool/rgpusm/owner.txt rename to opal/mca/rcache/rgpusm/owner.txt diff --git a/opal/mca/mpool/rgpusm/mpool_rgpusm.h b/opal/mca/rcache/rgpusm/rcache_rgpusm.h similarity index 50% rename from opal/mca/mpool/rgpusm/mpool_rgpusm.h rename to opal/mca/rcache/rgpusm/rcache_rgpusm.h index 3f43347fb2..268d3d1548 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm.h +++ b/opal/mca/rcache/rgpusm/rcache_rgpusm.h @@ -24,18 +24,18 @@ /** * @file */ -#ifndef MCA_MPOOL_RGPUSM_H -#define MCA_MPOOL_RGPUSM_H +#ifndef MCA_RCACHE_RGPUSM_H +#define MCA_RCACHE_RGPUSM_H #include "opal_config.h" #include "opal/class/opal_list.h" #include "opal/class/opal_free_list.h" -#include "opal/mca/mpool/mpool.h" +#include "opal/mca/rcache/rcache.h" BEGIN_C_DECLS -struct mca_mpool_rgpusm_component_t { - mca_mpool_base_component_t super; +struct mca_rcache_rgpusm_component_t { + mca_rcache_base_component_t super; char* rcache_name; unsigned long long rcache_size_limit; bool print_stats; @@ -43,22 +43,12 @@ struct mca_mpool_rgpusm_component_t { int output; bool empty_cache; }; -typedef struct mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component_t; +typedef struct mca_rcache_rgpusm_component_t mca_rcache_rgpusm_component_t; -OPAL_DECLSPEC extern mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component; +OPAL_DECLSPEC extern mca_rcache_rgpusm_component_t mca_rcache_rgpusm_component; -struct mca_mpool_base_resources_t { - void *reg_data; - size_t sizeof_reg; - int (*register_mem)(void *base, size_t size, mca_mpool_base_registration_t *newreg, - mca_mpool_base_registration_t *hdrreg); - int 
(*deregister_mem)(void *reg_data, mca_mpool_base_registration_t *reg); -}; -typedef struct mca_mpool_base_resources_t mca_mpool_base_resources_t; - -struct mca_mpool_rgpusm_module_t { - mca_mpool_base_module_t super; - struct mca_mpool_base_resources_t resources; +struct mca_rcache_rgpusm_module_t { + mca_rcache_base_module_t super; opal_free_list_t reg_list; opal_list_t lru_list; uint32_t stat_cache_hit; @@ -68,54 +58,48 @@ struct mca_mpool_rgpusm_module_t { uint32_t stat_evicted; uint32_t stat_cache_found; uint32_t stat_cache_notfound; -}; typedef struct mca_mpool_rgpusm_module_t mca_mpool_rgpusm_module_t; + mca_rcache_base_vma_module_t *vma_module; +}; typedef struct mca_rcache_rgpusm_module_t mca_rcache_rgpusm_module_t; /* - * Initializes the mpool module. + * Initializes the rcache module. */ -void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t *mpool); +void mca_rcache_rgpusm_module_init(mca_rcache_rgpusm_module_t *rcache); /** * register block of memory */ -int mca_mpool_rgpusm_register(mca_mpool_base_module_t* mpool, void *addr, - size_t size, uint32_t flags, int32_t access_flags, mca_mpool_base_registration_t **reg); +int mca_rcache_rgpusm_register(mca_rcache_base_module_t* rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, mca_rcache_base_registration_t **reg); /** * deregister memory */ -int mca_mpool_rgpusm_deregister(mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg); +int mca_rcache_rgpusm_deregister(mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg); /** * free memory allocated by alloc function */ -void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void * addr, - mca_mpool_base_registration_t *reg); +void mca_rcache_rgpusm_free(mca_rcache_base_module_t *rcache, void * addr, + mca_rcache_base_registration_t *reg); /** * find registration for a given block of memory */ -int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t* mpool, void* addr, - size_t size, 
mca_mpool_base_registration_t **reg); +int mca_rcache_rgpusm_find(struct mca_rcache_base_module_t* rcache, void* addr, + size_t size, mca_rcache_base_registration_t **reg); /** * unregister all registration covering the block of memory */ -int mca_mpool_rgpusm_release_memory(mca_mpool_base_module_t* mpool, void *base, +int mca_rcache_rgpusm_release_memory(mca_rcache_base_module_t* rcache, void *base, size_t size); /** - * finalize mpool + * finalize rcache */ -void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool); - -/** - * Fault Tolerance Event Notification Function - * @param state Checkpoint Stae - * @return OPAL_SUCCESS or failure status - */ -int mca_mpool_rgpusm_ft_event(int state); +void mca_rcache_rgpusm_finalize(struct mca_rcache_base_module_t *rcache); END_C_DECLS #endif diff --git a/opal/mca/mpool/rgpusm/mpool_rgpusm_component.c b/opal/mca/rcache/rgpusm/rcache_rgpusm_component.c similarity index 65% rename from opal/mca/mpool/rgpusm/mpool_rgpusm_component.c rename to opal/mca/rcache/rgpusm/rcache_rgpusm_component.c index 6b2d2016d7..8794d4f047 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm_component.c +++ b/opal/mca/rcache/rgpusm/rcache_rgpusm_component.c @@ -26,7 +26,7 @@ #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 #include "opal_config.h" #include "opal/mca/base/base.h" -#include "mpool_rgpusm.h" +#include "rcache_rgpusm.h" #ifdef HAVE_UNISTD_H #include #endif @@ -40,17 +40,17 @@ static int rgpusm_open(void); static int rgpusm_close(void); static int rgpusm_register(void); -static mca_mpool_base_module_t* rgpusm_init(struct mca_mpool_base_resources_t* resources); +static mca_rcache_base_module_t* rgpusm_init(struct mca_rcache_base_resources_t* resources); -static int opal_mpool_rgpusm_verbose = 0; +static int opal_rcache_rgpusm_verbose = 0; -mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component = { +mca_rcache_rgpusm_component_t mca_rcache_rgpusm_component = { { /* First, the mca_base_component_t struct containing meta information about 
the component itself */ - .mpool_version = { - MCA_MPOOL_BASE_VERSION_2_0_0, + .rcache_version = { + MCA_RCACHE_BASE_VERSION_3_0_0, .mca_component_name = "rgpusm", MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, @@ -59,12 +59,12 @@ mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component = { .mca_close_component = rgpusm_close, .mca_register_component_params = rgpusm_register, }, - .mpool_data = { + .rcache_data = { /* The component is checkpoint ready */ MCA_BASE_METADATA_PARAM_CHECKPOINT }, - .mpool_init = rgpusm_init + .rcache_init = rgpusm_init } }; @@ -73,8 +73,8 @@ mca_mpool_rgpusm_component_t mca_mpool_rgpusm_component = { */ static int rgpusm_open(void) { - mca_mpool_rgpusm_component.output = opal_output_open(NULL); - opal_output_set_verbosity(mca_mpool_rgpusm_component.output, opal_mpool_rgpusm_verbose); + mca_rcache_rgpusm_component.output = opal_output_open(NULL); + opal_output_set_verbosity(mca_rcache_rgpusm_component.output, opal_rcache_rgpusm_verbose); return OPAL_SUCCESS; } @@ -82,59 +82,59 @@ static int rgpusm_open(void) static int rgpusm_register(void) { - mca_mpool_rgpusm_component.rcache_name = "vma"; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, + mca_rcache_rgpusm_component.rcache_name = "vma"; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, "rcache_name", - "The name of the registration cache the mpool should use", + "The name of the registration cache the rcache should use", MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_rgpusm_component.rcache_name); - mca_mpool_rgpusm_component.rcache_size_limit = 0; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, + &mca_rcache_rgpusm_component.rcache_name); + mca_rcache_rgpusm_component.rcache_size_limit = 0; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, 
"rcache_size_limit", "the maximum size of registration cache in bytes. " "0 is unlimited (default 0)", MCA_BASE_VAR_TYPE_UNSIGNED_LONG_LONG, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_rgpusm_component.rcache_size_limit); + &mca_rcache_rgpusm_component.rcache_size_limit); - mca_mpool_rgpusm_component.leave_pinned = 1; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, + mca_rcache_rgpusm_component.leave_pinned = 1; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, "leave_pinned", "Whether to keep memory handles around or release them when done. ", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_rgpusm_component.leave_pinned); + &mca_rcache_rgpusm_component.leave_pinned); - mca_mpool_rgpusm_component.print_stats = false; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, + mca_rcache_rgpusm_component.print_stats = false; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, "print_stats", "print pool usage statistics at the end of the run", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_rgpusm_component.print_stats); + &mca_rcache_rgpusm_component.print_stats); /* Set different levels of verbosity in the rgpusm related code. 
*/ - opal_mpool_rgpusm_verbose = 0; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, - "verbose", "Set level of mpool rgpusm verbosity", + opal_rcache_rgpusm_verbose = 0; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, + "verbose", "Set level of rcache rgpusm verbosity", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, - &opal_mpool_rgpusm_verbose); + &opal_rcache_rgpusm_verbose); /* Force emptying of entire registration cache when it gets full */ - mca_mpool_rgpusm_component.empty_cache = false; - (void) mca_base_component_var_register(&mca_mpool_rgpusm_component.super.mpool_version, + mca_rcache_rgpusm_component.empty_cache = false; + (void) mca_base_component_var_register(&mca_rcache_rgpusm_component.super.rcache_version, "empty_cache", "When set, empty entire registration cache when it is full", MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY, - &mca_mpool_rgpusm_component.empty_cache); + &mca_rcache_rgpusm_component.empty_cache); return OPAL_SUCCESS; } @@ -146,17 +146,20 @@ static int rgpusm_close(void) } -static mca_mpool_base_module_t* rgpusm_init( - struct mca_mpool_base_resources_t *resources) +static mca_rcache_base_module_t* rgpusm_init( + struct mca_rcache_base_resources_t *resources) { - mca_mpool_rgpusm_module_t* mpool_module; + mca_rcache_rgpusm_module_t* rcache_module; - mpool_module = - (mca_mpool_rgpusm_module_t*)malloc(sizeof(mca_mpool_rgpusm_module_t)); + /* ignore passed in resource structure */ + (void) resources; - mpool_module->resources = *resources; + rcache_module = (mca_rcache_rgpusm_module_t *) calloc (1, sizeof (*rcache_module)); + if (NULL == rcache_module) { + return NULL; + } - mca_mpool_rgpusm_module_init(mpool_module); + mca_rcache_rgpusm_module_init(rcache_module); - return &mpool_module->super; + return &rcache_module->super; } diff --git 
a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c similarity index 56% rename from opal/mca/mpool/rgpusm/mpool_rgpusm_module.c rename to opal/mca/rcache/rgpusm/rcache_rgpusm_module.c index fe0854baee..38645cb276 100644 --- a/opal/mca/mpool/rgpusm/mpool_rgpusm_module.c +++ b/opal/mca/rcache/rgpusm/rcache_rgpusm_module.c @@ -36,10 +36,10 @@ * cache can also be used just to track how many handles are in use at * a time. It is best to look at this with the three different * scenarios that are possible. - * 1. mpool_rgpusm_leave_pinned=0, cache_size=unlimited - * 2. mpool_rgpusm_leave_pinned=0, cache_size=limited - * 3. mpool_rgpusm_leave_pinned=1, cache_size=unlimited (default) - * 4. mpool_rgpusm_leave_pinned=1, cache_size=limited. + * 1. rcache_rgpusm_leave_pinned=0, cache_size=unlimited + * 2. rcache_rgpusm_leave_pinned=0, cache_size=limited + * 3. rcache_rgpusm_leave_pinned=1, cache_size=unlimited (default) + * 4. rcache_rgpusm_leave_pinned=1, cache_size=limited. * * Case 1: The cache is unused and remote memory is registered and * unregistered for each transaction. 
The amount of outstanding @@ -80,7 +80,7 @@ #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 #include "opal_config.h" #include "opal/align.h" -#include "opal/mca/mpool/rgpusm/mpool_rgpusm.h" +#include "opal/mca/rcache/rgpusm/rcache_rgpusm.h" #include #include #ifdef HAVE_MALLOC_H @@ -89,89 +89,75 @@ #include "opal/util/proc.h" #include "opal/mca/rcache/rcache.h" #include "opal/mca/rcache/base/base.h" -#include "opal/mca/mpool/base/base.h" +#include "opal/mca/rcache/base/base.h" #include "opal/mca/common/cuda/common_cuda.h" -static int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *, - mca_mpool_base_registration_t *); -static inline bool mca_mpool_rgpusm_deregister_lru (mca_mpool_base_module_t *mpool) { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *) mpool; - mca_mpool_base_registration_t *old_reg; +static int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *, + mca_rcache_base_registration_t *); +static inline bool mca_rcache_rgpusm_deregister_lru (mca_rcache_base_module_t *rcache) { + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t *) rcache; + mca_rcache_base_registration_t *old_reg; int rc; /* Remove the registration from the cache and list before deregistering the memory */ - old_reg = (mca_mpool_base_registration_t*) - opal_list_remove_first (&mpool_rgpusm->lru_list); + old_reg = (mca_rcache_base_registration_t*) + opal_list_remove_first (&rcache_rgpusm->lru_list); if (NULL == old_reg) { - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: The LRU list is empty. 
There is nothing to deregister"); return false; } - mpool->rcache->rcache_delete(mpool->rcache, old_reg); + mca_rcache_base_vma_delete (rcache_rgpusm->vma_module, old_reg); /* Drop the rcache lock while we deregister the memory */ - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); assert(old_reg->ref_count == 0); - rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data, - old_reg); - OPAL_THREAD_LOCK(&mpool->rcache->lock); + rc = cuda_closememhandle (NULL, old_reg); + OPAL_THREAD_LOCK(&rcache->lock); /* This introduces a potential leak of registrations if the deregistration fails to occur as we no longer have a reference to it. Is this possible? */ if (OPAL_SUCCESS != rc) { - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: Failed to deregister the memory addr=%p, size=%d", old_reg->base, (int)(old_reg->bound - old_reg->base + 1)); return false; } - opal_free_list_return (&mpool_rgpusm->reg_list, + opal_free_list_return (&rcache_rgpusm->reg_list, (opal_free_list_item_t*)old_reg); - mpool_rgpusm->stat_evicted++; + rcache_rgpusm->stat_evicted++; return true; } /* - * Initializes the mpool module. + * Initializes the rcache module. */ -void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t* mpool) +void mca_rcache_rgpusm_module_init(mca_rcache_rgpusm_module_t* rcache) { - mpool->super.mpool_component = &mca_mpool_rgpusm_component.super; - mpool->super.mpool_base = NULL; /* no base .. 
*/ - mpool->super.mpool_alloc = NULL; - mpool->super.mpool_realloc = NULL; - mpool->super.mpool_free = mca_mpool_rgpusm_free; - mpool->super.mpool_register = mca_mpool_rgpusm_register; - mpool->super.mpool_find = mca_mpool_rgpusm_find; - mpool->super.mpool_deregister = mca_mpool_rgpusm_deregister; - mpool->super.mpool_release_memory = NULL; - mpool->super.mpool_finalize = mca_mpool_rgpusm_finalize; - mpool->super.mpool_ft_event = mca_mpool_rgpusm_ft_event; - mpool->super.rcache = - mca_rcache_base_module_create(mca_mpool_rgpusm_component.rcache_name); - mpool->super.flags = 0; + rcache->super.rcache_component = &mca_rcache_rgpusm_component.super; + rcache->super.rcache_register = mca_rcache_rgpusm_register; + rcache->super.rcache_find = mca_rcache_rgpusm_find; + rcache->super.rcache_deregister = mca_rcache_rgpusm_deregister; + rcache->super.rcache_finalize = mca_rcache_rgpusm_finalize; + rcache->vma_module = mca_rcache_base_vma_module_alloc (); - mpool->resources.reg_data = NULL; - mpool->resources.sizeof_reg = sizeof(struct mca_mpool_common_cuda_reg_t); - mpool->resources.register_mem = cuda_openmemhandle; - mpool->resources.deregister_mem = cuda_closememhandle; - - OBJ_CONSTRUCT(&mpool->reg_list, opal_free_list_t); - opal_free_list_init (&mpool->reg_list, mpool->resources.sizeof_reg, + OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); + opal_free_list_init (&rcache->reg_list, sizeof(struct mca_rcache_common_cuda_reg_t), opal_cache_line_size, - OBJ_CLASS(mca_mpool_base_registration_t), + OBJ_CLASS(mca_rcache_base_registration_t), 0,opal_cache_line_size, 0, -1, 32, NULL, 0, NULL, NULL, NULL); - OBJ_CONSTRUCT(&mpool->lru_list, opal_list_t); - mpool->stat_cache_hit = mpool->stat_cache_miss = mpool->stat_evicted = 0; - mpool->stat_cache_found = mpool->stat_cache_notfound = 0; - mpool->stat_cache_valid = mpool->stat_cache_invalid = 0; + OBJ_CONSTRUCT(&rcache->lru_list, opal_list_t); + rcache->stat_cache_hit = rcache->stat_cache_miss = rcache->stat_evicted = 0; + 
rcache->stat_cache_found = rcache->stat_cache_notfound = 0; + rcache->stat_cache_valid = rcache->stat_cache_invalid = 0; } @@ -180,27 +166,27 @@ void mca_mpool_rgpusm_module_init(mca_mpool_rgpusm_module_t* mpool) * from the remote memory. It uses the addr and size of the remote * memory for caching the registration. */ -int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, +int mca_rcache_rgpusm_register (mca_rcache_base_module_t *rcache, void *addr, size_t size, uint32_t flags, int32_t access_flags, - mca_mpool_base_registration_t **reg) + mca_rcache_base_registration_t **reg) { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; - mca_mpool_common_cuda_reg_t *rgpusm_reg; - mca_mpool_common_cuda_reg_t *rget_reg; + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t*)rcache; + mca_rcache_common_cuda_reg_t *rgpusm_reg; + mca_rcache_common_cuda_reg_t *rget_reg; opal_free_list_item_t *item; int rc; int mypeer; /* just for debugging */ - /* In order to preserve the signature of the mca_mpool_rgpusm_register + /* In order to preserve the signature of the mca_rcache_rgpusm_register * function, we are using the **reg variable to not only get back the * registration information, but to hand in the memory handle received * from the remote side. */ - rget_reg = (mca_mpool_common_cuda_reg_t *)*reg; + rget_reg = (mca_rcache_common_cuda_reg_t *)*reg; mypeer = flags; flags = 0; - /* No need to support MCA_MPOOL_FLAGS_CACHE_BYPASS in here. It is not used. */ - assert(0 == (flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)); + /* No need to support MCA_RCACHE_FLAGS_CACHE_BYPASS in here. It is not used. */ + assert(0 == (flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)); /* This chunk of code handles the case where leave pinned is not * set and we do not use the cache. 
This is not typically how we @@ -208,13 +194,13 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, * number of registrations occuring at the same time. Since we * are not leaving the registrations pinned, the number of * registrations is unlimited and there is no need for a cache. */ - if(!mca_mpool_rgpusm_component.leave_pinned && 0 == mca_mpool_rgpusm_component.rcache_size_limit) { - item = opal_free_list_get (&mpool_rgpusm->reg_list); + if(!mca_rcache_rgpusm_component.leave_pinned && 0 == mca_rcache_rgpusm_component.rcache_size_limit) { + item = opal_free_list_get (&rcache_rgpusm->reg_list); if(NULL == item) { return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item; - rgpusm_reg->base.mpool = mpool; + rgpusm_reg = (mca_rcache_common_cuda_reg_t*)item; + rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; rgpusm_reg->base.bound = (unsigned char *)addr + size - 1;; rgpusm_reg->base.flags = flags; @@ -225,25 +211,24 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, /* The rget_reg registration is holding the memory handle needed * to register the remote memory. This was received from the remote * process. A pointer to the memory is returned in the alloc_base field. */ - rc = mpool_rgpusm->resources.register_mem(addr, size, - (mca_mpool_base_registration_t *)rgpusm_reg, - (mca_mpool_base_registration_t *)rget_reg); + rc = cuda_openmemhandle (addr, size, (mca_rcache_base_registration_t *)rgpusm_reg, + (mca_rcache_base_registration_t *)rget_reg); /* This error should not happen with no cache in use. 
*/ assert(OPAL_ERR_WOULD_BLOCK != rc); if(rc != OPAL_SUCCESS) { - opal_free_list_return (&mpool_rgpusm->reg_list, item); + opal_free_list_return (&rcache_rgpusm->reg_list, item); return rc; } rgpusm_reg->base.ref_count++; - *reg = (mca_mpool_base_registration_t *)rgpusm_reg; + *reg = (mca_rcache_base_registration_t *)rgpusm_reg; return OPAL_SUCCESS; } /* Check to see if memory is registered and stored in the cache. */ - OPAL_THREAD_LOCK(&mpool->rcache->lock); - mpool->rcache->rcache_find(mpool->rcache, addr, size, reg); + OPAL_THREAD_LOCK(&rcache->lock); + mca_rcache_base_vma_find (rcache_rgpusm->vma_module, addr, size, reg); /* If *reg is not NULL, we have a registration. Let us see if the * memory handle matches the one we were looking for. If not, the @@ -253,18 +238,18 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, * previous registration. The memory handle check will catch that * scenario as the handles have unique serial numbers. */ if (*reg != NULL) { - mpool_rgpusm->stat_cache_hit++; - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + rcache_rgpusm->stat_cache_hit++; + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: Found addr=%p,size=%d (base=%p,size=%d) in cache", addr, (int)size, (*reg)->base, (int)((*reg)->bound - (*reg)->base)); - if (mca_common_cuda_memhandle_matches((mca_mpool_common_cuda_reg_t *)*reg, rget_reg)) { + if (mca_common_cuda_memhandle_matches((mca_rcache_common_cuda_reg_t *)*reg, rget_reg)) { /* Registration matches what was requested. All is good. */ - mpool_rgpusm->stat_cache_valid++; + rcache_rgpusm->stat_cache_valid++; } else { /* This is an old registration. Need to boot it. 
*/ - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: Mismatched Handle: Evicting/unregistering " "addr=%p,size=%d (base=%p,size=%d) from cache", addr, (int)size, (*reg)->base, @@ -273,42 +258,42 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, /* The ref_count has to be zero as this memory cannot possibly * be in use. Assert on that just to make sure. */ assert(0 == (*reg)->ref_count); - if (mca_mpool_rgpusm_component.leave_pinned) { - opal_list_remove_item(&mpool_rgpusm->lru_list, + if (mca_rcache_rgpusm_component.leave_pinned) { + opal_list_remove_item(&rcache_rgpusm->lru_list, (opal_list_item_t*)(*reg)); } /* Bump the reference count to keep things copacetic in deregister */ (*reg)->ref_count++; /* Invalidate the registration so it will get booted out. */ - (*reg)->flags |= MCA_MPOOL_FLAGS_INVALID; - mca_mpool_rgpusm_deregister_no_lock(mpool, *reg); + (*reg)->flags |= MCA_RCACHE_FLAGS_INVALID; + mca_rcache_rgpusm_deregister_no_lock(rcache, *reg); *reg = NULL; - mpool_rgpusm->stat_cache_invalid++; + rcache_rgpusm->stat_cache_invalid++; } } else { /* Nothing was found in the cache. */ - mpool_rgpusm->stat_cache_miss++; + rcache_rgpusm->stat_cache_miss++; } /* If we have a registration here, then we know it is valid. */ if (*reg != NULL) { - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: CACHE HIT is good: ep=%d, addr=%p, size=%d in cache", mypeer, addr, (int)size); /* When using leave pinned, we keep an LRU list. 
*/ - if ((0 == (*reg)->ref_count) && mca_mpool_rgpusm_component.leave_pinned) { - opal_output_verbose(20, mca_mpool_rgpusm_component.output, + if ((0 == (*reg)->ref_count) && mca_rcache_rgpusm_component.leave_pinned) { + opal_output_verbose(20, mca_rcache_rgpusm_component.output, "RGPUSM: POP OFF LRU: ep=%d, addr=%p, size=%d in cache", mypeer, addr, (int)size); - opal_list_remove_item(&mpool_rgpusm->lru_list, + opal_list_remove_item(&rcache_rgpusm->lru_list, (opal_list_item_t*)(*reg)); } (*reg)->ref_count++; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); opal_output(-1, "reg->ref_count=%d", (int)(*reg)->ref_count); - opal_output_verbose(80, mca_mpool_rgpusm_component.output, + opal_output_verbose(80, mca_rcache_rgpusm_component.output, "RGPUSM: Found entry in cache addr=%p, size=%d", addr, (int)size); return OPAL_SUCCESS; } @@ -316,18 +301,18 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, /* If we are here, then we did not find a registration, or it was invalid, * so this is a new one, and we are going to use the cache. */ assert(NULL == *reg); - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: New registration ep=%d, addr=%p, size=%d. 
Need to register and insert in cache", mypeer, addr, (int)size); - item = opal_free_list_get (&mpool_rgpusm->reg_list); + item = opal_free_list_get (&rcache_rgpusm->reg_list); if(NULL == item) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); return OPAL_ERR_OUT_OF_RESOURCE; } - rgpusm_reg = (mca_mpool_common_cuda_reg_t*)item; + rgpusm_reg = (mca_rcache_common_cuda_reg_t*)item; - rgpusm_reg->base.mpool = mpool; + rgpusm_reg->base.rcache = rcache; rgpusm_reg->base.base = addr; rgpusm_reg->base.bound = (unsigned char *)addr + size - 1; rgpusm_reg->base.flags = flags; @@ -340,8 +325,8 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, * bound values may be changed by the registration. The memory * associated with the handle comes back in the alloc_base * value. */ - rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg, - (mca_mpool_base_registration_t *)rget_reg); + rc = cuda_openmemhandle (addr, size, (mca_rcache_base_registration_t *)rgpusm_reg, + (mca_rcache_base_registration_t *)rget_reg); /* There is a chance we can get the OPAL_ERR_WOULD_BLOCK from the * CUDA codes attempt to register the memory. The case that this * can happen is as follows. A block of memory is registered. @@ -354,11 +339,11 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, * boot that previous allocation out and deregister it first. */ if (OPAL_ERR_WOULD_BLOCK == rc) { - mca_mpool_base_registration_t *oldreg; + mca_rcache_base_registration_t *oldreg; /* Need to make sure it is at least 4 bytes in size This will * ensure we get the hit in the cache. */ - mpool->rcache->rcache_find(mpool->rcache, addr, 4, &oldreg); + mca_rcache_base_vma_find (rcache_rgpusm->vma_module, addr, 4, &oldreg); /* For most cases, we will find a registration that overlaps. 
* Removal of it should allow the registration we are @@ -367,71 +352,71 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, /* The ref_count has to be zero as this memory cannot * possibly be in use. Assert on that just to make sure. */ assert(0 == oldreg->ref_count); - if (mca_mpool_rgpusm_component.leave_pinned) { - opal_list_remove_item(&mpool_rgpusm->lru_list, + if (mca_rcache_rgpusm_component.leave_pinned) { + opal_list_remove_item(&rcache_rgpusm->lru_list, (opal_list_item_t*)oldreg); } /* Bump the reference count to keep things copacetic in deregister */ oldreg->ref_count++; /* Invalidate the registration so it will get booted out. */ - oldreg->flags |= MCA_MPOOL_FLAGS_INVALID; - mca_mpool_rgpusm_deregister_no_lock(mpool, oldreg); - mpool_rgpusm->stat_evicted++; + oldreg->flags |= MCA_RCACHE_FLAGS_INVALID; + mca_rcache_rgpusm_deregister_no_lock(rcache, oldreg); + rcache_rgpusm->stat_evicted++; /* And try again. This one usually works. */ - rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg, - (mca_mpool_base_registration_t *)rget_reg); + rc = cuda_openmemhandle (addr, size, (mca_rcache_base_registration_t *)rgpusm_reg, + (mca_rcache_base_registration_t *)rget_reg); } /* There is a chance that another registration is blocking our * ability to register. Check the rc to see if we still need * to try and clear out registrations. */ while (OPAL_SUCCESS != rc) { - if (true != mca_mpool_rgpusm_deregister_lru(mpool)) { + if (true != mca_rcache_rgpusm_deregister_lru(rcache)) { rc = OPAL_ERROR; break; } /* Clear out one registration. 
*/ - rc = mpool_rgpusm->resources.register_mem(addr, size, (mca_mpool_base_registration_t *)rgpusm_reg, - (mca_mpool_base_registration_t *)rget_reg); + rc = cuda_openmemhandle (addr, size, (mca_rcache_base_registration_t *)rgpusm_reg, + (mca_rcache_base_registration_t *)rget_reg); } } if(rc != OPAL_SUCCESS) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_rgpusm->reg_list, item); + OPAL_THREAD_UNLOCK(&rcache->lock); + opal_free_list_return (&rcache_rgpusm->reg_list, item); return rc; } - opal_output_verbose(80, mca_mpool_rgpusm_component.output, + opal_output_verbose(80, mca_rcache_rgpusm_component.output, "RGPUSM: About to insert in rgpusm cache addr=%p, size=%d", addr, (int)size); - rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg, - mca_mpool_rgpusm_component.rcache_size_limit); + rc = mca_rcache_base_vma_insert (rcache_rgpusm->vma_module, (mca_rcache_base_registration_t *)rgpusm_reg, + mca_rcache_rgpusm_component.rcache_size_limit); if (OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc) { - opal_output_verbose(40, mca_mpool_rgpusm_component.output, + opal_output_verbose(40, mca_rcache_rgpusm_component.output, "RGPUSM: No room in the cache - boot the first one out"); - (void)mca_mpool_rgpusm_deregister_lru(mpool); - if (mca_mpool_rgpusm_component.empty_cache) { + (void)mca_rcache_rgpusm_deregister_lru(rcache); + if (mca_rcache_rgpusm_component.empty_cache) { int remNum = 1; /* Empty out every registration from LRU until it is empty */ - opal_output_verbose(40, mca_mpool_rgpusm_component.output, + opal_output_verbose(40, mca_rcache_rgpusm_component.output, "RGPUSM: About to delete all the unused entries in the cache"); - while (mca_mpool_rgpusm_deregister_lru(mpool)) { + while (mca_rcache_rgpusm_deregister_lru(rcache)) { remNum++; } - opal_output_verbose(40, mca_mpool_rgpusm_component.output, + opal_output_verbose(40, mca_rcache_rgpusm_component.output, "RGPUSM: Deleted and deregistered %d entries", 
remNum); - rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg, - mca_mpool_rgpusm_component.rcache_size_limit); + rc = mca_rcache_base_vma_insert (rcache_rgpusm->vma_module, (mca_rcache_base_registration_t *)rgpusm_reg, + mca_rcache_rgpusm_component.rcache_size_limit); } else { /* Check for room after one removal. If not, remove another one until there is space */ - while((rc = mpool->rcache->rcache_insert(mpool->rcache, (mca_mpool_base_registration_t *)rgpusm_reg, - mca_mpool_rgpusm_component.rcache_size_limit)) == + while((rc = mca_rcache_base_vma_insert (rcache_rgpusm->vma_module, (mca_rcache_base_registration_t *)rgpusm_reg, + mca_rcache_rgpusm_component.rcache_size_limit)) == OPAL_ERR_TEMP_OUT_OF_RESOURCE) { - opal_output_verbose(40, mca_mpool_rgpusm_component.output, + opal_output_verbose(40, mca_rcache_rgpusm_component.output, "RGPUSM: No room in the cache - boot one out"); - if (!mca_mpool_rgpusm_deregister_lru(mpool)) { + if (!mca_rcache_rgpusm_deregister_lru(rcache)) { break; } } @@ -439,131 +424,111 @@ int mca_mpool_rgpusm_register (mca_mpool_base_module_t *mpool, void *addr, } if(rc != OPAL_SUCCESS) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - opal_free_list_return (&mpool_rgpusm->reg_list, item); + OPAL_THREAD_UNLOCK(&rcache->lock); + opal_free_list_return (&rcache_rgpusm->reg_list, item); /* We cannot recover from this. We can be here if the size of * the cache is smaller than the amount of memory we are * trying to register in a single transfer. In that case, rc * is MPI_ERR_OUT_OF_RESOURCES, but everything is stuck at * that point. Therefore, just error out completely. 
*/ - opal_output_verbose(10, mca_mpool_rgpusm_component.output, + opal_output_verbose(10, mca_rcache_rgpusm_component.output, "RGPUSM: Failed to register addr=%p, size=%d", addr, (int)size); return OPAL_ERROR; } rgpusm_reg->base.ref_count++; - *reg = (mca_mpool_base_registration_t *)rgpusm_reg; - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + *reg = (mca_rcache_base_registration_t *)rgpusm_reg; + OPAL_THREAD_UNLOCK(&rcache->lock); - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); return OPAL_SUCCESS; } - -/** - * free function - */ -void mca_mpool_rgpusm_free(mca_mpool_base_module_t *mpool, void *addr, - mca_mpool_base_registration_t *registration) +int mca_rcache_rgpusm_find(struct mca_rcache_base_module_t *rcache, void *addr, + size_t size, mca_rcache_base_registration_t **reg) { - void *alloc_base = registration->alloc_base; - mca_mpool_rgpusm_deregister(mpool, registration); - free(alloc_base); -} - -int mca_mpool_rgpusm_find(struct mca_mpool_base_module_t *mpool, void *addr, - size_t size, mca_mpool_base_registration_t **reg) -{ - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t*)rcache; int rc; unsigned char *base, *bound; base = addr; bound = base + size - 1; /* To keep cache hits working correctly */ - OPAL_THREAD_LOCK(&mpool->rcache->lock); + OPAL_THREAD_LOCK(&rcache->lock); opal_output(-1, "Looking for addr=%p, size=%d", addr, (int)size); - rc = mpool->rcache->rcache_find(mpool->rcache, addr, size, reg); - if(*reg != NULL && mca_mpool_rgpusm_component.leave_pinned) { - if(0 == (*reg)->ref_count && mca_mpool_rgpusm_component.leave_pinned) { - opal_list_remove_item(&mpool_rgpusm->lru_list, (opal_list_item_t*)(*reg)); + rc = mca_rcache_base_vma_find (rcache_rgpusm->vma_module, addr, size, reg); + if(*reg != NULL && mca_rcache_rgpusm_component.leave_pinned) { + if(0 == (*reg)->ref_count && 
mca_rcache_rgpusm_component.leave_pinned) { + opal_list_remove_item(&rcache_rgpusm->lru_list, (opal_list_item_t*)(*reg)); } - mpool_rgpusm->stat_cache_found++; + rcache_rgpusm->stat_cache_found++; (*reg)->ref_count++; } else { - mpool_rgpusm->stat_cache_notfound++; + rcache_rgpusm->stat_cache_notfound++; } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); return rc; } -static inline bool registration_is_cachebale(mca_mpool_base_registration_t *reg) +static inline bool registration_is_cachebale(mca_rcache_base_registration_t *reg) { return !(reg->flags & - (MCA_MPOOL_FLAGS_CACHE_BYPASS | - MCA_MPOOL_FLAGS_INVALID)); + (MCA_RCACHE_FLAGS_CACHE_BYPASS | + MCA_RCACHE_FLAGS_INVALID)); } -int mca_mpool_rgpusm_deregister(struct mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg) +int mca_rcache_rgpusm_deregister(struct mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg) { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t*)rcache; int rc = OPAL_SUCCESS; assert(reg->ref_count > 0); - OPAL_THREAD_LOCK(&mpool->rcache->lock); + OPAL_THREAD_LOCK(&rcache->lock); reg->ref_count--; opal_output(-1, "Deregister: reg->ref_count=%d", (int)reg->ref_count); if(reg->ref_count > 0) { - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); return OPAL_SUCCESS; } - if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg)) + if(mca_rcache_rgpusm_component.leave_pinned && registration_is_cachebale(reg)) { /* if leave_pinned is set don't deregister memory, but put it * on LRU list for future use */ - opal_output_verbose(20, mca_mpool_rgpusm_component.output, + opal_output_verbose(20, mca_rcache_rgpusm_component.output, "RGPUSM: Deregister: addr=%p, size=%d: cacheable and pinned, leave in cache, PUSH IN LRU", reg->base, (int)(reg->bound - reg->base + 1)); - 
opal_list_prepend(&mpool_rgpusm->lru_list, (opal_list_item_t*)reg); + opal_list_prepend(&rcache_rgpusm->lru_list, (opal_list_item_t*)reg); } else { /* Remove from rcache first */ - if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) - mpool->rcache->rcache_delete(mpool->rcache, reg); + if(!(reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) + mca_rcache_base_vma_delete (rcache_rgpusm->vma_module, reg); /* Drop the rcache lock before deregistring the memory */ - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool; - assert(reg->ref_count == 0); - rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data, - reg); + rc = cuda_closememhandle (NULL, reg); } - OPAL_THREAD_LOCK(&mpool->rcache->lock); + OPAL_THREAD_LOCK(&rcache->lock); if(OPAL_SUCCESS == rc) { - opal_free_list_return (&mpool_rgpusm->reg_list, + opal_free_list_return (&rcache_rgpusm->reg_list, (opal_free_list_item_t*)reg); } } - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); + OPAL_THREAD_UNLOCK(&rcache->lock); return rc; } -int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool, - mca_mpool_base_registration_t *reg) +int mca_rcache_rgpusm_deregister_no_lock(struct mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg) { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t*)rcache; int rc = OPAL_SUCCESS; assert(reg->ref_count > 0); @@ -572,26 +537,21 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool, if(reg->ref_count > 0) { return OPAL_SUCCESS; } - if(mca_mpool_rgpusm_component.leave_pinned && registration_is_cachebale(reg)) + if(mca_rcache_rgpusm_component.leave_pinned && registration_is_cachebale(reg)) { /* if 
leave_pinned is set don't deregister memory, but put it * on LRU list for future use */ - opal_list_prepend(&mpool_rgpusm->lru_list, (opal_list_item_t*)reg); + opal_list_prepend(&rcache_rgpusm->lru_list, (opal_list_item_t*)reg); } else { /* Remove from rcache first */ - if(!(reg->flags & MCA_MPOOL_FLAGS_CACHE_BYPASS)) - mpool->rcache->rcache_delete(mpool->rcache, reg); + if(!(reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) + mca_rcache_base_vma_delete (rcache_rgpusm->vma_module, reg); - { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t *)mpool; - - assert(reg->ref_count == 0); - rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data, - reg); - } + assert(reg->ref_count == 0); + rc = cuda_closememhandle (NULL, reg); if(OPAL_SUCCESS == rc) { - opal_free_list_return (&mpool_rgpusm->reg_list, + opal_free_list_return (&rcache_rgpusm->reg_list, (opal_free_list_item_t*)reg); } } @@ -599,30 +559,30 @@ int mca_mpool_rgpusm_deregister_no_lock(struct mca_mpool_base_module_t *mpool, return rc; } -#define RGPUSM_MPOOL_NREGS 100 +#define RGPUSM_RCACHE_NREGS 100 -void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool) +void mca_rcache_rgpusm_finalize(struct mca_rcache_base_module_t *rcache) { - mca_mpool_rgpusm_module_t *mpool_rgpusm = (mca_mpool_rgpusm_module_t*)mpool; - mca_mpool_base_registration_t *reg; - mca_mpool_base_registration_t *regs[RGPUSM_MPOOL_NREGS]; + mca_rcache_rgpusm_module_t *rcache_rgpusm = (mca_rcache_rgpusm_module_t*)rcache; + mca_rcache_base_registration_t *reg; + mca_rcache_base_registration_t *regs[RGPUSM_RCACHE_NREGS]; int reg_cnt, i; int rc; /* Statistic */ - if(true == mca_mpool_rgpusm_component.print_stats) { + if(true == mca_rcache_rgpusm_component.print_stats) { opal_output(0, "%s rgpusm: stats " "(hit/valid/invalid/miss/evicted): %d/%d/%d/%d/%d\n", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), - mpool_rgpusm->stat_cache_hit, mpool_rgpusm->stat_cache_valid, - mpool_rgpusm->stat_cache_invalid, 
mpool_rgpusm->stat_cache_miss, - mpool_rgpusm->stat_evicted); + rcache_rgpusm->stat_cache_hit, rcache_rgpusm->stat_cache_valid, + rcache_rgpusm->stat_cache_invalid, rcache_rgpusm->stat_cache_miss, + rcache_rgpusm->stat_evicted); } - OPAL_THREAD_LOCK(&mpool->rcache->lock); + OPAL_THREAD_LOCK(&rcache->lock); do { - reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, 0, (size_t)-1, - regs, RGPUSM_MPOOL_NREGS); + reg_cnt = mca_rcache_base_vma_find_all (rcache_rgpusm->vma_module, 0, (size_t)-1, + regs, RGPUSM_RCACHE_NREGS); opal_output(-1, "Registration size at finalize = %d", reg_cnt); for(i = 0; i < reg_cnt; i++) { @@ -630,20 +590,19 @@ void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool) if(reg->ref_count) { reg->ref_count = 0; /* otherway dereg will fail on assert */ - } else if (mca_mpool_rgpusm_component.leave_pinned) { - opal_list_remove_item(&mpool_rgpusm->lru_list, + } else if (mca_rcache_rgpusm_component.leave_pinned) { + opal_list_remove_item(&rcache_rgpusm->lru_list, (opal_list_item_t*)reg); } /* Remove from rcache first */ - mpool->rcache->rcache_delete(mpool->rcache, reg); + mca_rcache_base_vma_delete (rcache_rgpusm->vma_module, reg); /* Drop lock before deregistering memory */ - OPAL_THREAD_UNLOCK(&mpool->rcache->lock); + OPAL_THREAD_UNLOCK(&rcache->lock); assert(reg->ref_count == 0); - rc = mpool_rgpusm->resources.deregister_mem(mpool_rgpusm->resources.reg_data, - reg); - OPAL_THREAD_LOCK(&mpool->rcache->lock); + rc = cuda_closememhandle (NULL, reg); + OPAL_THREAD_LOCK(&rcache->lock); if(rc != OPAL_SUCCESS) { /* Potentially lose track of registrations @@ -651,20 +610,12 @@ void mca_mpool_rgpusm_finalize(struct mca_mpool_base_module_t *mpool) continue; } - opal_free_list_return (&mpool_rgpusm->reg_list, + opal_free_list_return (&rcache_rgpusm->reg_list, (opal_free_list_item_t *) reg); } - } while(reg_cnt == RGPUSM_MPOOL_NREGS); - - OBJ_DESTRUCT(&mpool_rgpusm->lru_list); - OBJ_DESTRUCT(&mpool_rgpusm->reg_list); - 
OPAL_THREAD_UNLOCK(&mpool->rcache->lock); - - /* Cleanup any vmas that we have deferred deletion on */ - mpool->rcache->rcache_clean(mpool->rcache); + } while(reg_cnt == RGPUSM_RCACHE_NREGS); -} - -int mca_mpool_rgpusm_ft_event(int state) { - return OPAL_SUCCESS; + OBJ_DESTRUCT(&rcache_rgpusm->lru_list); + OBJ_DESTRUCT(&rcache_rgpusm->reg_list); + OPAL_THREAD_UNLOCK(&rcache->lock); } diff --git a/opal/mca/mpool/udreg/Makefile.am b/opal/mca/rcache/udreg/Makefile.am similarity index 65% rename from opal/mca/mpool/udreg/Makefile.am rename to opal/mca/rcache/udreg/Makefile.am index dae4519606..ce9fd42ef0 100644 --- a/opal/mca/mpool/udreg/Makefile.am +++ b/opal/mca/rcache/udreg/Makefile.am @@ -10,7 +10,7 @@ # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. # Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2012-2013 Los Alamos National Security, LLC. All rights +# Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights # reserved. # $COPYRIGHT$ # @@ -19,34 +19,34 @@ # $HEADER$ # -AM_CPPFLAGS = $(mpool_udreg_CPPFLAGS) +AM_CPPFLAGS = $(rcache_udreg_CPPFLAGS) -sources = mpool_udreg_module.c mpool_udreg_component.c +sources = rcache_udreg_module.c rcache_udreg_component.c if WANT_INSTALL_HEADERS opaldir = $(opalincludedir)/$(subdir) -opal_HEADERS = mpool_udreg.h +opal_HEADERS = rcache_udreg.h endif # Make the output library in this directory, and name it either # mca__.la (for DSO builds) or libmca__.la # (for static builds). 
-if MCA_BUILD_opal_mpool_udreg_DSO +if MCA_BUILD_opal_rcache_udreg_DSO component_noinst = -component_install = mca_mpool_udreg.la +component_install = mca_rcache_udreg.la else -component_noinst = libmca_mpool_udreg.la +component_noinst = libmca_rcache_udreg.la component_install = endif mcacomponentdir = $(opallibdir) mcacomponent_LTLIBRARIES = $(component_install) -mca_mpool_udreg_la_SOURCES = $(sources) -mca_mpool_udreg_la_LDFLAGS = -module -avoid-version $(mpool_udreg_LDFLAGS) -mca_mpool_udreg_la_LIBADD = $(mpool_udreg_LIBS) +mca_rcache_udreg_la_SOURCES = $(sources) +mca_rcache_udreg_la_LDFLAGS = -module -avoid-version $(rcache_udreg_LDFLAGS) +mca_rcache_udreg_la_LIBADD = $(rcache_udreg_LIBS) noinst_LTLIBRARIES = $(component_noinst) -libmca_mpool_udreg_la_SOURCES = $(sources) -libmca_mpool_udreg_la_LIBADD = $(mpool_udreg_LIBS) -libmca_mpool_udreg_la_LDFLAGS = -module -avoid-version $(mpool_udreg_LDFLAGS) +libmca_rcache_udreg_la_SOURCES = $(sources) +libmca_rcache_udreg_la_LIBADD = $(rcache_udreg_LIBS) +libmca_rcache_udreg_la_LDFLAGS = -module -avoid-version $(rcache_udreg_LDFLAGS) diff --git a/opal/mca/mpool/udreg/configure.m4 b/opal/mca/rcache/udreg/configure.m4 similarity index 64% rename from opal/mca/mpool/udreg/configure.m4 rename to opal/mca/rcache/udreg/configure.m4 index 9a1db3ec5a..01d954527f 100644 --- a/opal/mca/mpool/udreg/configure.m4 +++ b/opal/mca/rcache/udreg/configure.m4 @@ -12,7 +12,7 @@ # All rights reserved. # Copyright (c) 2006 QLogic Corp. All rights reserved. # Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2011-2013 Los Alamos National Security, LLC. +# Copyright (c) 2011-2015 Los Alamos National Security, LLC. # All rights reserved. 
# $COPYRIGHT$ # @@ -21,28 +21,28 @@ # $HEADER$ # -AC_DEFUN([MCA_opal_mpool_udreg_CONFIG],[ - AC_CONFIG_FILES([opal/mca/mpool/udreg/Makefile]) +AC_DEFUN([MCA_opal_rcache_udreg_CONFIG],[ + AC_CONFIG_FILES([opal/mca/rcache/udreg/Makefile]) AC_ARG_WITH([udreg], [AC_HELP_STRING([--with-udreg], [Build support for Cray udreg support. Set PKG_CONFIG_PATH env. variable to specify alternate path.])]) - mpool_udreg_happy="no" + rcache_udreg_happy="no" AS_IF([test "$with_udreg" = "no"], - [mpool_udreg_happy="no"], + [rcache_udreg_happy="no"], [PKG_CHECK_MODULES([CRAY_UDREG], [cray-udreg], - [mpool_udreg_LDFLAGS="$CRAY_UDREG_LIBS" - mpool_udreg_CPPFLAGS="$CRAY_UDREG_CFLAGS" - mpool_udreg_happy="yes"], + [rcache_udreg_LDFLAGS="$CRAY_UDREG_LIBS" + rcache_udreg_CPPFLAGS="$CRAY_UDREG_CFLAGS" + rcache_udreg_happy="yes"], [AC_MSG_RESULT([no]) - mpool_udreg_happ="no"])]) + rcache_udreg_happy="no"])]) - AS_IF([test "$mpool_udreg_happy" = "yes"], [$1], [$2]) + AS_IF([test "$rcache_udreg_happy" = "yes"], [$1], [$2]) - # substitute in the things needed to build udreg/mpool - AC_SUBST([mpool_udreg_CPPFLAGS]) - AC_SUBST([mpool_udreg_LDFLAGS]) - AC_SUBST([mpool_udreg_LIBS]) + # substitute in the things needed to build udreg/rcache + AC_SUBST([rcache_udreg_CPPFLAGS]) + AC_SUBST([rcache_udreg_LDFLAGS]) + AC_SUBST([rcache_udreg_LIBS]) ])dnl diff --git a/opal/mca/mpool/sm/owner.txt b/opal/mca/rcache/udreg/owner.txt similarity index 100% rename from opal/mca/mpool/sm/owner.txt rename to opal/mca/rcache/udreg/owner.txt diff --git a/opal/mca/rcache/udreg/rcache_udreg.h b/opal/mca/rcache/udreg/rcache_udreg.h new file mode 100644 index 0000000000..de3b6313fe --- /dev/null +++ b/opal/mca/rcache/udreg/rcache_udreg.h @@ -0,0 +1,83 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
+ * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file + * Component, resource and module types of the udreg registration cache. + */ +#ifndef MCA_RCACHE_UDREG_H +#define MCA_RCACHE_UDREG_H + +#include "opal_config.h" +#include "opal/class/opal_list.h" +#include "opal/class/opal_free_list.h" +#include "opal/mca/event/event.h" +#include "opal/mca/rcache/rcache.h" +#include "opal/util/proc.h" +#if HAVE_SYS_MMAN_H +#include +#endif + +BEGIN_C_DECLS + +struct mca_rcache_udreg_component_t { + mca_rcache_base_component_t super; + bool print_stats; /* "print_stats" MCA variable: when true, finalize prints the udreg hit/miss/evicted counters */ + int leave_pinned; /* set in udreg_init from opal_leave_pinned / opal_leave_pinned_pipeline; enables UDREG_CC_MODE_USE_LAZY_DEREG */ +}; +typedef struct mca_rcache_udreg_component_t mca_rcache_udreg_component_t; + +OPAL_DECLSPEC extern mca_rcache_udreg_component_t mca_rcache_udreg_component; + +struct mca_rcache_udreg_resources_t { + mca_rcache_base_resources_t base; + + /* udreg specific resources */ + bool use_kernel_cache; /* adds UDREG_CC_MODE_USE_KERNEL_CACHE to the cache modes */ + bool use_evict_w_unreg; /* adds UDREG_CC_MODE_USE_EVICT_W_UNREG to the cache modes */ + int max_entries; /* handed to udreg as cache_attr.max_entries */ + size_t page_size; /* NOTE(review): not read by the udreg component/module code in this change (the register callback uses opal_getpagesize()) - confirm who consumes it */ +}; +typedef struct mca_rcache_udreg_resources_t mca_rcache_udreg_resources_t; + +struct mca_rcache_udreg_module_t; + +struct mca_rcache_udreg_module_t { + mca_rcache_base_module_t super; + mca_rcache_udreg_resources_t resources; /* copy of the caller's resources, made in udreg_init */ + opal_free_list_t reg_list; /* free list the registration objects are taken from and returned to */ + opal_mutex_t lock; /* held by register while the requested_* fields below are in use */ + void *udreg_handle; /* udreg cache handle, filled in by UDREG_CacheAccess */ + /** used to communicate the access flags to the underlying registration + * function */ + int requested_access_flags; + int requested_flags; +}; +typedef struct mca_rcache_udreg_module_t mca_rcache_udreg_module_t; + + +/* + * Initializes the rcache module. 
+ * + * Returns OPAL_SUCCESS, or OPAL_ERROR when the udreg cache cannot be accessed. + */ +int mca_rcache_udreg_module_init(mca_rcache_udreg_module_t *rcache); + +END_C_DECLS +#endif /* MCA_RCACHE_UDREG_H */ diff --git a/opal/mca/rcache/udreg/rcache_udreg_component.c b/opal/mca/rcache/udreg/rcache_udreg_component.c new file mode 100644 index 0000000000..8a4b44b938 --- /dev/null +++ b/opal/mca/rcache/udreg/rcache_udreg_component.c @@ -0,0 +1,130 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights + * reserved.
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 +#include "opal_config.h" +#include "opal/mca/base/base.h" +#include "opal/runtime/opal_params.h" +#include "rcache_udreg.h" +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_MALLOC_H +#include +#endif + +#include + +/* + * Local functions + */ +static int udreg_open(void); +static int udreg_close(void); +static int udreg_register(void); +static mca_rcache_base_module_t* udreg_init( + struct mca_rcache_base_resources_t* resources); + +mca_rcache_udreg_component_t mca_rcache_udreg_component = { + { + /* First, the mca_base_component_t struct containing meta + information about the component itself */ + + .rcache_version ={ + MCA_RCACHE_BASE_VERSION_3_0_0, + + .mca_component_name = "udreg", + MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, + OPAL_RELEASE_VERSION), + .mca_open_component = udreg_open, + .mca_close_component = udreg_close, + .mca_register_component_params = udreg_register, + }, + .rcache_data = { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + .rcache_init = udreg_init + } +}; + +/** + * component open/close/init function + */ +/* Nothing to set up when the component is opened. */ +static int udreg_open(void) +{ + return OPAL_SUCCESS; +} + + +/* Registers the "print_stats" MCA variable (default: false). */ +static int udreg_register(void) +{ + mca_rcache_udreg_component.print_stats = false; + (void) mca_base_component_var_register(&mca_rcache_udreg_component.super.rcache_version, + "print_stats", "print pool usage statistics at the end of the run", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_rcache_udreg_component.print_stats); + + return OPAL_SUCCESS; +} + + +/* Nothing to tear down when the component is closed. */ +static int udreg_close(void) +{ + return OPAL_SUCCESS; +} + +/** + * Create a udreg rcache module from the caller-supplied resources. + * + * @param resources treated as a mca_rcache_udreg_resources_t and copied into the new module + * @return the base of the new module, or NULL if allocation or module initialization fails + */ +static mca_rcache_base_module_t * +udreg_init(struct mca_rcache_base_resources_t *resources) +{ + mca_rcache_udreg_resources_t *udreg_resources = (mca_rcache_udreg_resources_t *) resources; + 
mca_rcache_udreg_module_t* rcache_module; + int rc; + + /* Set this here (vs in component.c) because + opal_leave_pinned* may have been set after MCA params were + read (e.g., by the openib btl) */ + mca_rcache_udreg_component.leave_pinned = (int) + (1 == opal_leave_pinned || opal_leave_pinned_pipeline); + + rcache_module = + (mca_rcache_udreg_module_t *) malloc (sizeof (mca_rcache_udreg_module_t)); + if (NULL == rcache_module) { + /* out of memory: fail the init instead of letting the memmove + * below write through a NULL pointer */ + return NULL; + } + + memmove (&rcache_module->resources, udreg_resources, sizeof (*udreg_resources)); + + rc = mca_rcache_udreg_module_init(rcache_module); + if (OPAL_SUCCESS != rc) { + free (rcache_module); + return NULL; + } + + return &rcache_module->super; +} diff --git a/opal/mca/rcache/udreg/rcache_udreg_module.c b/opal/mca/rcache/udreg/rcache_udreg_module.c new file mode 100644 index 0000000000..8c0651219c --- /dev/null +++ b/opal/mca/rcache/udreg/rcache_udreg_module.c @@ -0,0 +1,346 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2013 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006 Voltaire. All rights reserved. + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights + * reserved.
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#define OPAL_DISABLE_ENABLE_MEM_DEBUG 1 +#include "opal_config.h" +#include "opal/align.h" +#include "rcache_udreg.h" +#include +#include +#ifdef HAVE_MALLOC_H +#include +#endif +#include "opal/mca/rcache/base/base.h" +#include "opal/runtime/opal_params.h" +#include "opal/include/opal_stdint.h" +#include "opal/util/sys_limits.h" + +#include + +#include + +#include + + +static int mca_rcache_udreg_register (mca_rcache_base_module_t* rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, + mca_rcache_base_registration_t **reg); +static int mca_rcache_udreg_deregister (mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg); +static int mca_rcache_udreg_find (mca_rcache_base_module_t* rcache, void* addr, + size_t size, mca_rcache_base_registration_t **reg); +static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache); +static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache); + +static void *mca_rcache_udreg_reg_func (void *addr, uint64_t len, void *reg_context); +static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context); + + +/* + * Initializes the rcache module. 
+ * + * Installs the module function table, creates (or attaches to) the udreg + * cache named by resources.base.cache_name and sets up the free list that + * registrations are drawn from. + * + * Returns OPAL_SUCCESS, or OPAL_ERROR if the udreg cache cannot be accessed. + */ +int mca_rcache_udreg_module_init (mca_rcache_udreg_module_t *rcache) +{ + struct udreg_cache_attr cache_attr; + int urc; + + rcache->super.rcache_component = &mca_rcache_udreg_component.super; + rcache->super.rcache_register = mca_rcache_udreg_register; + rcache->super.rcache_find = mca_rcache_udreg_find; + rcache->super.rcache_deregister = mca_rcache_udreg_deregister; + /* This module relies on udreg for notification of memory release */ + rcache->super.rcache_invalidate_range = NULL; + rcache->super.rcache_finalize = mca_rcache_udreg_finalize; + + cache_attr.modes = 0; + + /* Create udreg cache */ + if (rcache->resources.use_kernel_cache) { + cache_attr.modes |= UDREG_CC_MODE_USE_KERNEL_CACHE; + } + + if (rcache->resources.use_evict_w_unreg) { + cache_attr.modes |= UDREG_CC_MODE_USE_EVICT_W_UNREG; + } + + if (mca_rcache_udreg_component.leave_pinned) { + cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG; + } + + OBJ_CONSTRUCT(&rcache->lock, opal_mutex_t); + + /* strncpy does not terminate the destination when the source fills the + * buffer, so copy one byte less and terminate explicitly */ + strncpy (cache_attr.cache_name, rcache->resources.base.cache_name, UDREG_MAX_CACHENAME_LEN - 1); + cache_attr.cache_name[UDREG_MAX_CACHENAME_LEN - 1] = '\0'; + cache_attr.max_entries = rcache->resources.max_entries; + cache_attr.debug_mode = 0; + cache_attr.debug_rank = 0; + cache_attr.reg_context = rcache; + cache_attr.dreg_context = rcache; + cache_attr.destructor_context = rcache; + cache_attr.device_reg_func = mca_rcache_udreg_reg_func; + cache_attr.device_dereg_func = mca_rcache_udreg_dereg_func; + cache_attr.destructor_callback = NULL; + + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output, + "rcache/udreg: creating udreg cache with name %s", cache_attr.cache_name); + + /* attempt to create the udreg cache. 
this will fail if one already exists */ + (void) UDREG_CacheCreate (&cache_attr); + + urc = UDREG_CacheAccess (rcache->resources.base.cache_name, (udreg_cache_handle_t *) &rcache->udreg_handle); + if (UDREG_RC_SUCCESS != urc) { + opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output, + "rcache/udreg: call to UDREG_CacheAccess failed with rc: %d", urc); + /* udreg_init frees the module on failure without calling finalize, so + * tear down the mutex constructed above before bailing out */ + OBJ_DESTRUCT(&rcache->lock); + return OPAL_ERROR; + } + + OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t); + /* NOTE(review): the result of opal_free_list_init is not checked - confirm it cannot fail here */ + opal_free_list_init (&rcache->reg_list, rcache->resources.base.sizeof_reg, + opal_cache_line_size, OBJ_CLASS(mca_rcache_base_registration_t), + 0, opal_cache_line_size, 0, -1, 32, NULL, 0, + NULL, NULL, NULL); + + return OPAL_SUCCESS; +} + +/* udreg callback functions */ +static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_context) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) reg_context; + unsigned int page_size = opal_getpagesize (); + mca_rcache_base_registration_t *udreg_reg; + opal_free_list_item_t *item; + int rc; + + item = opal_free_list_get (&rcache_udreg->reg_list); + if (NULL == item) { + return NULL; + } + + udreg_reg = (mca_rcache_base_registration_t *) item; + + udreg_reg->rcache = reg_context; + udreg_reg->base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *); + udreg_reg->bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1; + + addr = (void *) udreg_reg->base; + size = (uint64_t) (udreg_reg->bound - udreg_reg->base + 1); + + /* pull the flags and access flags out of the rcache module */ + udreg_reg->access_flags = rcache_udreg->requested_access_flags; + udreg_reg->flags = rcache_udreg->requested_flags; + + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output, + "rcache/udreg: calling underlying register function for address range {%p, %p}", + addr, (void *)((intptr_t) addr + size)); + rc = rcache_udreg->resources.base.register_mem
(rcache_udreg->resources.base.reg_data, udreg_reg->base, size, + udreg_reg); + if (OPAL_SUCCESS != rc) { + opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output, + "rcache/udreg: could not register memory. rc: %d", rc); + opal_free_list_return (&rcache_udreg->reg_list, item); + udreg_reg = NULL; + } + + return udreg_reg; +} +/* udreg deregistration callback. device_data is a registration produced by mca_rcache_udreg_reg_func and dreg_context is the module. A registration whose ref_count is still non-zero is left alone; otherwise it is deregistered through resources.base.deregister_mem and, on success, returned to the free list. Always returns 0. */ +static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) dreg_context; + mca_rcache_base_registration_t *udreg_reg = (mca_rcache_base_registration_t *) device_data; + int rc; + + if (udreg_reg->ref_count) { + /* there are still users of this registration. leave it alone */ + return 0; + } + + rc = rcache_udreg->resources.base.deregister_mem (rcache_udreg->resources.base.reg_data, udreg_reg); + + if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { + opal_free_list_return (&rcache_udreg->reg_list, + (opal_free_list_item_t *) udreg_reg); + } + /* might be worth printing out a warning if an error occurs here */ + + return 0; +} +/* Asks udreg to evict from the cache (UDREG_Evict); returns true when udreg reports success. */ +static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache; + udreg_return_t urc; + + urc = UDREG_Evict (rcache_udreg->udreg_handle); + return (UDREG_RC_SUCCESS == urc); +} + +/* + * register memory + * + * The requested flags are staged on the module under the module lock so the + * registration callback can pick them up. Unless MCA_RCACHE_FLAGS_CACHE_BYPASS + * is set the region goes through UDREG_Register; if the registration found in + * the udreg entry lacks some of the requested access flags it is replaced by a + * new one carrying the union of the old and requested flags. With the bypass + * flag the registration callback is invoked directly. Both paths evict and + * retry on failure. + * + * On success *reg is set, its ref_count is incremented and OPAL_SUCCESS is + * returned; otherwise *reg is NULL and OPAL_ERR_OUT_OF_RESOURCE is returned. + */ +static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *addr, + size_t size, uint32_t flags, int32_t access_flags, + mca_rcache_base_registration_t **reg) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache; + mca_rcache_base_registration_t *udreg_reg, *old_reg; + bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS); + udreg_entry_t *udreg_entry; + udreg_return_t urc; + + *reg = NULL; + + OPAL_THREAD_LOCK(&rcache_udreg->lock); + + /* we hold the lock so no other thread can modify these flags until the registration is complete */ + 
rcache_udreg->requested_access_flags = access_flags; + rcache_udreg->requested_flags = flags; + + if (false == bypass_cache) { + /* Get a udreg entry for this region */ + do { + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output, + "rcache/udreg: registering region {%p, %p} with udreg", addr, (void *)((intptr_t) addr + size)); + while (UDREG_RC_SUCCESS != + (urc = UDREG_Register (rcache_udreg->udreg_handle, addr, size, &udreg_entry))) { + /* try to remove one unused reg and retry */ + if (!mca_rcache_udreg_evict (rcache)) { + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output, + "rcache/udreg: could not register memory with udreg. udreg rc: %d", urc); + OPAL_THREAD_UNLOCK(&rcache_udreg->lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + + udreg_reg = (mca_rcache_base_registration_t *) udreg_entry->device_data; + + if ((udreg_reg->access_flags & access_flags) == access_flags) { + /* sufficient access */ + break; + } + + old_reg = udreg_reg; + + /* to not confuse udreg make sure the new registration covers the same address + * range as the old one. 
*/ + addr = old_reg->base; + size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base); /* bound is inclusive, so this is one byte short of the region length; the registration callback re-aligns to page boundaries, which presumably yields the same range - confirm */ + + /* make the new access flags more permissive */ + rcache_udreg->requested_access_flags = access_flags | old_reg->access_flags; + + /* get a new registration */ + udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache); + if (NULL == udreg_reg) { /* NOTE(review): the entry obtained from UDREG_Register above is not handed back on this path (no UDREG_DecrRefcount) - confirm whether a udreg reference is leaked */ + OPAL_THREAD_UNLOCK(&rcache_udreg->lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* update the device data with the new registration */ + udreg_entry->device_data = udreg_reg; + + /* ensure that mca_rcache_udreg_deregister does not call into udreg since + * we are forcefully evicting the registration here */ + old_reg->flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID; + + mca_rcache_udreg_dereg_func (old_reg, rcache); + } while (0); /* single pass: the do/while only exists so the break above can skip the re-registration */ + + udreg_reg->rcache_context = udreg_entry; + } else { + /* if cache bypass is requested don't use the udreg cache */ + while (NULL == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) { + /* try to remove one unused reg and retry */ + if (!mca_rcache_udreg_evict (rcache)) { + opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output, + "rcache/udreg: could not register memory"); + OPAL_THREAD_UNLOCK(&rcache_udreg->lock); + return OPAL_ERR_OUT_OF_RESOURCE; + } + } + udreg_reg->rcache_context = NULL; + } + + OPAL_THREAD_UNLOCK(&rcache_udreg->lock); + + *reg = udreg_reg; + ++udreg_reg->ref_count; /* NOTE(review): incremented after the lock is released and without an atomic; looks racy against concurrent register/deregister of the same registration - confirm */ + + return OPAL_SUCCESS; +} +/* Lookup is not provided by this module: always clears *reg and returns OPAL_ERR_NOT_FOUND. */ +static int mca_rcache_udreg_find (mca_rcache_base_module_t *rcache, void *addr, + size_t size, mca_rcache_base_registration_t **reg) +{ + *reg = NULL; + return OPAL_ERR_NOT_FOUND; +} +/* Drops one reference on reg. Registrations without MCA_RCACHE_FLAGS_CACHE_BYPASS give their reference back to udreg (UDREG_DecrRefcount on the entry stored in rcache_context) under the module lock; bypass registrations go straight to mca_rcache_udreg_dereg_func, which only releases them once ref_count is zero. Always returns OPAL_SUCCESS. */ +static int mca_rcache_udreg_deregister(mca_rcache_base_module_t *rcache, + mca_rcache_base_registration_t *reg) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache; + + assert(reg->ref_count > 0); + + --reg->ref_count; + + if (!(reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) { + 
OPAL_THREAD_LOCK(&rcache_udreg->lock); + UDREG_DecrRefcount (rcache_udreg->udreg_handle, reg->rcache_context); + OPAL_THREAD_UNLOCK(&rcache_udreg->lock); + } else { + mca_rcache_udreg_dereg_func (reg, rcache); + } + + return OPAL_SUCCESS; +} +/* Tears the module down: optionally prints the udreg hit/miss/evicted counters (print_stats), releases the udreg cache handle and destructs the free list and the lock. The module structure itself is not freed here. */ +static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache) +{ + mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t*)rcache; + + /* Statistic */ + if (true == mca_rcache_udreg_component.print_stats) { + uint64_t hit = 0, miss = 0, evicted = 0; + + (void) UDREG_GetStat (rcache_udreg->udreg_handle, + UDREG_STAT_CACHE_HIT, &hit); + + (void) UDREG_GetStat (rcache_udreg->udreg_handle, + UDREG_STAT_CACHE_MISS, &miss); + + (void) UDREG_GetStat (rcache_udreg->udreg_handle, + UDREG_STAT_CACHE_EVICTED, &evicted); + + opal_output(0, "%s udreg: stats (hit/miss/evicted): %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n", + OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), hit, miss, evicted); + } + + UDREG_CacheRelease (rcache_udreg->udreg_handle); + OBJ_DESTRUCT(&rcache_udreg->reg_list); + OBJ_DESTRUCT(&rcache_udreg->lock); +} diff --git a/opal/mca/rcache/vma/Makefile.am b/opal/mca/rcache/vma/Makefile.am deleted file mode 100644 index bf99c9aef1..0000000000 --- a/opal/mca/rcache/vma/Makefile.am +++ /dev/null @@ -1,52 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -sources = \ - rcache_vma.c \ - rcache_vma.h \ - rcache_vma_component.c \ - rcache_vma_tree.c \ - rcache_vma_tree.h - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if MCA_BUILD_opal_rcache_vma_DSO -lib = -lib_sources = -component = mca_rcache_vma.la -component_sources = $(sources) -else -lib = libmca_rcache_vma.la -lib_sources = $(sources) -component = -component_sources = -endif - -mcacomponentdir = $(opallibdir) -mcacomponent_LTLIBRARIES = $(component) -mca_rcache_vma_la_SOURCES = $(component_sources) -mca_rcache_vma_la_LDFLAGS = -module -avoid-version $(rcache_vma_LDFLAGS) -mca_rcache_vma_la_LIBADD = $(rcache_vma_LIBS) - -noinst_LTLIBRARIES = $(lib) -libmca_rcache_vma_la_SOURCES = $(lib_sources) -libmca_rcache_vma_la_LDFLAGS = -module -avoid-version$ $(rcache_vma_LDFLAGS) -libmca_rcache_vma_la_LIBADD = $(rcache_vma_LIBS) diff --git a/opal/mca/rcache/vma/owner.txt b/opal/mca/rcache/vma/owner.txt deleted file mode 100644 index 52961b5d12..0000000000 --- a/opal/mca/rcache/vma/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: LANL -status: maintenance diff --git a/opal/mca/rcache/vma/rcache_vma.c b/opal/mca/rcache/vma/rcache_vma.c deleted file mode 100644 index 8c9bd5e6f0..0000000000 --- a/opal/mca/rcache/vma/rcache_vma.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include MCA_memory_IMPLEMENTATION_HEADER -#include "opal/mca/memory/memory.h" -#include "opal/mca/rcache/rcache.h" -#include "rcache_vma.h" -#include "rcache_vma_tree.h" -#include "opal/mca/mpool/base/base.h" - -/** - * Initialize the rcache - */ - -void mca_rcache_vma_module_init( mca_rcache_vma_module_t* rcache ) { - rcache->base.rcache_find = mca_rcache_vma_find; - rcache->base.rcache_find_all = mca_rcache_vma_find_all; - rcache->base.rcache_insert = mca_rcache_vma_insert; - rcache->base.rcache_delete = mca_rcache_vma_delete; - rcache->base.rcache_clean = mca_rcache_vma_clean; - rcache->base.rcache_finalize = mca_rcache_vma_finalize; - rcache->base.rcache_dump_range = mca_rcache_vma_dump_range; - OBJ_CONSTRUCT(&rcache->base.lock, opal_recursive_mutex_t); - mca_rcache_vma_tree_init(rcache); -} - -int mca_rcache_vma_find(struct mca_rcache_base_module_t* rcache, - void* addr, size_t size, mca_mpool_base_registration_t **reg) -{ - int rc; - unsigned char* bound_addr; - - if(size == 0) { - return OPAL_ERROR; - } - - bound_addr = ((unsigned char *)addr) + size - 1; - - /* Check to ensure that the cache is valid */ - if (OPAL_UNLIKELY(opal_memory_changed() && - NULL != opal_memory->memoryc_process && - OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { - return rc; - } - - *reg = mca_rcache_vma_tree_find((mca_rcache_vma_module_t*)rcache, (unsigned char*)addr, - bound_addr); - - return OPAL_SUCCESS; -} - -int mca_rcache_vma_find_all(struct mca_rcache_base_module_t* rcache, - void* addr, size_t size, 
mca_mpool_base_registration_t **regs, - int reg_cnt) -{ - int rc; - unsigned char *bound_addr; - - if(size == 0) { - return OPAL_ERROR; - } - - bound_addr = ((unsigned char *)addr) + size - 1; - - /* Check to ensure that the cache is valid */ - if (OPAL_UNLIKELY(opal_memory_changed() && - NULL != opal_memory->memoryc_process && - OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { - return rc; - } - - return mca_rcache_vma_tree_find_all((mca_rcache_vma_module_t*)rcache, - (unsigned char*)addr, bound_addr, regs, - reg_cnt); -} - -int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* reg, size_t limit) -{ - int rc; - size_t reg_size = reg->bound - reg->base + 1; - mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; - - if(limit != 0 && reg_size > limit) { - /* return out of resources if request is bigger than cache size - * return temp out of resources otherwise */ - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Check to ensure that the cache is valid */ - if (OPAL_UNLIKELY(opal_memory_changed() && - NULL != opal_memory->memoryc_process && - OPAL_SUCCESS != (rc = opal_memory->memoryc_process()))) { - return rc; - } - - rc = mca_rcache_vma_tree_insert(vma_rcache, reg, limit); - if (OPAL_LIKELY(OPAL_SUCCESS == rc)) { - /* If we successfully registered, then tell the memory manager - to start monitoring this region */ - opal_memory->memoryc_register(reg->base, - (uint64_t) reg_size, (uint64_t) (uintptr_t) reg); - } - - return rc; -} - -int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* reg) -{ - mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; - /* Tell the memory manager that we no longer care about this - region */ - opal_memory->memoryc_deregister(reg->base, - (uint64_t) (reg->bound - reg->base), - (uint64_t) (uintptr_t) reg); - return mca_rcache_vma_tree_delete(vma_rcache, reg); -} - -int mca_rcache_vma_clean(struct 
mca_rcache_base_module_t* rcache) -{ - mca_rcache_vma_module_t *vma_rcache = (mca_rcache_vma_module_t*)rcache; - mca_rcache_vma_t *vma; - opal_list_item_t *i; - - do { - OPAL_THREAD_LOCK(&rcache->lock); - i = opal_list_get_first(&vma_rcache->vma_delete_list); - if(opal_list_get_end(&vma_rcache->vma_delete_list) == i) { - vma = NULL; - OPAL_THREAD_UNLOCK(&rcache->lock); - } else { - vma = (mca_rcache_vma_t *)i; - opal_list_remove_item(&vma_rcache->vma_delete_list, &vma->super); - - /* Need to drop the rcache lock before destroying the vma */ - OPAL_THREAD_UNLOCK(&rcache->lock); - - mca_rcache_vma_destroy(vma); - } - } while (NULL != vma); - return OPAL_SUCCESS; -} - -/** - * finalize - */ -void mca_rcache_vma_finalize(struct mca_rcache_base_module_t* rcache) -{ - OBJ_DESTRUCT(&rcache->lock); - mca_rcache_vma_tree_finalize((mca_rcache_vma_module_t *)rcache); - free(rcache); -} - -void mca_rcache_vma_dump_range(struct mca_rcache_base_module_t* rcache, - unsigned char *base, size_t size, char *msg) -{ - mca_rcache_vma_module_t *vma_rcache = (struct mca_rcache_vma_module_t*) rcache; - mca_rcache_vma_tree_dump_range(vma_rcache, base, size, msg); -} diff --git a/opal/mca/rcache/vma/rcache_vma.h b/opal/mca/rcache/vma/rcache_vma.h deleted file mode 100644 index 0306fc0bba..0000000000 --- a/opal/mca/rcache/vma/rcache_vma.h +++ /dev/null @@ -1,93 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/** - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. 
- * Copyright (c) 2009 IBM Corporation. All rights reserved. - * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * Description of the Registration Cache framework - */ -#ifndef MCA_RCACHE_VMA_H -#define MCA_RCACHE_VMA_H -#include "opal_config.h" -#include "opal/mca/mca.h" -#include "opal/class/opal_list.h" -#include "opal/class/opal_rb_tree.h" -#include "opal/mca/rcache/rcache.h" - -BEGIN_C_DECLS - -struct mca_rcache_vma_module_t { - mca_rcache_base_module_t base; - opal_rb_tree_t rb_tree; - opal_list_t vma_list; - opal_list_t vma_delete_list; - size_t reg_cur_cache_size; -}; -typedef struct mca_rcache_vma_module_t mca_rcache_vma_module_t; - - -struct mca_rcache_vma_component_t { - mca_rcache_base_component_t super; -}; -typedef struct mca_rcache_vma_component_t mca_rcache_vma_component_t; - -OPAL_DECLSPEC extern mca_rcache_vma_component_t mca_rcache_vma_component; - - - -void mca_rcache_vma_module_init(mca_rcache_vma_module_t* rcache); - -int mca_rcache_vma_find(mca_rcache_base_module_t* rcache, void* addr, - size_t size, mca_mpool_base_registration_t **reg); - -int mca_rcache_vma_find_all(mca_rcache_base_module_t* rcache, void* addr, - size_t size, mca_mpool_base_registration_t **regs, int reg_cnt); - -int mca_rcache_vma_insert(struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration, size_t limit); - -int mca_rcache_vma_delete(struct mca_rcache_base_module_t* rcache, - mca_mpool_base_registration_t* registration); - -/* It is not safe to call mca_rcache_vma_clean with the rcache lock held */ -int mca_rcache_vma_clean(struct mca_rcache_base_module_t* rcache); -/* Destroy vma objects which are on the deferred delete list. These were placed - on the list earlier when the rcache lock was held and it was not safe to - destory them. 
They should not be linked into any other structure anymore except - the vma_list_delete list */ - -/** - * init/finalize - */ - -void mca_rcache_vma_module_init(mca_rcache_vma_module_t *rcache); - -void mca_rcache_vma_finalize(struct mca_rcache_base_module_t*); - -void mca_rcache_vma_dump_range(struct mca_rcache_base_module_t *rcache, - unsigned char* addr, size_t size, char *msg); - - -END_C_DECLS - -#endif /* MCA_RCACHE_VMA_H */ - - diff --git a/opal/mca/rcache/vma/rcache_vma_component.c b/opal/mca/rcache/vma/rcache_vma_component.c deleted file mode 100644 index c1dcd65ed1..0000000000 --- a/opal/mca/rcache/vma/rcache_vma_component.c +++ /dev/null @@ -1,57 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2006-2007 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. 
- * - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/mca/rcache/rcache.h" -#include "rcache_vma.h" - -static int mca_rcache_vma_component_open(void); - -static mca_rcache_base_module_t* mca_rcache_vma_component_init( void ); - -mca_rcache_vma_component_t mca_rcache_vma_component = { - { - .rcache_version = { - MCA_RCACHE_BASE_VERSION_2_0_0, - - .mca_component_name = "vma", - MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION, - OPAL_RELEASE_VERSION), - .mca_open_component = mca_rcache_vma_component_open, - }, - .rcache_data = { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - }, - .rcache_init = mca_rcache_vma_component_init, - } -}; - - -static int mca_rcache_vma_component_open(void) -{ - return OPAL_SUCCESS; -} - -static mca_rcache_base_module_t* mca_rcache_vma_component_init(void) { - mca_rcache_vma_module_t* rcache; - - rcache = (mca_rcache_vma_module_t*) malloc(sizeof(mca_rcache_vma_module_t)); - mca_rcache_vma_module_init(rcache); - - return &rcache->base; -} diff --git a/opal/mca/rcache/vma/rcache_vma_tree.c b/opal/mca/rcache/vma/rcache_vma_tree.c deleted file mode 100644 index 1c1d765bd3..0000000000 --- a/opal/mca/rcache/vma/rcache_vma_tree.c +++ /dev/null @@ -1,600 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. 
- * Copyright (c) 2007 Mellanox Technologies. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2015 Research Organization for Information Science - * and Technology (RIST). All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * Description of the Registration Cache framework - */ - -#include "opal_config.h" - -#include "opal/mca/mca.h" -#include "opal/util/output.h" -#include "rcache_vma_tree.h" - -OBJ_CLASS_INSTANCE(mca_rcache_vma_reg_list_item_t, opal_list_item_t, NULL, NULL); - -static void mca_rcache_vma_construct(opal_object_t *object) -{ - mca_rcache_vma_t *vma = (mca_rcache_vma_t*)object; - OBJ_CONSTRUCT(&vma->reg_list, opal_list_t); - OBJ_CONSTRUCT(&vma->reg_delete_list, opal_list_t); -} - -static void mca_rcache_vma_destruct(opal_object_t *object) -{ - mca_rcache_vma_t *vma = (mca_rcache_vma_t*)object; - OBJ_DESTRUCT(&vma->reg_list); - OBJ_DESTRUCT(&vma->reg_delete_list); -} - -OBJ_CLASS_INSTANCE(mca_rcache_vma_t, opal_list_item_t, - mca_rcache_vma_construct, mca_rcache_vma_destruct); - - -/** - * Function for the red black tree to compare 2 keys - * - * @param key1 a pointer to the 1st key - * @param key2 a pointer to the second key - * - * @retval -1 if key1 is below key2 - * @retval 1 if key 1 is above key2 - * @retval 0 if the keys are the same - */ - -static int mca_rcache_vma_tree_node_compare(void *key1, void *key2) -{ - mca_rcache_vma_t *vma1 = (mca_rcache_vma_t*)key1, - *vma2 = (mca_rcache_vma_t*)key2; - - if(vma1->start < vma2->start) - return -1; - if(vma1->start > vma2->start) - return 1; - - return 0; -} - -static int mca_rcache_vma_tree_node_compare_search(void *key1, void *key2) -{ - mca_rcache_vma_t *vma = 
(mca_rcache_vma_t*)key2; - uintptr_t addr = (uintptr_t)key1; - - if(vma->end < addr) - return 1; - if(vma->start <= addr) - return 0; - - return -1; -} - -static int mca_rcache_vma_tree_node_compare_closest(void *key1, void *key2) -{ - mca_rcache_vma_t *vma = (mca_rcache_vma_t*)key2, *prev_vma; - uintptr_t addr = (uintptr_t)key1; - - if(vma->end < addr) - return 1; - if(vma->start <= addr) - return 0; - prev_vma = (mca_rcache_vma_t *)opal_list_get_prev(&vma->super.super); - if(prev_vma == (mca_rcache_vma_t *)opal_list_get_end(&vma->rcache->vma_list) - || prev_vma->end < addr) - return 0; - - return -1; -} - -static inline mca_rcache_vma_t *mca_rcache_vma_new( - mca_rcache_vma_module_t *vma_rcache, uintptr_t start, uintptr_t end) -{ - mca_rcache_vma_t *vma = OBJ_NEW(mca_rcache_vma_t); - - if(NULL == vma) - return NULL; - - vma->start = start; - vma->end = end; - vma->rcache = vma_rcache; - - (void)opal_rb_tree_insert(&vma_rcache->rb_tree, vma, vma); - - return vma; -} - -void mca_rcache_vma_destroy(mca_rcache_vma_t *vma) -{ - opal_list_item_t *item; - - while ((item = opal_list_remove_first(&vma->reg_list))) - OBJ_RELEASE(item); - - while ((item = opal_list_remove_first(&vma->reg_delete_list))) - OBJ_RELEASE(item); - - OBJ_RELEASE(vma); -} - -static inline int mca_rcache_vma_compare_regs( - mca_mpool_base_registration_t *reg1, - mca_mpool_base_registration_t *reg2) -{ - /* persisten registration are on top */ - if((reg1->flags & MCA_MPOOL_FLAGS_PERSIST) && - !(reg2->flags & MCA_MPOOL_FLAGS_PERSIST)) - return 1; - - if(!(reg1->flags & MCA_MPOOL_FLAGS_PERSIST) && - (reg2->flags & MCA_MPOOL_FLAGS_PERSIST)) - return -1; - - if (reg1->bound != reg2->bound) - return (int)(reg1->bound - reg2->bound); - - /* tie breaker */ - return (int)((uintptr_t)reg1 - (uintptr_t)reg2); -} - -static inline int mca_rcache_vma_add_reg(mca_rcache_vma_t *vma, - mca_mpool_base_registration_t *reg) -{ - opal_list_item_t *i; - mca_rcache_vma_reg_list_item_t *item, *entry; - - entry = 
OBJ_NEW(mca_rcache_vma_reg_list_item_t); - - if(!entry) - return -1; - - entry->reg = reg; - - for(i = opal_list_get_first(&vma->reg_list); - i != opal_list_get_end(&vma->reg_list); - i = opal_list_get_next(i)) { - item = (mca_rcache_vma_reg_list_item_t*)i; - - if(mca_rcache_vma_compare_regs(item->reg, reg) > 0) - continue; - - opal_list_insert_pos(&vma->reg_list, &item->super, &entry->super); - return 0; - } - opal_list_append(&vma->reg_list, &entry->super); - return 0; -} - -static inline void mca_rcache_vma_remove_reg(mca_rcache_vma_t *vma, - mca_mpool_base_registration_t *reg) -{ - opal_list_item_t *i; - mca_rcache_vma_reg_list_item_t *item; - - for(i = opal_list_get_first(&vma->reg_list); - i != opal_list_get_end(&vma->reg_list); - i = opal_list_get_next(i)) { - item = (mca_rcache_vma_reg_list_item_t*)i; - - if(item->reg == reg) { - opal_list_remove_item(&vma->reg_list, &item->super); - opal_list_append(&vma->reg_delete_list, &item->super); - break; - } - } -} - -static inline int mca_rcache_vma_copy_reg_list(mca_rcache_vma_t *to, - mca_rcache_vma_t *from) -{ - opal_list_item_t *i; - mca_rcache_vma_reg_list_item_t *item_f, *item_t; - for(i = opal_list_get_first(&from->reg_list); - i != opal_list_get_end(&from->reg_list); - i = opal_list_get_next(i)) { - item_f = (mca_rcache_vma_reg_list_item_t*)i; - item_t = OBJ_NEW(mca_rcache_vma_reg_list_item_t); - - if(NULL == item_t) - return 0; - - item_t->reg = item_f->reg; - - opal_list_append(&to->reg_list, &item_t->super); - } - - return OPAL_SUCCESS; -} - -/* returns 1 iff two lists contain the same entries */ -static inline int mca_rcache_vma_compare_reg_lists(mca_rcache_vma_t *vma1, - mca_rcache_vma_t *vma2) -{ - mca_rcache_vma_reg_list_item_t *i1, *i2; - - if (!vma1 || !vma2) - return 0; - - if(opal_list_get_size(&vma1->reg_list) != - opal_list_get_size(&vma2->reg_list)) - return 0; - - i1 = (mca_rcache_vma_reg_list_item_t*)opal_list_get_first(&vma1->reg_list); - i2 = 
(mca_rcache_vma_reg_list_item_t*)opal_list_get_first(&vma2->reg_list); - - do { - if(i1 == (mca_rcache_vma_reg_list_item_t*)opal_list_get_end(&vma1->reg_list) || - i2 == (mca_rcache_vma_reg_list_item_t*)opal_list_get_end(&vma2->reg_list)) - return 1; - - if(i1->reg != i2->reg) - break; - - i1 = (mca_rcache_vma_reg_list_item_t*)opal_list_get_next(i1); - i2 = (mca_rcache_vma_reg_list_item_t*)opal_list_get_next(i2); - } while(1); - - return 0; -} - -int mca_rcache_vma_tree_init(mca_rcache_vma_module_t* rcache) -{ - OBJ_CONSTRUCT(&rcache->rb_tree, opal_rb_tree_t); - OBJ_CONSTRUCT(&rcache->vma_list, opal_list_t); - OBJ_CONSTRUCT(&rcache->vma_delete_list, opal_list_t); - rcache->reg_cur_cache_size = 0; - return opal_rb_tree_init(&rcache->rb_tree, - mca_rcache_vma_tree_node_compare); -} - -void mca_rcache_vma_tree_finalize(mca_rcache_vma_module_t* rcache) -{ - opal_rb_tree_init(&rcache->rb_tree, - mca_rcache_vma_tree_node_compare); - OBJ_DESTRUCT(&rcache->vma_delete_list); - OBJ_DESTRUCT(&rcache->vma_list); - OBJ_DESTRUCT(&rcache->rb_tree); -} - -mca_mpool_base_registration_t *mca_rcache_vma_tree_find( - mca_rcache_vma_module_t* vma_rcache, unsigned char *base, - unsigned char *bound) -{ - mca_rcache_vma_t *vma; - mca_rcache_vma_reg_list_item_t *item; - - vma = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, base, - mca_rcache_vma_tree_node_compare_search); - - if(!vma) - return NULL; - - for(item = (mca_rcache_vma_reg_list_item_t*) - opal_list_get_first(&vma->reg_list); - item != (mca_rcache_vma_reg_list_item_t*) - opal_list_get_end(&vma->reg_list); - item = (mca_rcache_vma_reg_list_item_t*) - opal_list_get_next(item)) { - if(item->reg->flags & MCA_MPOOL_FLAGS_INVALID) - continue; - if(item->reg->bound >= bound) - return item->reg; - if(!(item->reg->flags & MCA_MPOOL_FLAGS_PERSIST)) - break; - } - - return NULL; -} - -static inline bool is_reg_in_array(mca_mpool_base_registration_t **regs, - int cnt, mca_mpool_base_registration_t *p) -{ - int i; - - for(i 
= 0; i < cnt; i++) { - if(regs[i] == p) - return true; - } - - return false; -} - -int mca_rcache_vma_tree_find_all( - mca_rcache_vma_module_t *vma_rcache, unsigned char *base, - unsigned char *bound, mca_mpool_base_registration_t **regs, - int reg_cnt) -{ - int cnt = 0; - - if(opal_list_get_size(&vma_rcache->vma_list) == 0) - return cnt; - - do { - mca_rcache_vma_t *vma; - opal_list_item_t *item; - vma = (mca_rcache_vma_t*) - opal_rb_tree_find_with(&vma_rcache->rb_tree, base, - mca_rcache_vma_tree_node_compare_closest); - - if(NULL == vma) { - /* base is bigger than any registered memory */ - break; - } - - if(base < (unsigned char*)vma->start) { - base = (unsigned char*)vma->start; - continue; - } - - for(item = opal_list_get_first(&vma->reg_list); - item != opal_list_get_end(&vma->reg_list); - item = opal_list_get_next(item)) { - mca_rcache_vma_reg_list_item_t *vma_item; - vma_item = (mca_rcache_vma_reg_list_item_t*)item; - if((vma_item->reg->flags & MCA_MPOOL_FLAGS_INVALID) || - is_reg_in_array(regs, cnt, vma_item->reg)) { - continue; - } - regs[cnt++] = vma_item->reg; - if(cnt == reg_cnt) - return cnt; /* no space left in the provided array */ - } - - base = (unsigned char *)vma->end + 1; - } while(bound >= base); - - return cnt; -} - -static inline int mca_rcache_vma_can_insert( - mca_rcache_vma_module_t *vma_rcache, size_t nbytes, size_t limit) -{ - if(0 == limit) - return 1; - - if(vma_rcache->reg_cur_cache_size + nbytes <= limit) - return 1; - - return 0; -} - -static inline void mca_rcache_vma_update_byte_count( - mca_rcache_vma_module_t* vma_rcache, - size_t nbytes) -{ - vma_rcache->reg_cur_cache_size += nbytes; -} - -int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* vma_rcache, - mca_mpool_base_registration_t* reg, size_t limit) -{ - mca_rcache_vma_t *i; - uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound; - - i = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, - (void*)begin, 
mca_rcache_vma_tree_node_compare_closest); - - if(!i) - i = (mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list); - - while (begin <= end) { - mca_rcache_vma_t *vma; - - if((mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list) == i) { - vma = NULL; - if(mca_rcache_vma_can_insert(vma_rcache, end - begin + 1, limit)) - vma = mca_rcache_vma_new(vma_rcache, begin, end); - - if(!vma) - goto remove; - - mca_rcache_vma_update_byte_count(vma_rcache, end - begin + 1); - - opal_list_append(&vma_rcache->vma_list, &vma->super); - begin = vma->end + 1; - mca_rcache_vma_add_reg(vma, reg); - } else if(i->start > begin) { - uintptr_t tend = (i->start <= end)?(i->start - 1):end; - vma = NULL; - if(mca_rcache_vma_can_insert(vma_rcache, tend - begin + 1, limit)) - vma = mca_rcache_vma_new(vma_rcache, begin, tend); - - if(!vma) - goto remove; - - mca_rcache_vma_update_byte_count(vma_rcache, tend - begin + 1); - - /* insert before */ - opal_list_insert_pos(&vma_rcache->vma_list, &i->super, &vma->super); - i = vma; - begin = vma->end + 1; - mca_rcache_vma_add_reg(vma, reg); - } else if(i->start == begin) { - if (i->end > end) { - vma = mca_rcache_vma_new(vma_rcache, end+1, i->end); - if(!vma) - goto remove; - - i->end = end; - - mca_rcache_vma_copy_reg_list(vma, i); - - /* add after */ - opal_list_insert_pos(&vma_rcache->vma_list, - opal_list_get_next(&i->super), - &vma->super); - mca_rcache_vma_add_reg(i, reg); - begin = end + 1; - } else { - mca_rcache_vma_add_reg(i, reg); - begin = i->end + 1; - } - } else { - vma = mca_rcache_vma_new(vma_rcache, begin, i->end); - - if(!vma) - goto remove; - - i->end = begin - 1; - - mca_rcache_vma_copy_reg_list(vma, i); - - /* add after */ - opal_list_insert_pos(&vma_rcache->vma_list, - opal_list_get_next(&i->super), - &vma->super); - } - - i = (mca_rcache_vma_t*)opal_list_get_next(&i->super); - } - - return OPAL_SUCCESS; - -remove: - mca_rcache_vma_tree_delete(vma_rcache, reg); - return OPAL_ERR_TEMP_OUT_OF_RESOURCE; -} - -/** - * 
Function to remove previously memory from the tree without freeing it - * - * @param base pointer to the memory to free - * - * @retval OPAL_SUCCESS - * @retval OPAL_ERR_BAD_PARAM if the passed base pointer was invalid - */ -int mca_rcache_vma_tree_delete(mca_rcache_vma_module_t* vma_rcache, - mca_mpool_base_registration_t* reg) -{ - mca_rcache_vma_t *vma; - - vma = (mca_rcache_vma_t*)opal_rb_tree_find_with(&vma_rcache->rb_tree, reg->base, - mca_rcache_vma_tree_node_compare_search); - - if(!vma) - return OPAL_ERROR; - - while(vma != (mca_rcache_vma_t*)opal_list_get_end(&vma_rcache->vma_list) - && vma->start <= (uintptr_t)reg->bound) { - mca_rcache_vma_remove_reg(vma, reg); - - if(opal_list_is_empty(&vma->reg_list)) { - mca_rcache_vma_t *next = (mca_rcache_vma_t*)opal_list_get_next(&vma->super); - opal_rb_tree_delete(&vma_rcache->rb_tree, vma); - mca_rcache_vma_update_byte_count(vma_rcache, - vma->start - vma->end - 1); - opal_list_remove_item(&vma_rcache->vma_list, &vma->super); - opal_list_append(&vma_rcache->vma_delete_list, &vma->super); - vma = next; - } else { - int merged; - - do { - mca_rcache_vma_t *prev = NULL, *next = NULL; - if(opal_list_get_begin(&vma_rcache->vma_list) != - opal_list_get_prev(vma)) - prev = (mca_rcache_vma_t*)opal_list_get_prev(vma); - merged = 0; - - if(prev && vma->start == prev->end + 1 && - mca_rcache_vma_compare_reg_lists(vma, prev)) { - prev->end = vma->end; - opal_list_remove_item(&vma_rcache->vma_list, &vma->super); - opal_rb_tree_delete(&vma_rcache->rb_tree, vma); - opal_list_append(&vma_rcache->vma_delete_list, &vma->super); - vma = prev; - merged = 1; - } - if(opal_list_get_end(&vma_rcache->vma_list) != - opal_list_get_next(vma)) - next = (mca_rcache_vma_t*)opal_list_get_next(vma); - - if(next && vma->end + 1 == next->start && - mca_rcache_vma_compare_reg_lists(vma, next)) { - vma->end = next->end; - opal_list_remove_item(&vma_rcache->vma_list, &next->super); - opal_rb_tree_delete(&vma_rcache->rb_tree, next); - 
opal_list_append(&vma_rcache->vma_delete_list, &next->super); - merged = 1; - } - } while(merged); - vma = (mca_rcache_vma_t*)opal_list_get_next(vma); - } - } - return 0; -} - -/* Dump out rcache entries within a range of memory. Useful for debugging. */ -void mca_rcache_vma_tree_dump_range(mca_rcache_vma_module_t *vma_rcache, - unsigned char *base, size_t size, char *msg) -{ - unsigned char * bound = base + size -1; - mca_mpool_base_registration_t *reg; - - if (NULL == msg) { - msg = ""; - } - - opal_output(0, "Dumping rcache entries: %s", msg); - - if(opal_list_is_empty(&vma_rcache->vma_list)) { - opal_output(0, " rcache is empty"); - return; - } - - do { - mca_rcache_vma_t *vma; - opal_list_item_t *item; - vma = (mca_rcache_vma_t*) - opal_rb_tree_find_with(&vma_rcache->rb_tree, base, - mca_rcache_vma_tree_node_compare_closest); - - if(NULL == vma) { - /* base is bigger than any registered memory */ - break; - } - - if(base < (unsigned char*)vma->start) { - base = (unsigned char*)vma->start; - continue; - } - - opal_output(0, " vma: base=%p, bound=%p, size=%lu, number of registrations=%d", - (void *)vma->start, (void *)vma->end, vma->end - vma->start + 1, - (int)opal_list_get_size(&vma->reg_list)); - for(item = opal_list_get_first(&vma->reg_list); - item != opal_list_get_end(&vma->reg_list); - item = opal_list_get_next(item)) { - mca_rcache_vma_reg_list_item_t *vma_item; - vma_item = (mca_rcache_vma_reg_list_item_t*)item; - reg = vma_item->reg; - opal_output(0, " reg: base=%p, bound=%p, alloc_base=%p, ref_count=%d, flags=0x%x", - reg->base, reg->bound, reg->alloc_base, reg->ref_count, reg->flags); - } - base = (unsigned char *)vma->end + 1; - } while(bound >= base); -} diff --git a/opal/mca/rcache/vma/rcache_vma_tree.h b/opal/mca/rcache/vma/rcache_vma_tree.h deleted file mode 100644 index 77884eba2f..0000000000 --- a/opal/mca/rcache/vma/rcache_vma_tree.h +++ /dev/null @@ -1,116 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/** - * Copyright (c) 2004-2005 
The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2007 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * - * Copyright (c) 2006 Voltaire. All rights reserved. - * Copyright (c) 2009 IBM Corporation. All rights reserved. - * - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** - * @file - * Description of the Registration Cache framework - */ -#ifndef MCA_RCACHE_VMA_TREE_H -#define MCA_RCACHE_VMA_TREE_H -#include "opal/mca/mca.h" -#include "opal/mca/mpool/mpool.h" -#include "rcache_vma.h" -/* - * Data structures for the tree of allocated memory - */ - -struct mca_rcache_vma_reg_list_item_t -{ - opal_list_item_t super; - mca_mpool_base_registration_t *reg; -}; -typedef struct mca_rcache_vma_reg_list_item_t mca_rcache_vma_reg_list_item_t; -OBJ_CLASS_DECLARATION(mca_rcache_vma_reg_list_item_t); - -/** - * The item in the vma_tree itself - */ -struct mca_rcache_vma_t -{ - opal_list_item_t super; /**< the parent class */ - uintptr_t start; /**< the base of the memory range */ - uintptr_t end; /**< the bound of the memory range */ - opal_list_t reg_list; /**< list of regs on this vma */ - opal_list_t reg_delete_list; /**< delayed deletions list for regs on this vma */ - mca_rcache_vma_module_t *rcache; /**< pointer to rcache vma belongs to */ -}; -typedef struct mca_rcache_vma_t mca_rcache_vma_t; - -OBJ_CLASS_DECLARATION(mca_rcache_vma_t); - - -/* - * initialize the vma tree - */ -int mca_rcache_vma_tree_init(mca_rcache_vma_module_t* rcache); - -/* - * clean up the vma tree - */ -void 
mca_rcache_vma_tree_finalize(mca_rcache_vma_module_t* rcache); - -/** - * Returns the item in the vma tree - */ -mca_mpool_base_registration_t* mca_rcache_vma_tree_find( - mca_rcache_vma_module_t* rcache, - unsigned char* base, - unsigned char *bound - ); -/** - * Returns all registration that overlaps given memory region - */ -int mca_rcache_vma_tree_find_all( - mca_rcache_vma_module_t *vma_rcache, unsigned char *base, - unsigned char *bound, mca_mpool_base_registration_t **regs, - int reg_cnt); - -/* - * insert an item in the vma tree - */ -int mca_rcache_vma_tree_insert(mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg, size_t limit); - -/* - * remove an item from the vma tree - */ -int mca_rcache_vma_tree_delete( - mca_rcache_vma_module_t* rcache, - mca_mpool_base_registration_t* reg - ); - -/* - * Destroy a vma - * Do not call this function with rcache lock as it can deadlock - */ -void mca_rcache_vma_destroy(mca_rcache_vma_t *vma); - -/* - * Dump out the contents of the rcache for debugging. - */ -void mca_rcache_vma_tree_dump_range(mca_rcache_vma_module_t *vma_rcache, - unsigned char *base, size_t size, char *msg); - - -#endif /* MCA_RCACHE_VMA_TREE_H */ -