btl/vader: reduce memory footprint when using xpmem
The vader btl kept a per-peer registration cache to keep track of
attachments. This is not really a problem with small numbers of local
ranks but can be a problem on large SMP machines. To reduce the
footprint there is now one registration cache for all xpmem
attachments. This will probably increase the lookup time for large
transfers but is a worthwhile trade-off.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
parent 8cc3f288c9
commit a652a193ea
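To make the diff below easier to follow: previously each endpoint owned a private mca_rcache_base_vma_module_t, so every process carried one attachment cache per local peer. After this change a single cache lives in mca_btl_vader_component, and each registration is tagged with the rank of the peer it belongs to through reg->alloc_base; the lookup callback (vader_check_reg) and the per-endpoint teardown (mca_btl_vader_endpoint_xpmem_rcache_cleanup) both filter on that tag. The toy program below is a minimal sketch of that pattern under simplified assumptions (a flat array instead of a VMA tree, no reference counting, no xpmem); every toy_* name is invented for illustration and none of this is OMPI code.

/*
 * Toy illustration (NOT OMPI code): one process-wide attachment cache,
 * each entry tagged by the peer rank that owns it.  The real code uses
 * mca_rcache_base_vma_* and carries the tag in reg->alloc_base.
 */
#include <stdint.h>
#include <stdio.h>

#define TOY_CACHE_MAX 64

typedef struct {
    int       peer;   /* owning peer rank (reg->alloc_base in the diff) */
    uintptr_t base;   /* first byte covered by the attachment */
    uintptr_t bound;  /* one past the last byte covered */
    int       in_use; /* slot occupied */
} toy_reg_t;

static toy_reg_t toy_cache[TOY_CACHE_MAX];   /* one cache for ALL peers */

/* look up an attachment covering [base, bound) for a given peer only */
static toy_reg_t *toy_cache_find (int peer, uintptr_t base, uintptr_t bound)
{
    for (int i = 0 ; i < TOY_CACHE_MAX ; ++i) {
        toy_reg_t *reg = &toy_cache[i];
        /* the peer tag is what keeps one shared cache correct: an
         * attachment of peer A's memory must never satisfy a lookup
         * for peer B, even if the address ranges coincide */
        if (reg->in_use && reg->peer == peer &&
            base >= reg->base && bound <= reg->bound) {
            return reg;
        }
    }
    return NULL;
}

static int toy_cache_insert (int peer, uintptr_t base, uintptr_t bound)
{
    for (int i = 0 ; i < TOY_CACHE_MAX ; ++i) {
        if (!toy_cache[i].in_use) {
            toy_cache[i].peer   = peer;
            toy_cache[i].base   = base;
            toy_cache[i].bound  = bound;
            toy_cache[i].in_use = 1;
            return 0;
        }
    }
    return -1;
}

/* endpoint teardown: drop only the entries tagged with this peer,
 * mirroring mca_btl_vader_xpmem_cleanup_endpoint() in the diff */
static void toy_cache_cleanup_peer (int peer)
{
    for (int i = 0 ; i < TOY_CACHE_MAX ; ++i) {
        if (toy_cache[i].in_use && toy_cache[i].peer == peer) {
            toy_cache[i].in_use = 0;
        }
    }
}

int main (void)
{
    toy_cache_insert (1, 0x1000, 0x2000);
    toy_cache_insert (2, 0x1000, 0x2000);   /* same range, different peer */

    printf ("peer 1 hit: %s\n", toy_cache_find (1, 0x1400, 0x1800) ? "yes" : "no");
    toy_cache_cleanup_peer (1);
    printf ("peer 1 hit after cleanup: %s\n",
            toy_cache_find (1, 0x1400, 0x1800) ? "yes" : "no");
    printf ("peer 2 hit after peer 1 cleanup: %s\n",
            toy_cache_find (2, 0x1400, 0x1800) ? "yes" : "no");
    return 0;
}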
@@ -48,6 +48,7 @@
 #include "opal/sys/atomic.h"
 #include "opal/mca/btl/btl.h"
 #include "opal/mca/rcache/rcache.h"
+#include "opal/mca/rcache/base/rcache_base_vma.h"
 #include "opal/mca/btl/base/base.h"
 #include "opal/mca/rcache/rcache.h"
 #include "opal/mca/rcache/base/base.h"
@@ -103,6 +104,7 @@ struct mca_btl_vader_component_t {
     int vader_free_list_inc;                    /**< number of elements to alloc when growing free lists */
 #if OPAL_BTL_VADER_HAVE_XPMEM
     xpmem_segid_t my_seg_id;                    /**< this rank's xpmem segment id */
+    mca_rcache_base_vma_module_t *vma_module;   /**< registration cache for xpmem segments */
 #endif
     opal_shmem_ds_t seg_ds;                     /**< this rank's shared memory segment (when not using xpmem) */

@@ -28,7 +28,6 @@

 #include "opal_config.h"
 #include "btl_vader_xpmem.h"
-#include "opal/mca/rcache/base/rcache_base_vma.h"

 #define MCA_BTL_VADER_FBOX_ALIGNMENT      32
 #define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1)
@@ -75,7 +74,6 @@ typedef struct mca_btl_base_endpoint_t {
     union {
 #if OPAL_BTL_VADER_HAVE_XPMEM
         struct {
-            mca_rcache_base_vma_module_t *vma_module;
             xpmem_apid_t apid;      /**< xpmem apid for remote peer */
         } xpmem;
 #endif

@@ -145,6 +145,12 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
     /* set flag indicating btl has been inited */
     vader_btl->btl_inited = true;

+#if OPAL_BTL_VADER_HAVE_XPMEM
+    if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
+        mca_btl_vader_component.vma_module = mca_rcache_base_vma_module_alloc ();
+    }
+#endif
+
     return OPAL_SUCCESS;
 }

@@ -171,7 +177,6 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
     if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
         /* always use xpmem if it is available */
         ep->segment_data.xpmem.apid = xpmem_get (modex->xpmem.seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
-        ep->segment_data.xpmem.vma_module = mca_rcache_base_vma_module_alloc ();
         (void) vader_get_registation (ep, modex->xpmem.segment_base, mca_btl_vader_component.segment_size,
                                       MCA_RCACHE_FLAGS_PERSIST, (void **) &ep->segment_base);
     } else {
@@ -354,6 +359,12 @@ static int vader_finalize(struct mca_btl_base_module_t *btl)
         opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds);
     }

+#if OPAL_BTL_VADER_HAVE_XPMEM
+    if (NULL != mca_btl_vader_component.vma_module) {
+        OBJ_RELEASE(mca_btl_vader_component.vma_module);
+    }
+#endif
+
     return OPAL_SUCCESS;
 }

@@ -540,14 +551,6 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
 }

-#if OPAL_BTL_VADER_HAVE_XPMEM
-static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
-{
-    mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
-    /* otherwise dereg will fail on assert */
-    reg->ref_count = 0;
-    (void) mca_rcache_base_vma_delete (vma_module, reg);
-    return OPAL_SUCCESS;
-}
-#endif
-
 static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
@@ -557,19 +560,7 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)

 #if OPAL_BTL_VADER_HAVE_XPMEM
     if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
-        if (ep->segment_data.xpmem.vma_module) {
-            /* clean out the registration cache */
-            (void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
-                                                NULL, (size_t) -1,
-                                                mca_btl_vader_endpoint_rcache_cleanup,
-                                                (void *) ep->segment_data.xpmem.vma_module);
-            OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
-        }
-
-        if (ep->segment_base) {
-            xpmem_release (ep->segment_data.xpmem.apid);
-            ep->segment_data.xpmem.apid = 0;
-        }
+        mca_btl_vader_xpmem_cleanup_endpoint (ep);
     } else
 #endif
     if (ep->segment_data.other.seg_ds) {

@@ -32,113 +32,121 @@ int mca_btl_vader_xpmem_init (void)
     return OPAL_SUCCESS;
 }

+struct vader_check_reg_ctx_t {
+    mca_rcache_base_vma_module_t *vma_module;
+    mca_btl_base_endpoint_t *ep;
+    mca_rcache_base_registration_t **reg;
+    uintptr_t base;
+    uintptr_t bound;
+};
+typedef struct vader_check_reg_ctx_t vader_check_reg_ctx_t;
+
+static int vader_check_reg (mca_rcache_base_registration_t *reg, void *ctx)
+{
+    vader_check_reg_ctx_t *vader_ctx = (vader_check_reg_ctx_t *) ctx;
+
+    if ((intptr_t) reg->alloc_base != vader_ctx->ep->peer_smp_rank ||
+        (reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
+        /* ignore this registration */
+        return OPAL_SUCCESS;
+    }
+
+    vader_ctx->reg[0] = reg;
+
+    if (vader_ctx->bound <= (uintptr_t) reg->bound && vader_ctx->base >= (uintptr_t) reg->base) {
+        (void)opal_atomic_add (&reg->ref_count, 1);
+        return 1;
+    }
+
+    /* remove this pointer from the rcache and decrement its reference count
+       (so it is detached later) */
+    mca_rcache_base_vma_delete (vader_ctx->vma_module, reg);
+
+    return 2;
+}
+
 /* look up the remote pointer in the peer rcache and attach if
  * necessary */
 mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr,
                                                        size_t size, int flags, void **local_ptr)
 {
-    mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
-    mca_rcache_base_registration_t *regs[10], *reg = NULL;
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
+    uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
+    mca_rcache_base_registration_t *reg = NULL;
+    vader_check_reg_ctx_t check_ctx = {.ep = ep, .reg = &reg, .vma_module = vma_module};
     xpmem_addr_t xpmem_addr;
     uintptr_t base, bound;
-    uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
     int rc, i;

-    /* protect rcache access */
-    OPAL_THREAD_LOCK(&ep->lock);
-
     /* use btl/self for self communication */
     assert (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK);

     base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t);
     bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1;
     if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
         bound = VADER_MAX_ADDRESS;
     }

+    check_ctx.base = base;
+    check_ctx.bound = bound;
+
     /* several segments may match the base pointer */
-    rc = mca_rcache_base_vma_find_all (vma_module, (void *) base, bound - base, regs, 10);
-    for (i = 0 ; i < rc ; ++i) {
-        if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
-            (void)opal_atomic_add (&regs[i]->ref_count, 1);
-            reg = regs[i];
-            goto reg_found;
-        }
-
-        if (regs[i]->flags & MCA_RCACHE_FLAGS_PERSIST) {
-            continue;
-        }
-
-        /* remove this pointer from the rcache and decrement its reference count
-           (so it is detached later) */
-        rc = mca_rcache_base_vma_delete (vma_module, regs[i]);
-        if (OPAL_UNLIKELY(0 != rc)) {
-            /* someone beat us to it? */
-            break;
-        }
-
+    rc = mca_rcache_base_vma_iterate (vma_module, (void *) base, bound - base, vader_check_reg, &check_ctx);
+    if (2 == rc) {
         /* start the new segment from the lower of the two bases */
-        base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
+        base = (uintptr_t) reg->base < base ? (uintptr_t) reg->base : base;

-        (void)opal_atomic_add (&regs[i]->ref_count, -1);
-
-        if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
+        if (OPAL_LIKELY(0 == opal_atomic_add_32 (&reg->ref_count, -1))) {
             /* this pointer is not in use */
-            (void) xpmem_detach (regs[i]->rcache_context);
-            OBJ_RELEASE(regs[i]);
+            (void) xpmem_detach (reg->rcache_context);
+            OBJ_RELEASE(reg);
         }

-        break;
+        reg = NULL;
     }

-    reg = OBJ_NEW(mca_rcache_base_registration_t);
-    if (OPAL_LIKELY(NULL != reg)) {
-        /* stick around for awhile */
-        reg->ref_count = 2;
-        reg->base = (unsigned char *) base;
-        reg->bound = (unsigned char *) bound;
-        reg->flags = flags;
+    if (NULL == reg) {
+        reg = OBJ_NEW(mca_rcache_base_registration_t);
+        if (OPAL_LIKELY(NULL != reg)) {
+            /* stick around for awhile */
+            reg->ref_count = 2;
+            reg->base = (unsigned char *) base;
+            reg->bound = (unsigned char *) bound;
+            reg->flags = flags;
+            reg->alloc_base = (void *) (intptr_t) ep->peer_smp_rank;

 #if defined(HAVE_SN_XPMEM_H)
-        xpmem_addr.id = ep->segment_data.xpmem.apid;
+            xpmem_addr.id = ep->segment_data.xpmem.apid;
 #else
-        xpmem_addr.apid = ep->segment_data.xpmem.apid;
+            xpmem_addr.apid = ep->segment_data.xpmem.apid;
 #endif
-        xpmem_addr.offset = base;
+            xpmem_addr.offset = base;

-        reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
-        if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
-            OPAL_THREAD_UNLOCK(&ep->lock);
-            OBJ_RELEASE(reg);
-            return NULL;
-        }
+            reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
+            if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
+                OBJ_RELEASE(reg);
+                return NULL;
+            }

-        opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);
+            opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);

-        mca_rcache_base_vma_insert (vma_module, reg, 0);
-    }
+            mca_rcache_base_vma_insert (vma_module, reg, 0);
+        }
+    }

-reg_found:
     opal_atomic_wmb ();
     *local_ptr = (void *) ((uintptr_t) reg->rcache_context +
                            (ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));

-    OPAL_THREAD_UNLOCK(&ep->lock);
-
     return reg;
 }

 void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep)
 {
-    mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
     int32_t ref_count;

     ref_count = opal_atomic_add_32 (&reg->ref_count, -1);
     if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) {
-        /* protect rcache access */
-        OPAL_THREAD_LOCK(&ep->lock);
         mca_rcache_base_vma_delete (vma_module, reg);
-        OPAL_THREAD_UNLOCK(&ep->lock);

         opal_memchecker_base_mem_noaccess (reg->rcache_context, (uintptr_t)(reg->bound - reg->base));
         (void)xpmem_detach (reg->rcache_context);
@@ -146,4 +154,31 @@ void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_
     }
 }

+static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
+{
+    mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
+    mca_btl_vader_endpoint_t *ep = (mca_btl_vader_endpoint_t *) ctx;
+    if ((intptr_t) reg->alloc_base == ep->peer_smp_rank) {
+        /* otherwise dereg will fail on assert */
+        reg->ref_count = 0;
+        (void) mca_rcache_base_vma_delete (vma_module, reg);
+        OBJ_RELEASE(reg);
+    }
+
+    return OPAL_SUCCESS;
+}
+
+void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep)
+{
+    /* clean out the registration cache */
+    (void) mca_rcache_base_vma_iterate (mca_btl_vader_component.vma_module,
+                                        NULL, (size_t) -1,
+                                        mca_btl_vader_endpoint_xpmem_rcache_cleanup,
+                                        (void *) ep);
+    if (ep->segment_base) {
+        xpmem_release (ep->segment_data.xpmem.apid);
+        ep->segment_data.xpmem.apid = 0;
+    }
+}
+
 #endif /* OPAL_BTL_VADER_HAVE_XPMEM */

@@ -39,6 +39,7 @@
 #define VADER_MAX_ADDRESS XPMEM_MAXADDR_SIZE
 #endif

+struct mca_btl_base_endpoint_t;

 int mca_btl_vader_xpmem_init (void);
@@ -46,6 +47,7 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo
                                                        size_t size, int flags, void **local_ptr);

 void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
+void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep);

 #else
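For a rough sense of the trade-off (illustrative numbers, not from the commit): with per-peer caches, a node running 128 local ranks holds 127 cache instances in each process, on the order of 16,000 across the node; with the shared design each process holds exactly one. The price is that the single VMA tree now indexes every peer's attachments, so each lookup searches a larger structure and filters on the reg->alloc_base peer tag in vader_check_reg — the lookup-time cost the commit message calls a worthwhile trade-off.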