
btl/vader: reduce memory footprint when using xpmem

The vader btl kept a per-peer registration cache to track xpmem
attachments. This is not much of a problem with small numbers of
local ranks but can be a problem on large SMP machines. To reduce the
footprint there is now a single registration cache for all xpmem
attachments. This will probably increase the lookup time for large
transfers but is a worthwhile trade-off.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
This commit is contained in:
Nathan Hjelm 2016-10-27 10:09:43 -06:00
parent 8cc3f288c9
commit a652a193ea
5 changed files with 113 additions and 85 deletions
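
The gist of the change: instead of one vma registration cache per endpoint,
the component now keeps a single cache (mca_btl_vader_component.vma_module)
and tags every registration with the owning peer rank in reg->alloc_base, so
lookups and per-endpoint teardown filter on that rank. Below is a minimal
standalone sketch of that bookkeeping under assumed toy names (toy_reg_t,
cache_find, cache_cleanup_peer); it only models the idea and is not the OPAL
rcache API used in the diff that follows.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* One cached attachment, tagged with the peer it belongs to
 * (the patch stores the peer rank in reg->alloc_base). */
typedef struct toy_reg {
    int peer;
    uintptr_t base;
    uintptr_t bound;
    int ref_count;
    struct toy_reg *next;
} toy_reg_t;

/* A single process-wide cache replaces one cache per endpoint. */
static toy_reg_t *shared_cache;

/* Find an attachment covering [base, bound) for a given peer,
 * skipping entries that belong to other peers. */
static toy_reg_t *cache_find (int peer, uintptr_t base, uintptr_t bound)
{
    for (toy_reg_t *reg = shared_cache ; reg ; reg = reg->next) {
        if (reg->peer != peer) {
            continue;              /* same address range, different peer */
        }
        if (base >= reg->base && bound <= reg->bound) {
            ++reg->ref_count;
            return reg;
        }
    }
    return NULL;
}

static toy_reg_t *cache_insert (int peer, uintptr_t base, uintptr_t bound)
{
    toy_reg_t *reg = calloc (1, sizeof (*reg));
    reg->peer = peer;
    reg->base = base;
    reg->bound = bound;
    reg->ref_count = 1;
    reg->next = shared_cache;
    shared_cache = reg;
    return reg;
}

/* Endpoint teardown: walk the shared cache and drop only this peer's
 * entries, mirroring mca_btl_vader_xpmem_cleanup_endpoint () below. */
static void cache_cleanup_peer (int peer)
{
    toy_reg_t **prev = &shared_cache;
    while (*prev) {
        toy_reg_t *reg = *prev;
        if (reg->peer == peer) {
            *prev = reg->next;
            free (reg);
        } else {
            prev = &reg->next;
        }
    }
}

int main (void)
{
    cache_insert (1, 0x1000, 0x2000);
    cache_insert (2, 0x1000, 0x2000);   /* same range, different peer */
    printf ("peer 1 hit: %p\n", (void *) cache_find (1, 0x1400, 0x1800));
    printf ("peer 3 hit: %p\n", (void *) cache_find (3, 0x1400, 0x1800));
    cache_cleanup_peer (1);
    printf ("peer 1 gone:  %p\n", (void *) cache_find (1, 0x1400, 0x1800));
    return 0;
}

The linked-list walk stands in for mca_rcache_base_vma_iterate (); the real
cache is a VMA tree shared by all peers, which is why a lookup now has to
check the peer tag instead of relying on a per-endpoint structure.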

View file

@@ -48,6 +48,7 @@
#include "opal/sys/atomic.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/rcache/base/rcache_base_vma.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/rcache/rcache.h"
#include "opal/mca/rcache/base/base.h"
@@ -103,6 +104,7 @@ struct mca_btl_vader_component_t {
int vader_free_list_inc; /**< number of elements to alloc when growing free lists */
#if OPAL_BTL_VADER_HAVE_XPMEM
xpmem_segid_t my_seg_id; /**< this rank's xpmem segment id */
mca_rcache_base_vma_module_t *vma_module; /**< registration cache for xpmem segments */
#endif
opal_shmem_ds_t seg_ds; /**< this rank's shared memory segment (when not using xpmem) */

View file

@@ -28,7 +28,6 @@
#include "opal_config.h"
#include "btl_vader_xpmem.h"
#include "opal/mca/rcache/base/rcache_base_vma.h"
#define MCA_BTL_VADER_FBOX_ALIGNMENT 32
#define MCA_BTL_VADER_FBOX_ALIGNMENT_MASK (MCA_BTL_VADER_FBOX_ALIGNMENT - 1)
@@ -75,7 +74,6 @@ typedef struct mca_btl_base_endpoint_t {
union {
#if OPAL_BTL_VADER_HAVE_XPMEM
struct {
mca_rcache_base_vma_module_t *vma_module;
xpmem_apid_t apid; /**< xpmem apid for remote peer */
} xpmem;
#endif

View file

@@ -145,6 +145,12 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
/* set flag indicating btl has been inited */
vader_btl->btl_inited = true;
#if OPAL_BTL_VADER_HAVE_XPMEM
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
mca_btl_vader_component.vma_module = mca_rcache_base_vma_module_alloc ();
}
#endif
return OPAL_SUCCESS;
}
@@ -171,7 +177,6 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
/* always use xpmem if it is available */
ep->segment_data.xpmem.apid = xpmem_get (modex->xpmem.seg_id, XPMEM_RDWR, XPMEM_PERMIT_MODE, (void *) 0666);
ep->segment_data.xpmem.vma_module = mca_rcache_base_vma_module_alloc ();
(void) vader_get_registation (ep, modex->xpmem.segment_base, mca_btl_vader_component.segment_size,
MCA_RCACHE_FLAGS_PERSIST, (void **) &ep->segment_base);
} else {
@@ -354,6 +359,12 @@ static int vader_finalize(struct mca_btl_base_module_t *btl)
opal_shmem_segment_detach (&mca_btl_vader_component.seg_ds);
}
#if OPAL_BTL_VADER_HAVE_XPMEM
if (NULL != mca_btl_vader_component.vma_module) {
OBJ_RELEASE(mca_btl_vader_component.vma_module);
}
#endif
return OPAL_SUCCESS;
}
@@ -540,14 +551,6 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
}
#if OPAL_BTL_VADER_HAVE_XPMEM
static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
{
mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
/* otherwise dereg will fail on assert */
reg->ref_count = 0;
(void) mca_rcache_base_vma_delete (vma_module, reg);
return OPAL_SUCCESS;
}
#endif
static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
@@ -557,19 +560,7 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
#if OPAL_BTL_VADER_HAVE_XPMEM
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
if (ep->segment_data.xpmem.vma_module) {
/* clean out the registration cache */
(void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
NULL, (size_t) -1,
mca_btl_vader_endpoint_rcache_cleanup,
(void *) ep->segment_data.xpmem.vma_module);
OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
}
if (ep->segment_base) {
xpmem_release (ep->segment_data.xpmem.apid);
ep->segment_data.xpmem.apid = 0;
}
mca_btl_vader_xpmem_cleanup_endpoint (ep);
} else
#endif
if (ep->segment_data.other.seg_ds) {

View file

@@ -32,113 +32,121 @@ int mca_btl_vader_xpmem_init (void)
return OPAL_SUCCESS;
}
struct vader_check_reg_ctx_t {
mca_rcache_base_vma_module_t *vma_module;
mca_btl_base_endpoint_t *ep;
mca_rcache_base_registration_t **reg;
uintptr_t base;
uintptr_t bound;
};
typedef struct vader_check_reg_ctx_t vader_check_reg_ctx_t;
static int vader_check_reg (mca_rcache_base_registration_t *reg, void *ctx)
{
vader_check_reg_ctx_t *vader_ctx = (vader_check_reg_ctx_t *) ctx;
if ((intptr_t) reg->alloc_base != vader_ctx->ep->peer_smp_rank ||
(reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
/* ignore this registration */
return OPAL_SUCCESS;
}
vader_ctx->reg[0] = reg;
if (vader_ctx->bound <= (uintptr_t) reg->bound && vader_ctx->base >= (uintptr_t) reg->base) {
(void)opal_atomic_add (&reg->ref_count, 1);
return 1;
}
/* remove this pointer from the rcache and decrement its reference count
(so it is detached later) */
mca_rcache_base_vma_delete (vader_ctx->vma_module, reg);
return 2;
}
/* look up the remote pointer in the peer rcache and attach if
* necessary */
mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpoint_t *ep, void *rem_ptr,
size_t size, int flags, void **local_ptr)
{
mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
mca_rcache_base_registration_t *regs[10], *reg = NULL;
mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
mca_rcache_base_registration_t *reg = NULL;
vader_check_reg_ctx_t check_ctx = {.ep = ep, .reg = &reg, .vma_module = vma_module};
xpmem_addr_t xpmem_addr;
uintptr_t base, bound;
uint64_t attach_align = 1 << mca_btl_vader_component.log_attach_align;
int rc, i;
/* protect rcache access */
OPAL_THREAD_LOCK(&ep->lock);
/* use btl/self for self communication */
assert (ep->peer_smp_rank != MCA_BTL_VADER_LOCAL_RANK);
base = OPAL_DOWN_ALIGN((uintptr_t) rem_ptr, attach_align, uintptr_t);
bound = OPAL_ALIGN((uintptr_t) rem_ptr + size - 1, attach_align, uintptr_t) + 1;
if (OPAL_UNLIKELY(bound > VADER_MAX_ADDRESS)) {
bound = VADER_MAX_ADDRESS;
}
check_ctx.base = base;
check_ctx.bound = bound;
/* several segments may match the base pointer */
rc = mca_rcache_base_vma_find_all (vma_module, (void *) base, bound - base, regs, 10);
for (i = 0 ; i < rc ; ++i) {
if (bound <= (uintptr_t)regs[i]->bound && base >= (uintptr_t)regs[i]->base) {
(void)opal_atomic_add (&regs[i]->ref_count, 1);
reg = regs[i];
goto reg_found;
}
if (regs[i]->flags & MCA_RCACHE_FLAGS_PERSIST) {
continue;
}
/* remove this pointer from the rcache and decrement its reference count
(so it is detached later) */
rc = mca_rcache_base_vma_delete (vma_module, regs[i]);
if (OPAL_UNLIKELY(0 != rc)) {
/* someone beat us to it? */
break;
}
rc = mca_rcache_base_vma_iterate (vma_module, (void *) base, bound - base, vader_check_reg, &check_ctx);
if (2 == rc) {
/* start the new segment from the lower of the two bases */
base = (uintptr_t) regs[i]->base < base ? (uintptr_t) regs[i]->base : base;
base = (uintptr_t) reg->base < base ? (uintptr_t) reg->base : base;
(void)opal_atomic_add (&regs[i]->ref_count, -1);
if (OPAL_LIKELY(0 == regs[i]->ref_count)) {
if (OPAL_LIKELY(0 == opal_atomic_add_32 (&reg->ref_count, -1))) {
/* this pointer is not in use */
(void) xpmem_detach (regs[i]->rcache_context);
OBJ_RELEASE(regs[i]);
(void) xpmem_detach (reg->rcache_context);
OBJ_RELEASE(reg);
}
break;
reg = NULL;
}
reg = OBJ_NEW(mca_rcache_base_registration_t);
if (OPAL_LIKELY(NULL != reg)) {
/* stick around for awhile */
reg->ref_count = 2;
reg->base = (unsigned char *) base;
reg->bound = (unsigned char *) bound;
reg->flags = flags;
if (NULL == reg) {
reg = OBJ_NEW(mca_rcache_base_registration_t);
if (OPAL_LIKELY(NULL != reg)) {
/* stick around for awhile */
reg->ref_count = 2;
reg->base = (unsigned char *) base;
reg->bound = (unsigned char *) bound;
reg->flags = flags;
reg->alloc_base = (void *) (intptr_t) ep->peer_smp_rank;
#if defined(HAVE_SN_XPMEM_H)
xpmem_addr.id = ep->segment_data.xpmem.apid;
xpmem_addr.id = ep->segment_data.xpmem.apid;
#else
xpmem_addr.apid = ep->segment_data.xpmem.apid;
xpmem_addr.apid = ep->segment_data.xpmem.apid;
#endif
xpmem_addr.offset = base;
xpmem_addr.offset = base;
reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
OPAL_THREAD_UNLOCK(&ep->lock);
OBJ_RELEASE(reg);
return NULL;
reg->rcache_context = xpmem_attach (xpmem_addr, bound - base, NULL);
if (OPAL_UNLIKELY((void *)-1 == reg->rcache_context)) {
OBJ_RELEASE(reg);
return NULL;
}
opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);
mca_rcache_base_vma_insert (vma_module, reg, 0);
}
opal_memchecker_base_mem_defined (reg->rcache_context, bound - base);
mca_rcache_base_vma_insert (vma_module, reg, 0);
}
reg_found:
opal_atomic_wmb ();
*local_ptr = (void *) ((uintptr_t) reg->rcache_context +
(ptrdiff_t)((uintptr_t) rem_ptr - (uintptr_t) reg->base));
OPAL_THREAD_UNLOCK(&ep->lock);
return reg;
}
void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *ep)
{
mca_rcache_base_vma_module_t *vma_module = ep->segment_data.xpmem.vma_module;
mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
int32_t ref_count;
ref_count = opal_atomic_add_32 (&reg->ref_count, -1);
if (OPAL_UNLIKELY(0 == ref_count && !(reg->flags & MCA_RCACHE_FLAGS_PERSIST))) {
/* protect rcache access */
OPAL_THREAD_LOCK(&ep->lock);
mca_rcache_base_vma_delete (vma_module, reg);
OPAL_THREAD_UNLOCK(&ep->lock);
opal_memchecker_base_mem_noaccess (reg->rcache_context, (uintptr_t)(reg->bound - reg->base));
(void)xpmem_detach (reg->rcache_context);
@@ -146,4 +154,31 @@ void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_
}
}
static int mca_btl_vader_endpoint_xpmem_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
{
mca_rcache_base_vma_module_t *vma_module = mca_btl_vader_component.vma_module;
mca_btl_vader_endpoint_t *ep = (mca_btl_vader_endpoint_t *) ctx;
if ((intptr_t) reg->alloc_base == ep->peer_smp_rank) {
/* otherwise dereg will fail on assert */
reg->ref_count = 0;
(void) mca_rcache_base_vma_delete (vma_module, reg);
OBJ_RELEASE(reg);
}
return OPAL_SUCCESS;
}
void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep)
{
/* clean out the registration cache */
(void) mca_rcache_base_vma_iterate (mca_btl_vader_component.vma_module,
NULL, (size_t) -1,
mca_btl_vader_endpoint_xpmem_rcache_cleanup,
(void *) ep);
if (ep->segment_base) {
xpmem_release (ep->segment_data.xpmem.apid);
ep->segment_data.xpmem.apid = 0;
}
}
#endif /* OPAL_BTL_VADER_HAVE_XPMEM */

View file

@@ -39,6 +39,7 @@
#define VADER_MAX_ADDRESS XPMEM_MAXADDR_SIZE
#endif
struct mca_btl_base_endpoint_t;
int mca_btl_vader_xpmem_init (void);
@@ -46,6 +47,7 @@ mca_rcache_base_registration_t *vader_get_registation (struct mca_btl_base_endpo
size_t size, int flags, void **local_ptr);
void vader_return_registration (mca_rcache_base_registration_t *reg, struct mca_btl_base_endpoint_t *endpoint);
void mca_btl_vader_xpmem_cleanup_endpoint (struct mca_btl_base_endpoint_t *ep);
#else