
Per this RFC from October 8, 2013, and as discussed in the telecon.

http://www.open-mpi.org/community/lists/devel/2013/10/13072.php

Add support for pinning GPU memory for GPU Direct RDMA in the openib BTL, for better small-message latency of GPU buffers.
Note that none of this is compiled in unless CUDA-aware support is requested.

This commit was SVN r29680.
This commit is contained in:
Rolf vandeVaart 2013-11-13 13:22:39 +00:00
parent 840e2cb4a2
commit 4964a5e98b
13 changed files with 234 additions and 7 deletions
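For orientation before the per-file hunks: the patch adds two size limits to the BTL module (btl_cuda_eager_limit and btl_cuda_rdma_limit) and uses them in the ob1 PML to pick a protocol for GPU buffers. Below is a minimal sketch of that decision logic; choose_cuda_protocol() and the enum are illustrative names, not part of the commit, and the header subtraction mirrors the checks added in the pml_ob1 send-request hunks further down.

#include <stddef.h>

/* Illustrative sketch only -- not part of this commit. */
typedef enum {
    CUDA_PROTO_EAGER,          /* small GPU message: eager copy-in/copy-out */
    CUDA_PROTO_RGET,           /* GPU Direct RDMA get */
    CUDA_PROTO_RNDV_PIPELINE   /* rendezvous + asynchronous copy pipeline */
} cuda_proto_t;

static cuda_proto_t choose_cuda_protocol(size_t msg_size, size_t hdr_size,
                                         size_t cuda_eager_limit,
                                         size_t cuda_rdma_limit)
{
    if (0 == cuda_eager_limit) {
        return CUDA_PROTO_RNDV_PIPELINE;  /* GDR disabled: pre-existing CUDA path */
    }
    if (msg_size <= cuda_eager_limit - hdr_size) {
        return CUDA_PROTO_EAGER;
    }
    if (msg_size <= cuda_rdma_limit - hdr_size) {
        return CUDA_PROTO_RGET;
    }
    return CUDA_PROTO_RNDV_PIPELINE;
}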

View file

@@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
*
* $COPYRIGHT$
*
@@ -75,6 +76,23 @@ int mca_btl_base_param_register(mca_base_component_t *version,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_eager_limit);
#if OPAL_CUDA_SUPPORT_60
/* If the BTL has no CUDA RDMA (GET) support, disable these limits */
if (!(MCA_BTL_FLAGS_CUDA_GET & module->btl_flags)) {
module->btl_cuda_eager_limit = 0;
module->btl_cuda_rdma_limit = SIZE_MAX;
}
(void) mca_base_component_var_register(version, "cuda_eager_limit", "Maximum size (in bytes, including header) of \"GPU short\" messages (must be >= 1).",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_eager_limit);
(void) mca_base_component_var_register(version, "cuda_rdma_limit", "Size (in bytes, including header) of GPU buffer when switch to rndv protocol and pipeline.",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&module->btl_cuda_rdma_limit);
#endif /* OPAL_CUDA_SUPPORT_60 */
(void) mca_base_component_var_register(version, "max_send_size", "Maximum size (in bytes) of a single \"phase 2\" fragment of a long message when using the pipeline protocol (must be >= 1)",
MCA_BASE_VAR_TYPE_SIZE_T, NULL, 0, 0,
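Since mca_base_component_var_register() derives the user-visible name from the registering component, these two knobs should surface for the openib BTL as btl_openib_cuda_eager_limit and btl_openib_cuda_rdma_limit; that naming is inferred from the standard MCA scheme, not spelled out in the diff.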

View file

@@ -13,7 +13,7 @@
* Copyright (c) 2006-2013 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -843,6 +843,10 @@ struct mca_btl_base_module_t {
mca_btl_base_module_register_error_fn_t btl_register_error;
/** fault tolerant event notification */
mca_btl_base_module_ft_event_fn_t btl_ft_event;
#if OPAL_CUDA_SUPPORT_60
size_t btl_cuda_eager_limit; /**< switch from eager to RDMA */
size_t btl_cuda_rdma_limit; /**< switch from RDMA to rndv pipeline */
#endif /* OPAL_CUDA_SUPPORT_60 */
};
typedef struct mca_btl_base_module_t mca_btl_base_module_t;

View file

@@ -1219,7 +1219,11 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_src(
openib_btl = (mca_btl_openib_module_t*)btl;
#if OPAL_CUDA_SUPPORT_60
if(opal_convertor_cuda_need_buffers(convertor) == false && 0 == reserve) {
#else
if(opal_convertor_need_buffers(convertor) == false && 0 == reserve) {
#endif /* OPAL_CUDA_SUPPORT_60 */
/* GMS bloody HACK! */
if(registration != NULL || max_data > btl->btl_max_send_size) {
frag = alloc_send_user_frag();
@@ -1376,7 +1380,13 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
/* we didn't get a memory registration passed in, so we have to
* register the region ourselves
*/
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, 0,
uint32_t mflags = 0;
#if OPAL_CUDA_SUPPORT
if (convertor->flags & CONVERTOR_CUDA) {
mflags |= MCA_MPOOL_FLAGS_CUDA_GPU_MEM;
}
#endif /* OPAL_CUDA_SUPPORT */
rc = btl->btl_mpool->mpool_register(btl->btl_mpool, buffer, *size, mflags,
&registration);
if(OMPI_SUCCESS != rc || NULL == registration) {
MCA_BTL_IB_FRAG_RETURN(frag);
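The mflags plumbing above is how the BTL tells the memory pool that a registration covers GPU rather than host memory; the grdma changes below key off MCA_MPOOL_FLAGS_CUDA_GPU_MEM to widen the registration to the full CUDA allocation and to stamp it with a buffer id.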

View file

@@ -311,9 +311,11 @@ struct mca_btl_openib_component_t {
size_t memalign_threshold;
void* (*previous_malloc_hook)(size_t __size, const void*);
#endif
#if OPAL_CUDA_SUPPORT /* CUDA_ASYNC_SEND */
#if OPAL_CUDA_SUPPORT
bool cuda_async_send;
bool cuda_async_recv;
bool cuda_have_gdr;
bool cuda_want_gdr;
#endif /* OPAL_CUDA_SUPPORT */
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
bool rroce_enable;

View file

@@ -580,6 +580,37 @@ int btl_openib_register_mca_params(void)
mca_btl_openib_module.super.btl_max_send_size = 128 * 1024;
/* Turn off message coalescing - not sure if it works with GPU buffers */
mca_btl_openib_component.use_message_coalescing = 0;
/* Indicates whether the library was built with GPU Direct RDMA support. Not changeable. */
mca_btl_openib_component.cuda_have_gdr = OPAL_INT_TO_BOOL(OPAL_CUDA_SUPPORT_60);
(void) mca_base_component_var_register(&mca_btl_openib_component.super.btl_version, "have_cuda_gdr_support",
"Whether CUDA GPU Direct RDMA support is built into library or not",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_CONSTANT,
&mca_btl_openib_component.cuda_have_gdr);
/* Default for GPU Direct RDMA is off for now */
CHECK(reg_bool("cuda_want_gdr_support", NULL,
"Enable or disable CUDA GPU Direct RDMA support "
"(true = yes; false = no)",
false, &mca_btl_openib_component.cuda_want_gdr));
if (mca_btl_openib_component.cuda_want_gdr && !mca_btl_openib_component.cuda_have_gdr) {
opal_output(0, "GDR support requested but library does not have it built in.");
return OMPI_ERROR;
}
#if OPAL_CUDA_SUPPORT_60
if (mca_btl_openib_component.cuda_want_gdr) {
mca_btl_openib_module.super.btl_flags |= MCA_BTL_FLAGS_CUDA_GET;
mca_btl_openib_module.super.btl_cuda_eager_limit = SIZE_MAX; /* magic number - indicates set it to minimum */
mca_btl_openib_module.super.btl_cuda_rdma_limit = 1024 * 20; /* default switchover is 20K to pipeline */
} else {
mca_btl_openib_module.super.btl_cuda_eager_limit = 0; /* Turns off any of the GPU Direct RDMA code */
mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */
}
#endif /* OPAL_CUDA_SUPPORT_60 */
#endif /* OPAL_CUDA_SUPPORT */
CHECK(mca_btl_base_param_register(
&mca_btl_openib_component.super.btl_version,

View file

@@ -32,7 +32,7 @@ struct mca_mpool_common_cuda_reg_t {
};
typedef struct mca_mpool_common_cuda_reg_t mca_mpool_common_cuda_reg_t;
extern bool mca_common_cuda_enabled;
#define OMPI_GDR_SUPPORT 0
#define OMPI_GDR_SUPPORT 1
OMPI_DECLSPEC int mca_common_cuda_register_mca_variables(void);

View file

@@ -47,6 +47,10 @@ mcacomponent_LTLIBRARIES = $(component_install)
mca_mpool_grdma_la_SOURCES = $(sources)
mca_mpool_grdma_la_LDFLAGS = -module -avoid-version
mca_mpool_grdma_la_LIBADD = $(mpool_grdma_LIBS)
if OPAL_cuda_support
mca_mpool_grdma_la_LIBADD += \
$(top_ompi_builddir)/ompi/mca/common/cuda/libmca_common_cuda.la
endif
noinst_LTLIBRARIES = $(component_noinst)
libmca_mpool_grdma_la_SOURCES = $(sources)

View file

@@ -36,6 +36,9 @@
#include "opal/align.h"
#if OPAL_CUDA_SUPPORT_60
#include "ompi/mca/common/cuda/common_cuda.h"
#endif /* OPAL_CUDA_SUPPORT_60 */
#include "ompi/mca/rcache/rcache.h"
#include "ompi/mca/rcache/base/base.h"
#include "ompi/mca/rte/rte.h"
@@ -44,6 +47,9 @@
#include "ompi/mca/mpool/base/base.h"
#include "mpool_grdma.h"
#if OPAL_CUDA_SUPPORT_60
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size);
#endif /* OPAL_CUDA_SUPPORT_60 */
static void mca_mpool_grdma_pool_contructor (mca_mpool_grdma_pool_t *pool)
{
memset ((void *)((uintptr_t)pool + sizeof (pool->super)), 0, sizeof (*pool) - sizeof (pool->super));
@@ -230,6 +236,17 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
if (!opal_list_is_empty (&mpool_grdma->pool->gc_list))
do_unregistration_gc(mpool);
#if OPAL_CUDA_SUPPORT_60
if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
size_t psize;
mca_common_cuda_get_address_range(&base, &psize, addr);
bound = base + psize - 1;
/* Check to see if this memory is in the cache and if it has been freed. If so,
* this call will boot it out of the cache. */
check_for_cuda_freed_memory(mpool, base, psize);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
/* look through existing regs if not persistent registration requested.
* Persistent registration are always registered and placed in the cache */
if(!(bypass_cache || persist)) {
@@ -270,6 +287,11 @@ int mca_mpool_grdma_register(mca_mpool_base_module_t *mpool, void *addr,
grdma_reg->base = base;
grdma_reg->bound = bound;
grdma_reg->flags = flags;
#if OPAL_CUDA_SUPPORT_60
if (flags & MCA_MPOOL_FLAGS_CUDA_GPU_MEM) {
mca_common_cuda_get_buffer_id(grdma_reg);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
if (false == bypass_cache) {
rc = mpool->rcache->rcache_insert(mpool->rcache, grdma_reg, 0);
@@ -440,6 +462,61 @@ int mca_mpool_grdma_release_memory(struct mca_mpool_base_module_t *mpool,
return rc;
}
/* Make sure this registration request is not stale. In other words, ensure
* that we do not have a cuMemAlloc, cuMemFree, cuMemAlloc state. If we do,
* kick out the registrations and deregister. This function needs to be called
* with the mpool->rcache->lock held. */
#if OPAL_CUDA_SUPPORT_60
static int check_for_cuda_freed_memory(mca_mpool_base_module_t *mpool, void *addr, size_t size)
{
mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t *) mpool;
mca_mpool_base_registration_t *regs[GRDMA_MPOOL_NREGS];
int reg_cnt, i, rc = OMPI_SUCCESS;
mca_mpool_base_registration_t *reg;
mpool->rcache->rcache_find(mpool->rcache, addr, size, &reg);
if (NULL == reg) {
return 0;
}
/* If the memory was not previously freed, just return 0 */
if (!(mca_common_cuda_previously_freed_memory(reg))) {
return 0;
}
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "Before free"); */
/* This memory has been freed. Find all registrations and delete */
do {
reg_cnt = mpool->rcache->rcache_find_all(mpool->rcache, reg->base, reg->bound - reg->base + 1,
regs, GRDMA_MPOOL_NREGS);
for(i = 0 ; i < reg_cnt ; ++i) {
regs[i]->flags |= MCA_MPOOL_FLAGS_INVALID;
if (regs[i]->ref_count) {
opal_output(0, "Release FAILED: ref_count=%d, base=%p, bound=%p, size=%d",
regs[i]->ref_count, regs[i]->base, regs[i]->bound,
(int) (regs[i]->bound - regs[i]->base + 1));
/* memory is being freed, but there are registrations in use that
* cover the memory. This can happen even in a correct program,
* but may also be a user error. We can't tell. Mark the
* registration as invalid. It will not be used any more and
* will be unregistered when its ref_count drops to zero */
rc = OMPI_ERROR; /* tell caller that something was wrong */
} else {
opal_list_remove_item(&mpool_grdma->pool->lru_list,(opal_list_item_t *) regs[i]);
/* Now deregister. Do not use gc_list as we need to kick this out now. */
dereg_mem(regs[i]);
}
}
} while(reg_cnt == GRDMA_MPOOL_NREGS);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
/* mpool->rcache->rcache_dump_range(mpool->rcache, 0, (size_t)-1, "After free");*/
return rc;
}
#endif /* OPAL_CUDA_SUPPORT_60 */
void mca_mpool_grdma_finalize(struct mca_mpool_base_module_t *mpool)
{
mca_mpool_grdma_module_t *mpool_grdma = (mca_mpool_grdma_module_t*)mpool;
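The gpu_bufID bookkeeping added here guards against device-address reuse: back-to-back cuMemFree/cuMemAlloc can hand out the same pointer again, so address and size alone cannot tell a live cached registration from a stale one. A minimal sketch of the idea, assuming the common_cuda helpers are built on the CUDA driver's per-allocation buffer id (the helper name below is hypothetical):

#include <stdbool.h>
#include <cuda.h>

/* Illustrative sketch only. CU_POINTER_ATTRIBUTE_BUFFER_ID yields an id
 * that is unique per allocation, so it changes whenever an address is
 * handed out again by a later cuMemAlloc. */
static bool gpu_buffer_was_reallocated(CUdeviceptr addr,
                                       unsigned long long cached_buf_id)
{
    unsigned long long current_id = 0;
    if (CUDA_SUCCESS != cuPointerGetAttribute(&current_id,
                                              CU_POINTER_ATTRIBUTE_BUFFER_ID,
                                              addr)) {
        return true;  /* cannot query: treat the cached entry as stale */
    }
    return current_id != cached_buf_id;
}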

View file

@@ -38,6 +38,7 @@
#define MCA_MPOOL_FLAGS_INVALID 0x8
#define MCA_MPOOL_FLAGS_SO_MEM 0x10
#define MCA_MPOOL_FLAGS_CUDA_REGISTER_MEM 0x20
#define MCA_MPOOL_FLAGS_CUDA_GPU_MEM 0x40
struct mca_mpool_base_resources_t;
@@ -49,6 +50,9 @@ struct mca_mpool_base_registration_t {
unsigned char* alloc_base;
int32_t ref_count;
uint32_t flags;
#if OPAL_CUDA_SUPPORT_60
unsigned long long gpu_bufID;
#endif /* OPAL_CUDA_SUPPORT_60 */
};
typedef struct mca_mpool_base_registration_t mca_mpool_base_registration_t;

View file

@@ -12,9 +12,32 @@ The "eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher eager limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL eager limit value: %d (set via btl_%s_eager_limit)
BTL eager limit minimum: %d
MCA parameter name: btl_%s_eager_limit
[cuda_eager_limit_too_small]
The "CUDA eager limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher CUDA eager limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL CUDA eager limit value: %d (set via btl_%s_cuda_eager_limit)
BTL CUDA eager limit minimum: %d
MCA parameter name: btl_%s_cuda_eager_limit
[cuda_rdma_limit_too_small]
The "CUDA rdma limit" MCA parameter in the %s BTL was set to a value which
is too low for Open MPI to function properly. Please re-run your job
with a higher CUDA rdma limit value for this BTL; the exact MCA parameter
name and its corresponding minimum value are shown below.
Local host: %s
BTL name: %s
BTL CUDA rndv limit value: %d (set via btl_%s_cuda_rdma_limit)
BTL CUDA rndv limit minimum: %d
MCA parameter name: btl_%s_cuda_rdma_limit

View file

@@ -366,6 +366,46 @@ int mca_pml_ob1_add_procs(ompi_proc_t** procs, size_t nprocs)
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
#if OPAL_CUDA_SUPPORT_60
/* If size is SIZE_MAX, then we know we want to set this to the minimum possible
* value, which is the size of the PML header. */
if (SIZE_MAX == sm->btl_module->btl_cuda_eager_limit) {
sm->btl_module->btl_cuda_eager_limit = sizeof(mca_pml_ob1_hdr_t);
}
if (0 != sm->btl_module->btl_cuda_eager_limit) {
if (sm->btl_module->btl_cuda_eager_limit < sizeof(mca_pml_ob1_hdr_t)) {
opal_show_help("help-mpi-pml-ob1.txt", "cuda_eager_limit_too_small",
true,
sm->btl_component->btl_version.mca_component_name,
ompi_process_info.nodename,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_eager_limit,
sm->btl_component->btl_version.mca_component_name,
sizeof(mca_pml_ob1_hdr_t),
sm->btl_component->btl_version.mca_component_name);
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
}
if (0 == sm->btl_module->btl_cuda_rdma_limit) {
/* All is fine. 0 means to ignore the value, so set it to SIZE_MAX */
sm->btl_module->btl_cuda_rdma_limit = SIZE_MAX;
} else {
if (sm->btl_module->btl_cuda_rdma_limit < sm->btl_module->btl_cuda_eager_limit) {
opal_show_help("help-mpi-pml-ob1.txt", "cuda_rdma_limit_too_small",
true,
sm->btl_component->btl_version.mca_component_name,
ompi_process_info.nodename,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_rdma_limit,
sm->btl_component->btl_version.mca_component_name,
sm->btl_module->btl_cuda_eager_limit,
sm->btl_component->btl_version.mca_component_name);
rc = OMPI_ERR_BAD_PARAM;
goto cleanup_and_return;
}
}
#endif /* OPAL_CUDA_SUPPORT_60 */
}
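To summarize the normalization above: SIZE_MAX in btl_cuda_eager_limit is a sentinel meaning "clamp to the smallest legal value" (a bare PML header), 0 disables the GPU eager/RGET path entirely, and a btl_cuda_rdma_limit of 0 is rewritten to SIZE_MAX so that the switchover to the rendezvous pipeline never triggers.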

View file

@@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2008 UT-Battelle, LLC. All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2012-2013 NVIDIA Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -54,6 +54,14 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
size_t size) {
int rc;
#if OPAL_CUDA_SUPPORT_41
#if OPAL_CUDA_SUPPORT_60
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
(sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;
if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
unsigned char *base;
@@ -120,7 +128,7 @@ size_t mca_pml_ob1_rdma_cuda_btls(
if( NULL != btl_mpool ) {
/* register the memory */
btl_mpool->mpool_register(btl_mpool, base, size, 0, &reg);
btl_mpool->mpool_register(btl_mpool, base, size, MCA_MPOOL_FLAGS_CUDA_GPU_MEM, &reg);
}
if(NULL == reg)

View file

@@ -366,6 +366,12 @@ mca_pml_ob1_send_request_start_btl( mca_pml_ob1_send_request_t* sendreq,
size_t eager_limit = btl->btl_eager_limit - sizeof(mca_pml_ob1_hdr_t);
int rc;
#if OPAL_CUDA_SUPPORT_60
if (btl->btl_cuda_eager_limit && (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) {
eager_limit = btl->btl_cuda_eager_limit - sizeof(mca_pml_ob1_hdr_t);
}
#endif /* OPAL_CUDA_SUPPORT_60 */
if( OPAL_LIKELY(size <= eager_limit) ) {
switch(sendreq->req_send.req_send_mode) {
case MCA_PML_BASE_SEND_SYNCHRONOUS: