1
1

Use opal_show_help to warn about PSM2_CUDA envvar setting

If Open MPI is configured with CUDA support, the user should also be using a CUDA
build of PSM2 and should therefore set the PSM2_CUDA environment variable to 1 when
using CUDA buffers for transfers. If we detect that this setting is missing, we
force-set it. If the user wants to use this build for regular (host buffer)
transfers, we allow the option of setting PSM2_CUDA=0, but print a warning
message telling the user that this is not a recommended usage scenario.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@intel.com>
Этот коммит содержится в:
Aravind Gopalakrishnan 2017-09-29 17:04:10 -07:00
родитель b11841acaf
Коммит f8a2b7f6bf
6 изменённых файлов: 51 добавлений и 28 удалений

Просмотреть файл

@ -45,3 +45,7 @@ Unknown path record query mechanism %s. Supported mechanisms are %s.
#
[message too big]
Message size %llu bigger than supported by PSM2 API. Max = %llu
#
[no psm2 cuda env]
Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s.
This is not a recommended combination. If the application uses %s.

Просмотреть файл

@ -100,9 +100,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
char *generated_key;
char env_string[256];
int rc;
#if OPAL_CUDA_SUPPORT
char *cuda_env;
#endif
generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports");
memset(uu, 0, sizeof(psm2_uuid_t));
@ -178,11 +175,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) {
#if OPAL_CUDA_SUPPORT
ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE;
cuda_env = getenv("PSM2_CUDA");
if (!cuda_env || ( strcmp(cuda_env, "0") == 0) )
opal_output(0, "Warning: If running with device buffers, there is a"
" chance the application might fail. Try setting PSM2_CUDA=1.\n");
#endif
return OMPI_SUCCESS;

Просмотреть файл

@ -199,6 +199,9 @@ static int
ompi_mtl_psm2_component_register(void)
{
int num_local_procs, num_total_procs;
#if OPAL_CUDA_SUPPORT
char *cuda_env;
#endif
ompi_mtl_psm2.connect_timeout = 180;
(void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version,
@ -223,6 +226,30 @@ ompi_mtl_psm2_component_register(void)
param_priority = 40;
}
#if OPAL_CUDA_SUPPORT
/*
* If using CUDA enabled OpenMPI, the user likely intends to
* run with CUDA buffers. So, force-set the envvar here if user failed
* to set it.
*/
cuda_env = getenv("PSM2_CUDA");
if (!cuda_env) {
opal_show_help("help-mtl-psm2.txt",
"no psm2 cuda env", true,
"not set",
"Host buffers,\nthere will be a performance penalty"
" due to OMPI force setting this variable now.\n"
"Set environment variable to 0 if using Host buffers" );
setenv("PSM2_CUDA", "1", 0);
} else if (strcmp(cuda_env, "0") == 0) {
opal_show_help("help-mtl-psm2.txt",
"no psm2 cuda env", true,
"set to 0",
"CUDA buffers,\nthe execution will SEGFAULT."
" Set environment variable to 1 if using CUDA buffers");
}
#endif
(void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version,
"priority", "Priority of the PSM2 MTL component",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,

Просмотреть файл

@ -185,7 +185,7 @@ mca_pml_cm_recv(void *addr,
&(datatype->super),
count,
addr,
flags,
flags,
&convertor );
#else
MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count);
@ -195,7 +195,7 @@ mca_pml_cm_recv(void *addr,
&(datatype->super),
count,
addr,
flags,
flags,
&convertor );
#endif

Просмотреть файл

@ -94,7 +94,7 @@ do { \
datatype, \
addr, \
count, \
flags ) \
flags ) \
do { \
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@ -116,7 +116,7 @@ do { \
&(datatype->super), \
count, \
addr, \
flags, \
flags, \
&(request)->req_base.req_convertor ); \
} while(0)
#else
@ -127,7 +127,7 @@ do { \
datatype, \
addr, \
count, \
flags ) \
flags ) \
do { \
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \
(request)->req_base.req_ompi.req_mpi_object.comm = comm; \
@ -144,7 +144,7 @@ do { \
&(datatype->super), \
count, \
addr, \
flags, \
flags, \
&(request)->req_base.req_convertor ); \
} while(0)
#endif
@ -158,7 +158,7 @@ do { \
datatype, \
addr, \
count, \
flags, \
flags, \
persistent) \
do { \
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@ -197,7 +197,7 @@ do { \
datatype, \
addr, \
count, \
flags, \
flags, \
persistent) \
do { \
OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \
@ -219,7 +219,7 @@ do { \
&(datatype->super), \
count, \
addr, \
flags, \
flags, \
&(request)->req_base.req_convertor ); \
} while(0)
#endif

Просмотреть файл

@ -127,7 +127,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
{ \
OBJ_RETAIN(comm); \
OMPI_DATATYPE_RETAIN(datatype); \
@ -139,7 +139,7 @@ do { \
&(datatype->super), \
count, \
buf, \
flags, \
flags, \
&(req_send)->req_base.req_convertor ); \
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@ -158,7 +158,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
{ \
OBJ_RETAIN(comm); \
OMPI_DATATYPE_RETAIN(datatype); \
@ -170,7 +170,7 @@ do { \
&(datatype->super), \
count, \
buf, \
flags, \
flags, \
&(req_send)->req_base.req_convertor ); \
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@ -191,7 +191,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
{ \
OBJ_RETAIN(comm); \
OMPI_DATATYPE_RETAIN(datatype); \
@ -203,7 +203,7 @@ do { \
&(datatype->super), \
count, \
buf, \
flags, \
flags, \
&(req_send)->req_base.req_convertor ); \
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
(req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \
@ -223,7 +223,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
{ \
OBJ_RETAIN(comm); \
OMPI_DATATYPE_RETAIN(datatype); \
@ -249,7 +249,7 @@ do { \
&(datatype->super), \
count, \
buf, \
flags, \
flags, \
&(req_send)->req_base.req_convertor ); \
} \
(req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \
@ -273,7 +273,7 @@ do { \
blocking, \
buf, \
count, \
flags ) \
flags ) \
do { \
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
persistent); \
@ -289,7 +289,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
opal_convertor_get_packed_size( \
&sendreq->req_send.req_base.req_convertor, \
&sendreq->req_count ); \
@ -309,7 +309,7 @@ do { \
sendmode, \
buf, \
count, \
flags ) \
flags ) \
do { \
OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \
false); \