From f8a2b7f6bfca574a663210dabf1d7945aab5ed80 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Fri, 29 Sep 2017 17:04:10 -0700 Subject: [PATCH] Use opal_show_help to warn about PSM2_CUDA envvar setting If Open MPI is configured with CUDA, then the user should also be using a CUDA build of PSM2 and therefore be setting the PSM2_CUDA environment variable to 1 while using CUDA buffers for transfers. If we detect this setting to be missing, we force-set it. If the user wants to use this build for regular (Host buffer) transfers, we allow the option of setting PSM2_CUDA=0, but print a warning message to the user that it is not a recommended usage scenario. Signed-off-by: Aravind Gopalakrishnan --- ompi/mca/mtl/psm2/help-mtl-psm2.txt | 4 ++++ ompi/mca/mtl/psm2/mtl_psm2.c | 8 -------- ompi/mca/mtl/psm2/mtl_psm2_component.c | 27 ++++++++++++++++++++++++++ ompi/mca/pml/cm/pml_cm.h | 4 ++-- ompi/mca/pml/cm/pml_cm_recvreq.h | 14 ++++++------- ompi/mca/pml/cm/pml_cm_sendreq.h | 22 ++++++++++----------- 6 files changed, 51 insertions(+), 28 deletions(-) diff --git a/ompi/mca/mtl/psm2/help-mtl-psm2.txt b/ompi/mca/mtl/psm2/help-mtl-psm2.txt index 719b060a22..ee876efd20 100644 --- a/ompi/mca/mtl/psm2/help-mtl-psm2.txt +++ b/ompi/mca/mtl/psm2/help-mtl-psm2.txt @@ -45,3 +45,7 @@ Unknown path record query mechanism %s. Supported mechanisms are %s. # [message too big] Message size %llu bigger than supported by PSM2 API. Max = %llu +# +[no psm2 cuda env] +Using CUDA enabled OpenMPI but PSM2_CUDA environment variable is %s. +This is not a recommended combination. If the application uses %s. 
diff --git a/ompi/mca/mtl/psm2/mtl_psm2.c b/ompi/mca/mtl/psm2/mtl_psm2.c index 6d461a2c76..f0d04a2159 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2.c +++ b/ompi/mca/mtl/psm2/mtl_psm2.c @@ -100,9 +100,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) { char *generated_key; char env_string[256]; int rc; -#if OPAL_CUDA_SUPPORT - char *cuda_env; -#endif generated_key = getenv(OPAL_MCA_PREFIX"orte_precondition_transports"); memset(uu, 0, sizeof(psm2_uuid_t)); @@ -178,11 +175,6 @@ int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) { #if OPAL_CUDA_SUPPORT ompi_mtl_psm2.super.mtl_flags |= MCA_MTL_BASE_FLAG_CUDA_INIT_DISABLE; - - cuda_env = getenv("PSM2_CUDA"); - if (!cuda_env || ( strcmp(cuda_env, "0") == 0) ) - opal_output(0, "Warning: If running with device buffers, there is a" - " chance the application might fail. Try setting PSM2_CUDA=1.\n"); #endif return OMPI_SUCCESS; diff --git a/ompi/mca/mtl/psm2/mtl_psm2_component.c b/ompi/mca/mtl/psm2/mtl_psm2_component.c index a536fd6efb..b2d74aeaf2 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_component.c +++ b/ompi/mca/mtl/psm2/mtl_psm2_component.c @@ -199,6 +199,9 @@ static int ompi_mtl_psm2_component_register(void) { int num_local_procs, num_total_procs; +#if OPAL_CUDA_SUPPORT + char *cuda_env; +#endif ompi_mtl_psm2.connect_timeout = 180; (void) mca_base_component_var_register(&mca_mtl_psm2_component.super.mtl_version, @@ -223,6 +226,30 @@ ompi_mtl_psm2_component_register(void) param_priority = 40; } +#if OPAL_CUDA_SUPPORT + /* + * If using CUDA enabled OpenMPI, the user likely intends to + * run with CUDA buffers. So, force-set the envvar here if user failed + * to set it. 
+ */ + cuda_env = getenv("PSM2_CUDA"); + if (!cuda_env) { + opal_show_help("help-mtl-psm2.txt", + "no psm2 cuda env", true, + "not set", + "Host buffers,\nthere will be a performance penalty" + " due to OMPI force setting this variable now.\n" + "Set environment variable to 0 if using Host buffers" ); + setenv("PSM2_CUDA", "1", 0); + } else if (strcmp(cuda_env, "0") == 0) { + opal_show_help("help-mtl-psm2.txt", + "no psm2 cuda env", true, + "set to 0", + "CUDA buffers,\nthe execution will SEGFAULT." + " Set environment variable to 1 if using CUDA buffers"); + } +#endif + (void) mca_base_component_var_register (&mca_mtl_psm2_component.super.mtl_version, "priority", "Priority of the PSM2 MTL component", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, diff --git a/ompi/mca/pml/cm/pml_cm.h b/ompi/mca/pml/cm/pml_cm.h index 262294fc43..b3c06eb83b 100644 --- a/ompi/mca/pml/cm/pml_cm.h +++ b/ompi/mca/pml/cm/pml_cm.h @@ -185,7 +185,7 @@ mca_pml_cm_recv(void *addr, &(datatype->super), count, addr, - flags, + flags, &convertor ); #else MCA_PML_CM_SWITCH_CUDA_CONVERTOR_OFF(flags, datatype, count); @@ -195,7 +195,7 @@ mca_pml_cm_recv(void *addr, &(datatype->super), count, addr, - flags, + flags, &convertor ); #endif diff --git a/ompi/mca/pml/cm/pml_cm_recvreq.h b/ompi/mca/pml/cm/pml_cm_recvreq.h index 0c79bf4937..6729cac886 100644 --- a/ompi/mca/pml/cm/pml_cm_recvreq.h +++ b/ompi/mca/pml/cm/pml_cm_recvreq.h @@ -94,7 +94,7 @@ do { \ datatype, \ addr, \ count, \ - flags ) \ + flags ) \ do { \ OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \ (request)->req_base.req_ompi.req_mpi_object.comm = comm; \ @@ -116,7 +116,7 @@ do { \ &(datatype->super), \ count, \ addr, \ - flags, \ + flags, \ &(request)->req_base.req_convertor ); \ } while(0) #else @@ -127,7 +127,7 @@ do { \ datatype, \ addr, \ count, \ - flags ) \ + flags ) \ do { \ OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, false); \ (request)->req_base.req_ompi.req_mpi_object.comm = comm; \ @@ -144,7 +144,7 @@ do { \ 
&(datatype->super), \ count, \ addr, \ - flags, \ + flags, \ &(request)->req_base.req_convertor ); \ } while(0) #endif @@ -158,7 +158,7 @@ do { \ datatype, \ addr, \ count, \ - flags, \ + flags, \ persistent) \ do { \ OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \ @@ -197,7 +197,7 @@ do { \ datatype, \ addr, \ count, \ - flags, \ + flags, \ persistent) \ do { \ OMPI_REQUEST_INIT(&(request)->req_base.req_ompi, persistent); \ @@ -219,7 +219,7 @@ do { \ &(datatype->super), \ count, \ addr, \ - flags, \ + flags, \ &(request)->req_base.req_convertor ); \ } while(0) #endif diff --git a/ompi/mca/pml/cm/pml_cm_sendreq.h b/ompi/mca/pml/cm/pml_cm_sendreq.h index 0d006da0f8..ab6dbb631d 100644 --- a/ompi/mca/pml/cm/pml_cm_sendreq.h +++ b/ompi/mca/pml/cm/pml_cm_sendreq.h @@ -127,7 +127,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + flags ) \ { \ OBJ_RETAIN(comm); \ OMPI_DATATYPE_RETAIN(datatype); \ @@ -139,7 +139,7 @@ do { \ &(datatype->super), \ count, \ buf, \ - flags, \ + flags, \ &(req_send)->req_base.req_convertor ); \ (req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \ (req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \ @@ -158,7 +158,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + flags ) \ { \ OBJ_RETAIN(comm); \ OMPI_DATATYPE_RETAIN(datatype); \ @@ -170,7 +170,7 @@ do { \ &(datatype->super), \ count, \ buf, \ - flags, \ + flags, \ &(req_send)->req_base.req_convertor ); \ (req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \ (req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \ @@ -191,7 +191,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + flags ) \ { \ OBJ_RETAIN(comm); \ OMPI_DATATYPE_RETAIN(datatype); \ @@ -203,7 +203,7 @@ do { \ &(datatype->super), \ count, \ buf, \ - flags, \ + flags, \ &(req_send)->req_base.req_convertor ); \ (req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \ (req_send)->req_base.req_ompi.req_status.MPI_SOURCE = \ @@ -223,7 +223,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + 
flags ) \ { \ OBJ_RETAIN(comm); \ OMPI_DATATYPE_RETAIN(datatype); \ @@ -249,7 +249,7 @@ do { \ &(datatype->super), \ count, \ buf, \ - flags, \ + flags, \ &(req_send)->req_base.req_convertor ); \ } \ (req_send)->req_base.req_ompi.req_mpi_object.comm = comm; \ @@ -273,7 +273,7 @@ do { \ blocking, \ buf, \ count, \ - flags ) \ + flags ) \ do { \ OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \ persistent); \ @@ -289,7 +289,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + flags ) \ opal_convertor_get_packed_size( \ &sendreq->req_send.req_base.req_convertor, \ &sendreq->req_count ); \ @@ -309,7 +309,7 @@ do { \ sendmode, \ buf, \ count, \ - flags ) \ + flags ) \ do { \ OMPI_REQUEST_INIT(&(sendreq->req_send.req_base.req_ompi), \ false); \