From bd4a6fee22d2bf2bf455faf55404cdc452cd6bd6 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Tue, 27 Jun 2017 20:37:34 -0700 Subject: [PATCH] Attempt to detect when we are direct-launched without the necessary PMI support, and thus are incorrectly identified as being "singleton". Advise the user on the required PMI(x) support and error out. Signed-off-by: Ralph Castain --- ompi/runtime/ompi_mpi_init.c | 2 +- orte/mca/ess/base/help-ess-base.txt | 40 +++++++++++++++++++ orte/mca/ess/pmi/ess_pmi_module.c | 20 +++++++++- .../ess/singleton/ess_singleton_component.c | 30 +++++++++++++- orte/mca/schizo/alps/schizo_alps.c | 9 ++++- orte/mca/schizo/orte/schizo_orte.c | 8 +++- orte/mca/schizo/slurm/schizo_slurm.c | 7 ++++ 7 files changed, 109 insertions(+), 7 deletions(-) diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 0aa346a66c..a36dabc08d 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) error: if (ret != OMPI_SUCCESS) { /* Only print a message if one was not already printed */ - if (NULL != error) { + if (NULL != error && OMPI_ERR_SILENT != ret) { const char *err_msg = opal_strerror(ret); opal_show_help("help-mpi-runtime.txt", "mpi_init:startup:internal-failure", true, diff --git a/orte/mca/ess/base/help-ess-base.txt b/orte/mca/ess/base/help-ess-base.txt index ba33cb2d16..0d4907b565 100644 --- a/orte/mca/ess/base/help-ess-base.txt +++ b/orte/mca/ess/base/help-ess-base.txt @@ -49,3 +49,43 @@ MCA parameter: param: %s This is not a recognized signal value. Please fix or remove it. +# +[slurm-error] +The application appears to have been direct launched using "srun", +but OMPI was not built with SLURM's PMI support and therefore cannot +execute. There are several options for building PMI support under +SLURM, depending upon the SLURM version you are using: + + version 16.05 or later: you can use SLURM's PMIx support. 
This + requires that you configure and build SLURM --with-pmix. + + Versions earlier than 16.05: you must use either SLURM's PMI-1 or + PMI-2 support. SLURM builds PMI-1 by default, or you can manually + install PMI-2. You must then build Open MPI using --with-pmi pointing + to the SLURM PMI library location. + +Please configure as appropriate and try again. +# +[slurm-error2] +The application appears to have been direct launched using "srun", +but OMPI was not built with SLURM support. This usually happens +when OMPI was not configured --with-slurm and we weren't able +to discover a SLURM installation in the usual places. + +Please configure as appropriate and try again. +# +[alps-error] +The application appears to have been direct launched using "aprun", +but OMPI was not built with ALPS PMI support and therefore cannot +execute. You must build Open MPI using --with-pmi pointing +to the ALPS PMI library location. + +Please configure as appropriate and try again. +# +[alps-error2] +The application appears to have been direct launched using "aprun", +but OMPI was not built with ALPS support. This usually happens +when OMPI was not configured --with-alps and we weren't able +to discover an ALPS installation in the usual places. + +Please configure as appropriate and try again. 
diff --git a/orte/mca/ess/pmi/ess_pmi_module.c b/orte/mca/ess/pmi/ess_pmi_module.c index 4ad414236a..2d852f820a 100644 --- a/orte/mca/ess/pmi/ess_pmi_module.c +++ b/orte/mca/ess/pmi/ess_pmi_module.c @@ -52,6 +52,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/rml/rml.h" +#include "orte/mca/schizo/schizo.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" #include "orte/util/name_fns.h" @@ -125,7 +126,24 @@ static int rte_init(void) opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) { - /* we cannot run */ + /* we cannot run - this could be due to being direct launched + * without the required PMI support being built. Try to detect + * that scenario and warn the user */ + if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() && + NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) { + if (0 == strcmp(envar, "SLURM")) { + /* yes to both - so emit a hopefully helpful + * error message and abort */ + orte_show_help_finalize(); + orte_show_help("help-ess-base.txt", "slurm-error", true); + return ORTE_ERR_SILENT; + } else if (0 == strcmp(envar, "ALPS")) { + /* we were direct launched by ALPS */ + orte_show_help_finalize(); + orte_show_help("help-ess-base.txt", "alps-error", true); + return ORTE_ERR_SILENT; + } + } error = "pmix init"; goto error; } diff --git a/orte/mca/ess/singleton/ess_singleton_component.c b/orte/mca/ess/singleton/ess_singleton_component.c index f457d4109f..9730910357 100644 --- a/orte/mca/ess/singleton/ess_singleton_component.c +++ b/orte/mca/ess/singleton/ess_singleton_component.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -33,6 +33,7 @@ #include "opal/mca/pmix/base/base.h" #include "orte/util/proc_info.h" +#include "orte/util/show_help.h" #include "orte/mca/schizo/schizo.h" #include "orte/mca/ess/ess.h" @@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority) return ORTE_ERROR; } + /* we may be incorrectly trying to run as a singleton - e.g., + * someone direct-launched us under SLURM without building + * ORTE --with-slurm or in a slurm environment (so we didn't + * autodetect slurm). Try to detect that here. Sadly, we + * cannot just use the schizo framework to help us here as + * the corresponding schizo component may not have even + * been built. So we have to do things a little uglier */ + + if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) { + /* see if we are in a SLURM allocation */ + if (NULL != getenv("SLURM_NODELIST")) { + /* emit a hopefully helpful error message and abort */ + orte_show_help("help-ess-base.txt", "slurm-error2", true); + *module = NULL; + *priority = 0; + return ORTE_ERR_SILENT; + } + /* see if we are under ALPS */ + if (NULL != getenv("ALPS_APP_ID")) { + orte_show_help("help-ess-base.txt", "alps-error2", true); + *module = NULL; + *priority = 0; + return ORTE_ERR_SILENT; + } + } + /* okay, we want to be selected as we must be a singleton */ *priority = 100; *module = (mca_base_module_t *)&orte_ess_singleton_module; @@ -142,4 +169,3 @@ static int component_close(void) { return ORTE_SUCCESS; } - diff --git a/orte/mca/schizo/alps/schizo_alps.c b/orte/mca/schizo/alps/schizo_alps.c index c1e65c4569..4c7db47092 100644 --- a/orte/mca/schizo/alps/schizo_alps.c +++ b/orte/mca/schizo/alps/schizo_alps.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void) * launch performance penalty for hwloc at high ppn on knl */ opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch"); opal_argv_append_nosize(&pushed_vals, "true"); + /* mark that we are native */ + opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION"); + opal_argv_append_nosize(&pushed_vals, "NATIVE"); goto setup; } + /* mark that we are on ALPS */ + opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION"); + opal_argv_append_nosize(&pushed_vals, "ALPS"); + /* see if we are running in a Cray PAGG container */ fd = fopen(proc_job_file, "r"); if (NULL == fd) { diff --git a/orte/mca/schizo/orte/schizo_orte.c b/orte/mca/schizo/orte/schizo_orte.c index b3783fe8fb..d5f31f33db 100644 --- a/orte/mca/schizo/orte/schizo_orte.c +++ b/orte/mca/schizo/orte/schizo_orte.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016-2017 Intel, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void) * so no need to further check that here. 
Instead, * see if we were direct launched vs launched via mpirun */ if (NULL != orte_process_info.my_daemon_uri) { - /* nope */ + /* yes, we were launched via mpirun */ myenv = ORTE_SCHIZO_NATIVE_LAUNCHED; opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess"); opal_argv_append_nosize(&pushed_vals, "pmi"); @@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void) myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON; opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess"); opal_argv_append_nosize(&pushed_vals, "singleton"); + /* mark that we are in ORTE */ + opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION"); + opal_argv_append_nosize(&pushed_vals, "ORTE"); + setup: opal_output_verbose(1, orte_schizo_base_framework.framework_output, diff --git a/orte/mca/schizo/slurm/schizo_slurm.c b/orte/mca/schizo/slurm/schizo_slurm.c index 3f5bebe6ce..1038f69044 100644 --- a/orte/mca/schizo/slurm/schizo_slurm.c +++ b/orte/mca/schizo/slurm/schizo_slurm.c @@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void) myenv = ORTE_SCHIZO_NATIVE_LAUNCHED; opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess"); opal_argv_append_nosize(&pushed_vals, "pmi"); + /* mark that we are native */ + opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION"); + opal_argv_append_nosize(&pushed_vals, "NATIVE"); goto setup; } @@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void) return myenv; } + /* mark that we are in SLURM */ + opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION"); + opal_argv_append_nosize(&pushed_vals, "SLURM"); + /* we are in an allocation, but were we direct launched * or are we a singleton? */ if (NULL == getenv("SLURM_STEP_ID")) {