Attempt to detect when we are direct-launched without the necessary PMI support, and thus are incorrectly identified as being "singleton". Advise the user on the required PMI(x) support and error out.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
7e5e5fe887
Коммит
bd4a6fee22
ompi/runtime
orte/mca
ess
schizo
@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
||||
error:
|
||||
if (ret != OMPI_SUCCESS) {
|
||||
/* Only print a message if one was not already printed */
|
||||
if (NULL != error) {
|
||||
if (NULL != error && OMPI_ERR_SILENT != ret) {
|
||||
const char *err_msg = opal_strerror(ret);
|
||||
opal_show_help("help-mpi-runtime.txt",
|
||||
"mpi_init:startup:internal-failure", true,
|
||||
|
@ -49,3 +49,43 @@ MCA parameter:
|
||||
param: %s
|
||||
|
||||
This is not a recognized signal value. Please fix or remove it.
|
||||
#
|
||||
[slurm-error]
|
||||
The application appears to have been direct launched using "srun",
|
||||
but OMPI was not built with SLURM's PMI support and therefore cannot
|
||||
execute. There are several options for building PMI support under
|
||||
SLURM, depending upon the SLURM version you are using:
|
||||
|
||||
Version 16.05 or later: you can use SLURM's PMIx support. This
|
||||
requires that you configure and build SLURM --with-pmix.
|
||||
|
||||
Versions earlier than 16.05: you must use either SLURM's PMI-1 or
|
||||
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
|
||||
install PMI-2. You must then build Open MPI using --with-pmi pointing
|
||||
to the SLURM PMI library location.
|
||||
|
||||
Please configure as appropriate and try again.
|
||||
#
|
||||
[slurm-error2]
|
||||
The application appears to have been direct launched using "srun",
|
||||
but OMPI was not built with SLURM support. This usually happens
|
||||
when OMPI was not configured --with-slurm and we weren't able
|
||||
to discover a SLURM installation in the usual places.
|
||||
|
||||
Please configure as appropriate and try again.
|
||||
#
|
||||
[alps-error]
|
||||
The application appears to have been direct launched using "aprun",
|
||||
but OMPI was not built with ALPS PMI support and therefore cannot
|
||||
execute. You must build Open MPI using --with-pmi pointing
|
||||
to the ALPS PMI library location.
|
||||
|
||||
Please configure as appropriate and try again.
|
||||
#
|
||||
[alps-error2]
|
||||
The application appears to have been direct launched using "aprun",
|
||||
but OMPI was not built with ALPS support. This usually happens
|
||||
when OMPI was not configured --with-alps and we weren't able
|
||||
to discover an ALPS installation in the usual places.
|
||||
|
||||
Please configure as appropriate and try again.
|
||||
|
@ -52,6 +52,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/schizo/schizo.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -125,7 +126,24 @@ static int rte_init(void)
|
||||
opal_pmix_base_set_evbase(orte_event_base);
|
||||
/* initialize the selected module */
|
||||
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
|
||||
/* we cannot run */
|
||||
/* we cannot run - this could be due to being direct launched
|
||||
* without the required PMI support being built. Try to detect
|
||||
* that scenario and warn the user */
|
||||
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
|
||||
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
|
||||
if (0 == strcmp(envar, "SLURM")) {
|
||||
/* yes to both - so emit a hopefully helpful
|
||||
* error message and abort */
|
||||
orte_show_help_finalize();
|
||||
orte_show_help("help-ess-base.txt", "slurm-error", true);
|
||||
return ORTE_ERR_SILENT;
|
||||
} else if (0 == strcmp(envar, "ALPS")) {
|
||||
/* we were direct launched by ALPS */
|
||||
orte_show_help_finalize();
|
||||
orte_show_help("help-ess-base.txt", "alps-error", true);
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
error = "pmix init";
|
||||
goto error;
|
||||
}
|
||||
|
@ -12,7 +12,7 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -33,6 +33,7 @@
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/schizo/schizo.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
||||
/* we may be incorrectly trying to run as a singleton - e.g.,
|
||||
* someone direct-launched us under SLURM without building
|
||||
* ORTE --with-slurm or in a slurm environment (so we didn't
|
||||
* autodetect slurm). Try to detect that here. Sadly, we
|
||||
* cannot just use the schizo framework to help us here as
|
||||
* the corresponding schizo component may not have even
|
||||
* been built. So we have to do things a little uglier */
|
||||
|
||||
if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
|
||||
/* see if we are in a SLURM allocation */
|
||||
if (NULL != getenv("SLURM_NODELIST")) {
|
||||
/* emit a hopefully helpful error message and abort */
|
||||
orte_show_help("help-ess-base.txt", "slurm-error2", true);
|
||||
*module = NULL;
|
||||
*priority = 0;
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
/* see if we are under ALPS */
|
||||
if (NULL != getenv("ALPS_APP_ID")) {
|
||||
orte_show_help("help-ess-base.txt", "alps-error2", true);
|
||||
*module = NULL;
|
||||
*priority = 0;
|
||||
return ORTE_ERR_SILENT;
|
||||
}
|
||||
}
|
||||
|
||||
/* okay, we want to be selected as we must be a singleton */
|
||||
*priority = 100;
|
||||
*module = (mca_base_module_t *)&orte_ess_singleton_module;
|
||||
@ -142,4 +169,3 @@ static int component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
* launch performance penalty for hwloc at high ppn on knl */
|
||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
|
||||
opal_argv_append_nosize(&pushed_vals, "true");
|
||||
/* mark that we are native */
|
||||
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||
opal_argv_append_nosize(&pushed_vals, "NATIVE");
|
||||
goto setup;
|
||||
}
|
||||
|
||||
/* mark that we are on ALPS */
|
||||
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||
opal_argv_append_nosize(&pushed_vals, "ALPS");
|
||||
|
||||
/* see if we are running in a Cray PAGG container */
|
||||
fd = fopen(proc_job_file, "r");
|
||||
if (NULL == fd) {
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
* so no need to further check that here. Instead,
|
||||
* see if we were direct launched vs launched via mpirun */
|
||||
if (NULL != orte_process_info.my_daemon_uri) {
|
||||
/* nope */
|
||||
/* yes we were */
|
||||
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||
opal_argv_append_nosize(&pushed_vals, "pmi");
|
||||
@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
|
||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||
opal_argv_append_nosize(&pushed_vals, "singleton");
|
||||
/* mark that we are in ORTE */
|
||||
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||
opal_argv_append_nosize(&pushed_vals, "ORTE");
|
||||
|
||||
|
||||
setup:
|
||||
opal_output_verbose(1, orte_schizo_base_framework.framework_output,
|
||||
|
@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||
opal_argv_append_nosize(&pushed_vals, "pmi");
|
||||
/* mark that we are native */
|
||||
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||
opal_argv_append_nosize(&pushed_vals, "NATIVE");
|
||||
goto setup;
|
||||
}
|
||||
|
||||
@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
||||
return myenv;
|
||||
}
|
||||
|
||||
/* mark that we are in SLURM */
|
||||
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||
opal_argv_append_nosize(&pushed_vals, "SLURM");
|
||||
|
||||
/* we are in an allocation, but were we direct launched
|
||||
* or are we a singleton? */
|
||||
if (NULL == getenv("SLURM_STEP_ID")) {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user