Attempt to detect when we are direct-launched without the necessary PMI support, and thus are incorrectly identified as being "singleton". Advise the user on the required PMI(x) support and error out.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Этот коммит содержится в:
родитель
7e5e5fe887
Коммит
bd4a6fee22
@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
|
|||||||
error:
|
error:
|
||||||
if (ret != OMPI_SUCCESS) {
|
if (ret != OMPI_SUCCESS) {
|
||||||
/* Only print a message if one was not already printed */
|
/* Only print a message if one was not already printed */
|
||||||
if (NULL != error) {
|
if (NULL != error && OMPI_ERR_SILENT != ret) {
|
||||||
const char *err_msg = opal_strerror(ret);
|
const char *err_msg = opal_strerror(ret);
|
||||||
opal_show_help("help-mpi-runtime.txt",
|
opal_show_help("help-mpi-runtime.txt",
|
||||||
"mpi_init:startup:internal-failure", true,
|
"mpi_init:startup:internal-failure", true,
|
||||||
|
@ -49,3 +49,43 @@ MCA parameter:
|
|||||||
param: %s
|
param: %s
|
||||||
|
|
||||||
This is not a recognized signal value. Please fix or remove it.
|
This is not a recognized signal value. Please fix or remove it.
|
||||||
|
#
|
||||||
|
[slurm-error]
|
||||||
|
The application appears to have been direct launched using "srun",
|
||||||
|
but OMPI was not built with SLURM's PMI support and therefore cannot
|
||||||
|
execute. There are several options for building PMI support under
|
||||||
|
SLURM, depending upon the SLURM version you are using:
|
||||||
|
|
||||||
|
Version 16.05 or later: you can use SLURM's PMIx support. This
|
||||||
|
requires that you configure and build SLURM --with-pmix.
|
||||||
|
|
||||||
|
Versions earlier than 16.05: you must use either SLURM's PMI-1 or
|
||||||
|
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
|
||||||
|
install PMI-2. You must then build Open MPI using --with-pmi pointing
|
||||||
|
to the SLURM PMI library location.
|
||||||
|
|
||||||
|
Please configure as appropriate and try again.
|
||||||
|
#
|
||||||
|
[slurm-error2]
|
||||||
|
The application appears to have been direct launched using "srun",
|
||||||
|
but OMPI was not built with SLURM support. This usually happens
|
||||||
|
when OMPI was not configured --with-slurm and we weren't able
|
||||||
|
to discover a SLURM installation in the usual places.
|
||||||
|
|
||||||
|
Please configure as appropriate and try again.
|
||||||
|
#
|
||||||
|
[alps-error]
|
||||||
|
The application appears to have been direct launched using "aprun",
|
||||||
|
but OMPI was not built with ALPS PMI support and therefore cannot
|
||||||
|
execute. You must build Open MPI using --with-pmi pointing
|
||||||
|
to the ALPS PMI library location.
|
||||||
|
|
||||||
|
Please configure as appropriate and try again.
|
||||||
|
#
|
||||||
|
[alps-error2]
|
||||||
|
The application appears to have been direct launched using "aprun",
|
||||||
|
but OMPI was not built with ALPS support. This usually happens
|
||||||
|
when OMPI was not configured --with-alps and we weren't able
|
||||||
|
to discover an ALPS installation in the usual places.
|
||||||
|
|
||||||
|
Please configure as appropriate and try again.
|
||||||
|
@ -52,6 +52,7 @@
|
|||||||
#include "orte/mca/errmgr/errmgr.h"
|
#include "orte/mca/errmgr/errmgr.h"
|
||||||
#include "orte/mca/grpcomm/grpcomm.h"
|
#include "orte/mca/grpcomm/grpcomm.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
|
#include "orte/mca/schizo/schizo.h"
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/show_help.h"
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
@ -125,7 +126,24 @@ static int rte_init(void)
|
|||||||
opal_pmix_base_set_evbase(orte_event_base);
|
opal_pmix_base_set_evbase(orte_event_base);
|
||||||
/* initialize the selected module */
|
/* initialize the selected module */
|
||||||
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
|
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
|
||||||
/* we cannot run */
|
/* we cannot run - this could be due to being direct launched
|
||||||
|
* without the required PMI support being built. Try to detect
|
||||||
|
* that scenario and warn the user */
|
||||||
|
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
|
||||||
|
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
|
||||||
|
if (0 == strcmp(envar, "SLURM")) {
|
||||||
|
/* yes to both - so emit a hopefully helpful
|
||||||
|
* error message and abort */
|
||||||
|
orte_show_help_finalize();
|
||||||
|
orte_show_help("help-ess-base.txt", "slurm-error", true);
|
||||||
|
return ORTE_ERR_SILENT;
|
||||||
|
} else if (0 == strcmp(envar, "ALPS")) {
|
||||||
|
/* we were direct launched by ALPS */
|
||||||
|
orte_show_help_finalize();
|
||||||
|
orte_show_help("help-ess-base.txt", "alps-error", true);
|
||||||
|
return ORTE_ERR_SILENT;
|
||||||
|
}
|
||||||
|
}
|
||||||
error = "pmix init";
|
error = "pmix init";
|
||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -33,6 +33,7 @@
|
|||||||
#include "opal/mca/pmix/base/base.h"
|
#include "opal/mca/pmix/base/base.h"
|
||||||
|
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
|
#include "orte/util/show_help.h"
|
||||||
#include "orte/mca/schizo/schizo.h"
|
#include "orte/mca/schizo/schizo.h"
|
||||||
|
|
||||||
#include "orte/mca/ess/ess.h"
|
#include "orte/mca/ess/ess.h"
|
||||||
@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
|
|||||||
return ORTE_ERROR;
|
return ORTE_ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* we may be incorrectly trying to run as a singleton - e.g.,
|
||||||
|
* someone direct-launched us under SLURM without building
|
||||||
|
* ORTE --with-slurm or in a slurm environment (so we didn't
|
||||||
|
* autodetect slurm). Try to detect that here. Sadly, we
|
||||||
|
* cannot just use the schizo framework to help us here as
|
||||||
|
* the corresponding schizo component may not have even
|
||||||
|
* been built. So we have to do things a little uglier */
|
||||||
|
|
||||||
|
if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
|
||||||
|
/* see if we are in a SLURM allocation */
|
||||||
|
if (NULL != getenv("SLURM_NODELIST")) {
|
||||||
|
/* emit a hopefully helpful error message and abort */
|
||||||
|
orte_show_help("help-ess-base.txt", "slurm-error2", true);
|
||||||
|
*module = NULL;
|
||||||
|
*priority = 0;
|
||||||
|
return ORTE_ERR_SILENT;
|
||||||
|
}
|
||||||
|
/* see if we are under ALPS */
|
||||||
|
if (NULL != getenv("ALPS_APP_ID")) {
|
||||||
|
orte_show_help("help-ess-base.txt", "alps-error2", true);
|
||||||
|
*module = NULL;
|
||||||
|
*priority = 0;
|
||||||
|
return ORTE_ERR_SILENT;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* okay, we want to be selected as we must be a singleton */
|
/* okay, we want to be selected as we must be a singleton */
|
||||||
*priority = 100;
|
*priority = 100;
|
||||||
*module = (mca_base_module_t *)&orte_ess_singleton_module;
|
*module = (mca_base_module_t *)&orte_ess_singleton_module;
|
||||||
@ -142,4 +169,3 @@ static int component_close(void)
|
|||||||
{
|
{
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
|||||||
* launch performance penalty for hwloc at high ppn on knl */
|
* launch performance penalty for hwloc at high ppn on knl */
|
||||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
|
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
|
||||||
opal_argv_append_nosize(&pushed_vals, "true");
|
opal_argv_append_nosize(&pushed_vals, "true");
|
||||||
|
/* mark that we are native */
|
||||||
|
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||||
|
opal_argv_append_nosize(&pushed_vals, "NATIVE");
|
||||||
goto setup;
|
goto setup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mark that we are on ALPS */
|
||||||
|
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||||
|
opal_argv_append_nosize(&pushed_vals, "ALPS");
|
||||||
|
|
||||||
/* see if we are running in a Cray PAGG container */
|
/* see if we are running in a Cray PAGG container */
|
||||||
fd = fopen(proc_job_file, "r");
|
fd = fopen(proc_job_file, "r");
|
||||||
if (NULL == fd) {
|
if (NULL == fd) {
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2016 Intel, Inc. All rights reserved.
|
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
|||||||
* so no need to further check that here. Instead,
|
* so no need to further check that here. Instead,
|
||||||
* see if we were direct launched vs launched via mpirun */
|
* see if we were direct launched vs launched via mpirun */
|
||||||
if (NULL != orte_process_info.my_daemon_uri) {
|
if (NULL != orte_process_info.my_daemon_uri) {
|
||||||
/* nope */
|
/* yes we were */
|
||||||
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
||||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||||
opal_argv_append_nosize(&pushed_vals, "pmi");
|
opal_argv_append_nosize(&pushed_vals, "pmi");
|
||||||
@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
|||||||
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
|
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
|
||||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||||
opal_argv_append_nosize(&pushed_vals, "singleton");
|
opal_argv_append_nosize(&pushed_vals, "singleton");
|
||||||
|
/* mark that we are in ORTE */
|
||||||
|
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||||
|
opal_argv_append_nosize(&pushed_vals, "ORTE");
|
||||||
|
|
||||||
|
|
||||||
setup:
|
setup:
|
||||||
opal_output_verbose(1, orte_schizo_base_framework.framework_output,
|
opal_output_verbose(1, orte_schizo_base_framework.framework_output,
|
||||||
|
@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
|||||||
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
|
||||||
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
|
||||||
opal_argv_append_nosize(&pushed_vals, "pmi");
|
opal_argv_append_nosize(&pushed_vals, "pmi");
|
||||||
|
/* mark that we are native */
|
||||||
|
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||||
|
opal_argv_append_nosize(&pushed_vals, "NATIVE");
|
||||||
goto setup;
|
goto setup;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
|
|||||||
return myenv;
|
return myenv;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* mark that we are in SLURM */
|
||||||
|
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
|
||||||
|
opal_argv_append_nosize(&pushed_vals, "SLURM");
|
||||||
|
|
||||||
/* we are in an allocation, but were we direct launched
|
/* we are in an allocation, but were we direct launched
|
||||||
* or are we a singleton? */
|
* or are we a singleton? */
|
||||||
if (NULL == getenv("SLURM_STEP_ID")) {
|
if (NULL == getenv("SLURM_STEP_ID")) {
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user