1
1

Merge pull request from rhc54/topic/warn

Attempt to detect when we are direct-launched without the necessary P…
Этот коммит содержится в:
Ralph Castain 2017-06-29 16:53:12 -07:00 коммит произвёл GitHub
родитель cb19296b71 bd4a6fee22
Коммит 7cbea77238
7 изменённых файлов: 109 добавлений и 7 удалений

Просмотреть файл

@ -990,7 +990,7 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
error:
if (ret != OMPI_SUCCESS) {
/* Only print a message if one was not already printed */
if (NULL != error) {
if (NULL != error && OMPI_ERR_SILENT != ret) {
const char *err_msg = opal_strerror(ret);
opal_show_help("help-mpi-runtime.txt",
"mpi_init:startup:internal-failure", true,

Просмотреть файл

@ -49,3 +49,43 @@ MCA parameter:
param: %s
This is not a recognized signal value. Please fix or remove it.
#
[slurm-error]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM's PMI support and therefore cannot
execute. There are several options for building PMI support under
SLURM, depending upon the SLURM version you are using:
Version 16.05 or later: you can use SLURM's PMIx support. This
requires that you configure and build SLURM --with-pmix.
Versions earlier than 16.05: you must use either SLURM's PMI-1 or
PMI-2 support. SLURM builds PMI-1 by default, or you can manually
install PMI-2. You must then build Open MPI using --with-pmi pointing
to the SLURM PMI library location.
Please configure as appropriate and try again.
#
[slurm-error2]
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM support. This usually happens
when OMPI was not configured --with-slurm and we weren't able
to discover a SLURM installation in the usual places.
Please configure as appropriate and try again.
#
[alps-error]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS PMI support and therefore cannot
execute. You must build Open MPI using --with-pmi pointing
to the ALPS PMI library location.
Please configure as appropriate and try again.
#
[alps-error2]
The application appears to have been direct launched using "aprun",
but OMPI was not built with ALPS support. This usually happens
when OMPI was not configured --with-alps and we weren't able
to discover an ALPS installation in the usual places.
Please configure as appropriate and try again.

Просмотреть файл

@ -52,6 +52,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
@ -125,7 +126,24 @@ static int rte_init(void)
opal_pmix_base_set_evbase(orte_event_base);
/* initialize the selected module */
if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) {
/* we cannot run */
/* we cannot run - this could be due to being direct launched
* without the required PMI support being built. Try to detect
* that scenario and warn the user */
if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() &&
NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) {
if (0 == strcmp(envar, "SLURM")) {
/* yes to both - so emit a hopefully helpful
* error message and abort */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "slurm-error", true);
return ORTE_ERR_SILENT;
} else if (0 == strcmp(envar, "ALPS")) {
/* we were direct launched by ALPS */
orte_show_help_finalize();
orte_show_help("help-ess-base.txt", "alps-error", true);
return ORTE_ERR_SILENT;
}
}
error = "pmix init";
goto error;
}

Просмотреть файл

@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -33,6 +33,7 @@
#include "opal/mca/pmix/base/base.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/schizo/schizo.h"
#include "orte/mca/ess/ess.h"
@ -131,6 +132,32 @@ static int component_query(mca_base_module_t **module, int *priority)
return ORTE_ERROR;
}
/* we may be incorrectly trying to run as a singleton - e.g.,
* someone direct-launched us under SLURM without building
* ORTE --with-slurm or in a slurm environment (so we didn't
* autodetect slurm). Try to detect that here. Sadly, we
* cannot just use the schizo framework to help us here as
* the corresponding schizo component may not have even
* been built. So we have to do things a little uglier */
if (ORTE_SCHIZO_UNMANAGED_SINGLETON == ret) {
/* see if we are in a SLURM allocation */
if (NULL != getenv("SLURM_NODELIST")) {
/* emit a hopefully helpful error message and abort */
orte_show_help("help-ess-base.txt", "slurm-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
/* see if we are under ALPS */
if (NULL != getenv("ALPS_APP_ID")) {
orte_show_help("help-ess-base.txt", "alps-error2", true);
*module = NULL;
*priority = 0;
return ORTE_ERR_SILENT;
}
}
/* okay, we want to be selected as we must be a singleton */
*priority = 100;
*module = (mca_base_module_t *)&orte_ess_singleton_module;
@ -142,4 +169,3 @@ static int component_close(void)
{
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -65,9 +65,16 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* launch performance penalty for hwloc at high ppn on knl */
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX "orte_bound_at_launch");
opal_argv_append_nosize(&pushed_vals, "true");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}
/* mark that we are on ALPS */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ALPS");
/* see if we are running in a Cray PAGG container */
fd = fopen(proc_job_file, "r");
if (NULL == fd) {

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -53,7 +53,7 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
* so no need to further check that here. Instead,
* see if we were direct launched vs launched via mpirun */
if (NULL != orte_process_info.my_daemon_uri) {
/* nope */
/* yes we were */
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
@ -65,6 +65,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_UNMANAGED_SINGLETON;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "singleton");
/* mark that we are in ORTE */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "ORTE");
setup:
opal_output_verbose(1, orte_schizo_base_framework.framework_output,

Просмотреть файл

@ -62,6 +62,9 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
myenv = ORTE_SCHIZO_NATIVE_LAUNCHED;
opal_argv_append_nosize(&pushed_envs, OPAL_MCA_PREFIX"ess");
opal_argv_append_nosize(&pushed_vals, "pmi");
/* mark that we are native */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "NATIVE");
goto setup;
}
@ -72,6 +75,10 @@ static orte_schizo_launch_environ_t check_launch_environment(void)
return myenv;
}
/* mark that we are in SLURM */
opal_argv_append_nosize(&pushed_envs, "ORTE_SCHIZO_DETECTION");
opal_argv_append_nosize(&pushed_vals, "SLURM");
/* we are in an allocation, but were we direct launched
* or are we a singleton? */
if (NULL == getenv("SLURM_STEP_ID")) {