eee9f7ae3a
It turns out that there is an incompatibility between the Cray PMI library and the default configuration for building Open MPI (master). To work around this, we now disable use of aprun for direct launch of Open MPI jobs except under specific conditions. The problem is that there are now (on master) packages getting initialized that do not work properly across a fork operation. As part of a constructor in the Cray PMI library, a fork operation is done to simplify use of shared memory between the processes in a job on the same node. This ends up thoroughly messing up the Open MPI initialization process in the case that dlopen support is enabled. The initialization process gets about half-way through when the PMIX framework is opened and components are loaded, which triggers the Cray PMI constructor and hence the fork operation. There are two workarounds for this: 1) configure Open MPI for Cray XE/XC systems using aprun with the --disable-dlopen option 2) set the PMI_NO_FORK environment variable in the shell in which the aprun command is run. Without taking these measures, an Open MPI job will just hang at job startup in the first attempt to "thread-shift" the PMIx fence_nb operation. Additional hangs occur at shutdown if this problem is worked around, again due to the insertion of a fork operation halfway through the Open MPI initialization procedure. This commit detects if the conditions that bring out the hang situation are present, and if so, prints out a message and aborts the job launch. Note that on systems using Slurm, the PMI_NO_FORK environment variable is set as part of the srun job launch, hence this issue is avoided on those systems. Signed-off-by: Howard Pritchard <howardp@lanl.gov>
145 строки
4.5 KiB
C
145 строки
4.5 KiB
C
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
|
/*
|
|
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
|
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
|
|
* reserved.
|
|
* Copyright (c) 2016 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*
|
|
* These symbols are in a file by themselves to provide nice linker
|
|
* semantics. Since linkers generally pull in symbols by object
|
|
* files, keeping these symbols as the only symbols in this file
|
|
* prevents utility programs such as "ompi_info" from having to import
|
|
* entire components just to query their version and parameters.
|
|
*/
|
|
|
|
#include "opal_config.h"
|
|
|
|
#include "opal/constants.h"
|
|
#include "opal/mca/pmix/pmix.h"
|
|
#include "opal/util/show_help.h"
|
|
#include "pmix_cray.h"
|
|
#include <sys/syscall.h>
|
|
#include <pmi.h>
|
|
|
|
/*
|
|
* Public string showing the pmix cray component version number
|
|
*/
|
|
const char *opal_pmix_cray_component_version_string =
|
|
"OPAL cray pmix MCA component version " OPAL_VERSION;
|
|
|
|
/*
|
|
* Local function
|
|
*/
|
|
static int pmix_cray_component_open(void);
|
|
static int pmix_cray_component_query(mca_base_module_t **module, int *priority);
|
|
static int pmix_cray_component_close(void);
|
|
|
|
|
|
/*
|
|
* Instantiate the public struct with all of our public information
|
|
* and pointers to our public functions in it
|
|
*/
|
|
|
|
opal_pmix_cray_component_t mca_pmix_cray_component = {
|
|
{
|
|
/* First, the mca_component_t struct containing meta information
|
|
about the component itself */
|
|
|
|
.base_version = {
|
|
/* Indicate that we are a pmix v1.1.0 component (which also
|
|
implies a specific MCA version) */
|
|
|
|
OPAL_PMIX_BASE_VERSION_2_0_0,
|
|
|
|
/* Component name and version */
|
|
|
|
.mca_component_name = "cray",
|
|
MCA_BASE_MAKE_VERSION(component, OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
|
|
OPAL_RELEASE_VERSION),
|
|
|
|
/* Component open and close functions */
|
|
|
|
.mca_open_component = pmix_cray_component_open,
|
|
.mca_close_component = pmix_cray_component_close,
|
|
.mca_query_component = pmix_cray_component_query,
|
|
},
|
|
/* Next the MCA v1.0.0 component meta data */
|
|
.base_data = {
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
}
|
|
},
|
|
.cache_local = NULL,
|
|
.cache_global = NULL,
|
|
};
|
|
|
|
static int pmix_cray_component_open(void)
|
|
{
|
|
/*
|
|
* Turns out that there's a lot of reliance on libevent
|
|
* and the default behavior of Cray PMI to fork
|
|
* in a constructor breaks libevent.
|
|
*
|
|
* Open MPI will not launch correctly on Cray XE/XC systems
|
|
* under these conditions:
|
|
*
|
|
* 1) direct launch using aprun, and
|
|
* 2) PMI_NO_FORK env. variable is not set, nor was
|
|
* 3) --disable-dlopen used as part of configury
|
|
*
|
|
* Under SLURM, PMI_NO_FORK is always set, so we can combine
|
|
* the check for conditions 1) and 2) together
|
|
*/
|
|
|
|
#if OPAL_ENABLE_DLOPEN_SUPPORT
|
|
if (NULL == getenv("PMI_NO_FORK")) {
|
|
opal_show_help("help-pmix-cray.txt", "aprun-not-supported", true);
|
|
exit(-1);
|
|
}
|
|
#endif
|
|
return OPAL_SUCCESS;
|
|
}
|
|
|
|
/*
 * Component query function.
 *
 * Decides whether the Cray PMI based module should be offered.
 *
 * @param module    (OUT) set to the cray pmix module on success,
 *                  NULL otherwise
 * @param priority  (OUT) set to 90 on success, 0 otherwise
 *
 * @return OPAL_SUCCESS when the module is offered, OPAL_ERROR when the
 *         component disqualifies itself.
 *
 * The component disqualifies itself when any of these holds:
 *   - the OMPI_NO_USE_CRAY_PMI environment variable is set,
 *   - /proc/job cannot be opened (not running in a Cray PAGG container),
 *   - /proc/self/task/<tid>/task_is_app cannot be opened (in a PAGG
 *     container, but not an app task -- e.g. a process running on a
 *     mom node).
 *
 * Every disqualified path returns OPAL_ERROR with *priority == 0 and
 * *module == NULL, and every stream opened here is closed before
 * returning.
 */
static int pmix_cray_component_query(mca_base_module_t **module, int *priority)
{
    const char proc_job_file[] = "/proc/job";
    char task_is_app_fname[PATH_MAX];
    FILE *fd = NULL;
    FILE *fd_task_is_app = NULL;

    /* Start out disqualified so that every early return below leaves
       the output parameters in a well-defined state. */
    *priority = 0;
    *module = NULL;

    /* Only probe for the PAGG container; the stream's contents are not
       needed, so close it right away. */
    fd = fopen(proc_job_file, "r");
    if (NULL != fd) {
        fclose(fd);
    }

    /* Disqualify ourselves if not running in a Cray PAGG container, or
       if use of Cray PMI has been explicitly turned off. */
    if (NULL == fd || NULL != getenv("OMPI_NO_USE_CRAY_PMI")) {
        return OPAL_ERROR;
    }

    snprintf(task_is_app_fname, sizeof(task_is_app_fname),
             "/proc/self/task/%ld/task_is_app", syscall(SYS_gettid));

    fd_task_is_app = fopen(task_is_app_fname, "r");
    if (NULL == fd_task_is_app) {
        /* In a PAGG container but not an app task: do not offer the
           module. */
        return OPAL_ERROR;
    }
    fclose(fd_task_is_app);

    /* We are in a PAGG container and we are an app task, so give
       Cray PMI a shot. */
    *priority = 90;
    *module = (mca_base_module_t *)&opal_pmix_cray_module;

    return OPAL_SUCCESS;
}
|
|
|
|
static int pmix_cray_component_close(void)
|
|
{
|
|
return OPAL_SUCCESS;
|
|
}
|