Merge pull request #1257 from hppritcha/topic/disable_mpirun_for_native_slurm_crayxc
plm/alps: only use srun for Native SLURM
This commit is contained in:
Commit 2362bf0c0c
@@ -39,3 +39,7 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
+#
+[slurm-not-supported]
+mpirun is not a supported launcher on Cray XC using Native SLURM.
+srun must be used to launch jobs on these systems.
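For context, the new [slurm-not-supported] section is consumed through ORTE's show-help facility via the orte_show_help() call that appears later in this diff. A minimal sketch (illustrative only; the wrapper function name is made up for the example):

#include "orte/util/show_help.h"

/* Illustrative helper, not part of the commit: orte_show_help() looks up
 * the "slurm-not-supported" topic in help-plm-alps.txt and prints it,
 * with the usual ORTE error header when the third argument is true. */
static void report_unsupported_launcher(void)
{
    orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
}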
@@ -46,6 +46,7 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
     mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t
     orte_plm_alps_module;
+extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS
 
@@ -43,6 +43,7 @@
  */
 const char *mca_plm_alps_component_version_string =
     "Open MPI alps plm MCA component version " ORTE_VERSION;
+bool mca_plm_alps_using_aprun = {true};
 
 
 /*
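Together with the header change above, this follows the usual extern-declaration / single-definition pattern for a flag shared across the component and module sources. A minimal sketch (the file placement is assumed, since the diff does not show file names):

#include <stdbool.h>

/* In the component's header (assumed): every PLM/ALPS source file sees the
 * declaration, so both the component and the module can test the flag. */
extern bool mca_plm_alps_using_aprun;

/* In the component source: the single definition.  "{true}" is an ordinary
 * brace-enclosed initializer for a scalar, equivalent to "= true". */
bool mca_plm_alps_using_aprun = {true};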
@@ -136,28 +137,19 @@ static int plm_alps_open(void)
 
 static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
 {
+    int alps_wlm_active = 1;
 #if CRAY_WLM_DETECT
     char slurm[]="SLURM";
 
     if(!strcmp(slurm,wlm_detect_get_active())) {
+        alps_wlm_active = 0;
+        mca_plm_alps_using_aprun = false;
     }
 #endif
 
+    if (alps_wlm_active) {
+        *priority = mca_plm_alps_component.priority;
+        *module = (mca_base_module_t *) &orte_plm_alps_module;
+        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
+                             "%s plm:alps: available for selection",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    } else {
+        *priority = 0;
+        *module = NULL;
+        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
+                             "%s plm:alps: not available, slurm present",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+    }
-    *priority = mca_plm_alps_component.priority;
-    *module = (mca_base_module_t *) &orte_plm_alps_module;
-    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
-                         "%s plm:alps: available for selection",
-                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
     return ORTE_SUCCESS;
 }
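The detection above relies on Cray's wlm_detect library. A self-contained sketch of the same check, assuming the <wlm_detect.h> header and libwlm_detect are available (as on a Cray XC) and that wlm_detect_get_active() returns the active workload manager's name as a string:

#include <stdio.h>
#include <string.h>
#include <wlm_detect.h>   /* Cray header assumed to declare wlm_detect_get_active() */

int main(void)
{
    /* Ask the Cray WLM detection library which workload manager owns the node. */
    char *active = wlm_detect_get_active();

    if (NULL != active && 0 == strcmp("SLURM", active)) {
        /* Native SLURM: aprun is not in charge, so an ALPS/mpirun launch is out. */
        printf("Native SLURM detected: launch jobs with srun\n");
    } else {
        printf("ALPS detected (%s): aprun-based launch available\n",
               NULL != active ? active : "unknown WLM");
    }
    return 0;
}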
@@ -121,6 +121,23 @@ static int plm_alps_init(void)
         return rc;
     }
 
+    /*
+     * owing to the way the SLURM PLM component works, we can't use
+     * it on Cray XC systems as currently designed. The problem
+     * is that the MPI processes launched on the head node (where
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect.
+     *
+     * So for now, we just don't support mpirun launched jobs
+     * on Cray XC systems using Native SLURM.
+     */
+    if (false == mca_plm_alps_using_aprun) {
+        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
+        exit(-1);
+    }
+
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;