1
1

Merge pull request #1257 from hppritcha/topic/disable_mpirun_for_native_slurm_crayxc

plm/alps: only use srun for Native SLURM
Этот коммит содержится в:
Howard Pritchard 2015-12-22 21:07:41 -07:00
родитель 5ec5bd08c1 39367ca0bf
Коммит 2362bf0c0c
4 изменённых файлов: 29 добавлений и 15 удалений

Просмотреть файл

@ -39,3 +39,7 @@ the map for this application. This can be caused by a lack of
an allocation, or by an error in the Open MPI code. Please check
to ensure you have an ALPS allocation. If you do, then please pass
the error to the Open MPI user's mailing list for assistance.
#
[slurm-not-supported]
mpirun is not a supported launcher on Cray XC using Native SLURM.
srun must be used to launch jobs on these systems.

Просмотреть файл

@ -46,6 +46,7 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
mca_plm_alps_component;
ORTE_DECLSPEC extern orte_plm_base_module_t
orte_plm_alps_module;
/* Cleared to false by the component query when wlm_detect reports SLURM as
 * the active workload manager; the ALPS PLM module shows the
 * "slurm-not-supported" help message and exits when it is false. */
extern bool mca_plm_alps_using_aprun;
END_C_DECLS

Просмотреть файл

@ -43,6 +43,7 @@
*/
const char *mca_plm_alps_component_version_string =
"Open MPI alps plm MCA component version " ORTE_VERSION;
/* Starts out true; the component query in this file clears it when SLURM
 * is detected as the active workload manager. */
bool mca_plm_alps_using_aprun = true;
/*
@ -136,28 +137,19 @@ static int plm_alps_open(void)
/**
 * Component query: offer the ALPS PLM module for selection.
 *
 * The module is always offered at the component's configured priority.
 * When built with CRAY_WLM_DETECT and the active workload manager is
 * reported as "SLURM", mca_plm_alps_using_aprun is set to false.
 *
 * @param module   [out] receives a pointer to orte_plm_alps_module
 * @param priority [out] receives mca_plm_alps_component.priority
 * @return ORTE_SUCCESS
 */
static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
{
#if CRAY_WLM_DETECT
    const char *active_wlm = wlm_detect_get_active();

    /* Passing a NULL pointer to strcmp() is undefined behavior, so only
     * compare when the detection call actually produced a name. */
    if (NULL != active_wlm && 0 == strcmp("SLURM", active_wlm)) {
        mca_plm_alps_using_aprun = false;
    }
#endif

    /* The component stays selectable even under SLURM; the flag set above
     * is what records that aprun is not in use. */
    *priority = mca_plm_alps_component.priority;
    *module = (mca_base_module_t *) &orte_plm_alps_module;
    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:alps: available for selection",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -121,6 +121,23 @@ static int plm_alps_init(void)
return rc;
}
/*
* Owing to the way the SLURM PLM component works, we can't use
* it on Cray XC systems as currently designed. The problem
* is that the MPI processes launched on the head node (where
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect.
*
* So for now, we just don't support mpirun launched jobs
* on Cray XC systems using Native SLURM.
*/
if (false == mca_plm_alps_using_aprun) {
orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
exit(-1);
}
if (orte_do_not_launch) {
/* must map daemons since we won't be launching them */
orte_plm_globals.daemon_nodes_assigned_at_launch = true;