diff --git a/orte/mca/plm/alps/help-plm-alps.txt b/orte/mca/plm/alps/help-plm-alps.txt
index 78a49f79c5..f109299a86 100644
--- a/orte/mca/plm/alps/help-plm-alps.txt
+++ b/orte/mca/plm/alps/help-plm-alps.txt
@@ -39,3 +39,7 @@ the map for this application. This can be caused by a lack of an allocation,
 or by an error in the Open MPI code. Please check to ensure you have a ALPS
 allocation. If you do, then please pass the error to the Open MPI user's mailing
 list for assistance.
+#
+[slurm-not-supported]
+mpirun is not a supported launcher on Cray XC using Native SLURM.
+srun must be used to launch jobs on these systems.
diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h
index 4eadc2e232..d15ae07ffa 100644
--- a/orte/mca/plm/alps/plm_alps.h
+++ b/orte/mca/plm/alps/plm_alps.h
@@ -46,6 +46,7 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t mca_plm_alps_component;
 
 ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_alps_module;
 
+extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS
 
diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c
index f688632039..72162c03dc 100644
--- a/orte/mca/plm/alps/plm_alps_component.c
+++ b/orte/mca/plm/alps/plm_alps_component.c
@@ -43,6 +43,7 @@
  */
 const char *mca_plm_alps_component_version_string =
   "Open MPI alps plm MCA component version " ORTE_VERSION;
+bool mca_plm_alps_using_aprun = {true};
 
 
 /*
@@ -136,28 +137,19 @@ static int plm_alps_open(void)
 
 static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
 {
-    int alps_wlm_active = 1;
 #if CRAY_WLM_DETECT
     char slurm[]="SLURM";
 
     if(!strcmp(slurm,wlm_detect_get_active())) {
-        alps_wlm_active = 0;
+        mca_plm_alps_using_aprun = false;
     }
 #endif
 
-    if (alps_wlm_active) {
-        *priority = mca_plm_alps_component.priority;
-        *module = (mca_base_module_t *) &orte_plm_alps_module;
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
-                             "%s plm:alps: available for selection",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    } else {
-        *priority = 0;
-        *module = NULL;
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
-                             "%s plm:alps: not available, slurm present",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    }
+    *priority = mca_plm_alps_component.priority;
+    *module = (mca_base_module_t *) &orte_plm_alps_module;
+    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
+                         "%s plm:alps: available for selection",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
     return ORTE_SUCCESS;
 }
diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c
index 48cbc16514..8cf9c287fe 100644
--- a/orte/mca/plm/alps/plm_alps_module.c
+++ b/orte/mca/plm/alps/plm_alps_module.c
@@ -121,6 +121,23 @@ static int plm_alps_init(void)
         return rc;
     }
 
+    /*
+     * Owing to the way the SLURM PLM component works, we can't use
+     * it on Cray XC systems as currently designed. The problem
+     * is that the MPI processes launched on the head node (where
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect.
+     *
+     * So for now, we just don't support mpirun-launched jobs
+     * on Cray XC systems using Native SLURM.
+     */
+    if (false == mca_plm_alps_using_aprun) {
+        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
+        exit(-1);
+    }
+
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
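
Not part of the patch: a minimal, hypothetical sketch of the runtime detection the change relies on. It assumes a Cray XC node where the wlm_detect library (wlm_detect.h, the same interface used above under CRAY_WLM_DETECT) is available; the file name wlm_probe.c and the printed text are illustrative only.

    /* wlm_probe.c -- report the active workload manager and whether an
     * aprun/ALPS-style launch (and therefore mpirun) would be usable. */
    #include <stdio.h>
    #include <string.h>
    #include <wlm_detect.h>

    int main(void)
    {
        /* wlm_detect_get_active() returns the active WLM name,
         * e.g. "ALPS" or "SLURM", as queried in plm_alps_component.c above. */
        const char *active = wlm_detect_get_active();
        int using_aprun = (NULL != active) && (0 != strcmp(active, "SLURM"));

        printf("active WLM: %s -> %s\n",
               (NULL != active) ? active : "(unknown)",
               using_aprun ? "aprun launch path (mpirun supported)"
                           : "Native SLURM (use srun, not mpirun)");
        return 0;
    }

Under Native SLURM the sketch takes the same branch that the patch guards against in plm_alps_init(): detection happens at component query time, and the hard failure with the slurm-not-supported help message is deferred to module init.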