From 39367ca0bf56a316682d2e1a690fb50d3ab6f244 Mon Sep 17 00:00:00 2001
From: Howard Pritchard
Date: Tue, 22 Dec 2015 10:48:32 -0800
Subject: [PATCH] plm/alps: only use srun for Native SLURM

It turns out that the way the SLURM PLM works is not compatible with
the way MPI processes on Cray XC obtain RDMA credentials for the high
speed network.  Unlike with ALPS, the mpirun process runs on the first
compute node in the job, and with the current PLM launch system,
mpirun (the HNP daemon) launches the MPI ranks on that node itself
rather than relying on srun.  Reworking this to support Native SLURM
on Cray XC systems will probably require a significant amount of
effort.

As a short term alternative, have the alps PLM (which again gets
selected by default on Cray systems regardless of the launch system)
check whether ALPS or Native SLURM is in use on the system.  If ALPS
is not being used, print a helpful message for the user and abort the
job launch.

Signed-off-by: Howard Pritchard
---
 orte/mca/plm/alps/help-plm-alps.txt    |  4 ++++
 orte/mca/plm/alps/plm_alps.h           |  1 +
 orte/mca/plm/alps/plm_alps_component.c | 22 +++++++---------------
 orte/mca/plm/alps/plm_alps_module.c    | 17 +++++++++++++++++
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/orte/mca/plm/alps/help-plm-alps.txt b/orte/mca/plm/alps/help-plm-alps.txt
index 78a49f79c5..f109299a86 100644
--- a/orte/mca/plm/alps/help-plm-alps.txt
+++ b/orte/mca/plm/alps/help-plm-alps.txt
@@ -39,3 +39,7 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
+#
+[slurm-not-supported]
+mpirun is not a supported launcher on Cray XC using Native SLURM.
+srun must be used to launch jobs on these systems.
diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h
index 4eadc2e232..d15ae07ffa 100644
--- a/orte/mca/plm/alps/plm_alps.h
+++ b/orte/mca/plm/alps/plm_alps.h
@@ -46,6 +46,7 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
     mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t
     orte_plm_alps_module;
+extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS
 
diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c
index f688632039..72162c03dc 100644
--- a/orte/mca/plm/alps/plm_alps_component.c
+++ b/orte/mca/plm/alps/plm_alps_component.c
@@ -43,6 +43,7 @@
  */
 const char *mca_plm_alps_component_version_string =
   "Open MPI alps plm MCA component version " ORTE_VERSION;
+bool mca_plm_alps_using_aprun = {true};
 
 
 /*
@@ -136,28 +137,19 @@ static int plm_alps_open(void)
 
 static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority)
 {
-    int alps_wlm_active = 1;
 #if CRAY_WLM_DETECT
     char slurm[]="SLURM";
 
     if(!strcmp(slurm,wlm_detect_get_active())) {
-        alps_wlm_active = 0;
+        mca_plm_alps_using_aprun = false;
     }
 #endif
 
-    if (alps_wlm_active) {
-        *priority = mca_plm_alps_component.priority;
-        *module = (mca_base_module_t *) &orte_plm_alps_module;
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
-                             "%s plm:alps: available for selection",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    } else {
-        *priority = 0;
-        *module = NULL;
-        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
-                             "%s plm:alps: not available, slurm present",
-                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
-    }
+    *priority = mca_plm_alps_component.priority;
+    *module = (mca_base_module_t *) &orte_plm_alps_module;
+    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
+                         "%s plm:alps: available for selection",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
     return ORTE_SUCCESS;
 }
diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c
index 48cbc16514..8cf9c287fe 100644
--- a/orte/mca/plm/alps/plm_alps_module.c
+++ b/orte/mca/plm/alps/plm_alps_module.c
@@ -121,6 +121,23 @@ static int plm_alps_init(void)
         return rc;
     }
 
+    /*
+     * Owing to the way the SLURM PLM component works, we can't use
+     * it on Cray XC systems as currently designed.  The problem is
+     * that the MPI processes launched on the head node (where
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon.  This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect.
+     *
+     * So for now, we just don't support mpirun launched jobs
+     * on Cray XC systems using Native SLURM.
+     */
+    if (false == mca_plm_alps_using_aprun) {
+        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
+        exit(-1);
+    }
+
    if (orte_do_not_launch) {
        /* must map daemons since we won't be launching them */
        orte_plm_globals.daemon_nodes_assigned_at_launch = true;
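
For readers without a Cray system at hand, below is a minimal standalone sketch
of the detection gate this patch adds.  It is not part of the patch: it assumes
Cray's wlm_detect library and its wlm_detect_get_active() routine (the same call
used in plm_alps_component.c above) are available, and main(), the compile line,
and the printed strings are illustrative only.  On a Cray login node it would be
built roughly as "cc -DCRAY_WLM_DETECT=1 wlm_gate.c -lwlm_detect"; elsewhere the
CRAY_WLM_DETECT branch simply compiles out.

/* wlm_gate.c: sketch of the ALPS-vs-Native-SLURM gate used by the alps PLM.
 * Assumes Cray's wlm_detect library when CRAY_WLM_DETECT is set.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if CRAY_WLM_DETECT
#include <wlm_detect.h>   /* provides wlm_detect_get_active() */
#endif

int main(void)
{
    int using_aprun = 1;   /* mirrors mca_plm_alps_using_aprun, default true */

#if CRAY_WLM_DETECT
    /* wlm_detect_get_active() returns the name of the active workload
     * manager as a string, e.g. "ALPS" or "SLURM". */
    if (!strcmp("SLURM", wlm_detect_get_active())) {
        using_aprun = 0;
    }
#endif

    if (!using_aprun) {
        /* The real module calls orte_show_help() with the
         * slurm-not-supported message before aborting. */
        fprintf(stderr, "mpirun is not a supported launcher on "
                        "Cray XC using Native SLURM.\n");
        exit(-1);
    }

    printf("ALPS detected; aprun-based launch can proceed.\n");
    return 0;
}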