From a29ca2bb0d7e0a1f4749fe88c1aa6bd09837d0a0 Mon Sep 17 00:00:00 2001
From: Ralph Castain
Date: Wed, 5 Apr 2017 17:32:39 -0700
Subject: [PATCH] Enable slurm operations on Cray with constraints

Clean up some errors in the nidmap code that caused us to send
unnecessary topologies

Signed-off-by: Ralph Castain
---
 orte/mca/plm/alps/help-plm-alps.txt      |  5 +----
 orte/mca/plm/alps/plm_alps.h             |  2 +-
 orte/mca/plm/alps/plm_alps_component.c   |  8 ++++++--
 orte/mca/plm/alps/plm_alps_module.c      | 17 -----------------
 orte/mca/plm/slurm/help-plm-slurm.txt    | 17 ++++++++++++++++-
 orte/mca/plm/slurm/plm_slurm.h           |  2 ++
 orte/mca/plm/slurm/plm_slurm_component.c | 10 ++++++++++
 orte/mca/plm/slurm/plm_slurm_module.c    | 21 ++++++++++++++++++++-
 orte/util/nidmap.c                       | 23 +++++++++++++++++------
 9 files changed, 73 insertions(+), 32 deletions(-)

diff --git a/orte/mca/plm/alps/help-plm-alps.txt b/orte/mca/plm/alps/help-plm-alps.txt
index f109299a86..c0e3d0470f 100644
--- a/orte/mca/plm/alps/help-plm-alps.txt
+++ b/orte/mca/plm/alps/help-plm-alps.txt
@@ -10,6 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
-#
-[slurm-not-supported]
-mpirun is not a supported launcher on Cray XC using Native SLURM.
-srun must be used to launch jobs on these systems.
diff --git a/orte/mca/plm/alps/plm_alps.h b/orte/mca/plm/alps/plm_alps.h
index d15ae07ffa..bdc039feda 100644
--- a/orte/mca/plm/alps/plm_alps.h
+++ b/orte/mca/plm/alps/plm_alps.h
@@ -9,6 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -46,7 +47,6 @@
 ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t orte_plm_alps_module;
 
-extern bool mca_plm_alps_using_aprun;
 
 END_C_DECLS
 
diff --git a/orte/mca/plm/alps/plm_alps_component.c b/orte/mca/plm/alps/plm_alps_component.c
index e474cd5913..f906a5cb1b 100644
--- a/orte/mca/plm/alps/plm_alps_component.c
+++ b/orte/mca/plm/alps/plm_alps_component.c
@@ -12,6 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -43,7 +44,6 @@
  */
 const char *mca_plm_alps_component_version_string =
   "Open MPI alps plm MCA component version " ORTE_VERSION;
 
-bool mca_plm_alps_using_aprun = {true};
 
 /*
@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori
     }
 
     if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
-        mca_plm_alps_using_aprun = false;
+        /* we are in a Cray SLURM environment, so we don't want
+         * this plm component */
+        *priority = 0;
+        *module = NULL;
+        return ORTE_ERROR;
     }
 #endif
 
diff --git a/orte/mca/plm/alps/plm_alps_module.c b/orte/mca/plm/alps/plm_alps_module.c
index 2592cf5363..61b1c32dba 100644
--- a/orte/mca/plm/alps/plm_alps_module.c
+++ b/orte/mca/plm/alps/plm_alps_module.c
@@ -121,23 +121,6 @@ static int plm_alps_init(void)
         return rc;
     }
 
-    /*
-     * owing to way the SLURM PLM component works, we can't use
-     * it on Cray XC systems as currently designed. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect.
-     *
-     * So for now, we just don't support mpirun launched jobs
-     * on Cray XC systems using Native SLURM.
-     */
-    if (false == mca_plm_alps_using_aprun) {
-        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
-        exit(-1);
-    }
-
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt
index 8c450c0a28..837c3e88a8 100644
--- a/orte/mca/plm/slurm/help-plm-slurm.txt
+++ b/orte/mca/plm/slurm/help-plm-slurm.txt
@@ -10,7 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
-# Copyright (c) 2014 Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -49,3 +49,18 @@
 are running. Please consult with your system administrator
 about obtaining such support.
 
+[no-local-support]
+The SLURM process starter cannot start processes local to
+mpirun when executing under a Cray environment. The problem
+is that mpirun is not itself a child of a slurmd daemon. Thus,
+any processes mpirun itself starts will inherit incorrect
+RDMA credentials.
+
+Your application will be mapped and run (assuming adequate
+resources) on the remaining allocated nodes. If adequate
+resources are not available, you will need to exit and obtain
+a larger allocation.
+
+This situation will be fixed in a future release. Meantime,
+you can turn "off" this warning by setting the plm_slurm_warning
+MCA param to 0.
diff --git a/orte/mca/plm/slurm/plm_slurm.h b/orte/mca/plm/slurm/plm_slurm.h
index eae239edf0..1e88ef60a8 100644
--- a/orte/mca/plm/slurm/plm_slurm.h
+++ b/orte/mca/plm/slurm/plm_slurm.h
@@ -9,6 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -29,6 +30,7 @@ BEGIN_C_DECLS
 
 struct orte_plm_slurm_component_t {
     orte_plm_base_component_t super;
     char *custom_args;
+    bool slurm_warning_msg;
 };
 typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;
diff --git a/orte/mca/plm/slurm/plm_slurm_component.c b/orte/mca/plm/slurm/plm_slurm_component.c
index 90d14dd24c..3e29bd4623 100644
--- a/orte/mca/plm/slurm/plm_slurm_component.c
+++ b/orte/mca/plm/slurm/plm_slurm_component.c
@@ -12,6 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
  * reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -28,7 +29,9 @@
 #include "orte_config.h"
 #include "orte/constants.h"
 
+#include "opal/util/opal_environ.h"
 #include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
 #include "orte/runtime/orte_globals.h"
 
 #include "orte/mca/plm/plm.h"
@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &mca_plm_slurm_component.custom_args);
 
+    mca_plm_slurm_component.slurm_warning_msg = true;
+    (void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_plm_slurm_component.slurm_warning_msg);
+
     return ORTE_SUCCESS;
 }
 
diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c
index 1008ef09ee..9b6969f60f 100644
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@@ -65,7 +65,7 @@
 #include "orte/runtime/orte_wait.h"
 #include "orte/runtime/orte_quit.h"
 #include "orte/mca/errmgr/errmgr.h"
-#include "orte/mca/rmaps/rmaps.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/mca/state/state.h"
 #include "orte/orted/orted.h"
 
@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem
+     * is the MPI processes launched on the head node (where the
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect. So warn the user and set
+     * the envar for no_schedule_local if mpirun is not on a
+     * system management node (i.e. is part of the allocation)
+     * and the "no_use_local" flag hasn't been set */
+    if (mca_plm_slurm_component.slurm_warning_msg &&
+        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
+        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
+        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        mca_plm_slurm_component.slurm_warning_msg = false;  // only do this once
+    }
+#endif
+
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched */
diff --git a/orte/util/nidmap.c b/orte/util/nidmap.c
index c2f9abae2a..51ea46141a 100644
--- a/orte/util/nidmap.c
+++ b/orte/util/nidmap.c
@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
 
     /* handle the topologies - as the most common case by far
      * is to have homogeneous topologies, we only send them
-     * if something is different */
-    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
-        ui8 = 2;
-    } else {
-        ui8 = 1;
+     * if something is different. We know that the HNP is
+     * the first topology, and that any differing topology
+     * on the compute nodes must follow. So send the topologies
+     * if and only if:
+     *
+     * (a) the HNP is being used to house application procs and
+     *     there is more than one topology on our list; or
+     *
+     * (b) the HNP is not being used, but there are more than
+     *     two topologies on our list, thus indicating that
+     *     there are multiple topologies on the compute nodes
+     */
+    if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+        /* remove the first topo on the list */
+        item = opal_list_remove_first(&topos);
+        OBJ_RELEASE(item);
     }
     tmp = NULL;
-    if (ui8 < opal_list_get_size(&topos)) {
+    if (1 < opal_list_get_size(&topos)) {
         opal_buffer_t bucket, *bptr;
         OBJ_CONSTRUCT(&bucket, opal_buffer_t);
         while (NULL != (item = opal_list_remove_first(&topos))) {
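
For reference, the topology-send rule introduced by the nidmap.c hunk above can be
sketched as a small standalone C program. This is an illustrative sketch, not ORTE
code: send_topologies(), its parameters, and the sample values in main() are
hypothetical stand-ins for orte_hnp_is_allocated, the ORTE_MAPPING_NO_USE_LOCAL
mapping directive, and opal_list_get_size(&topos). It prints 0, 1, 0 for the three
sample cases.

/* Sketch (not ORTE code) of the decision implemented in nidmap.c above:
 * the HNP topology is always first on the list, so topologies only need
 * to be sent when the compute nodes contribute a differing topology. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* hnp_is_allocated and no_use_local stand in for orte_hnp_is_allocated and
 * the ORTE_MAPPING_NO_USE_LOCAL directive; num_topos stands in for
 * opal_list_get_size(&topos). */
static bool send_topologies(bool hnp_is_allocated, bool no_use_local,
                            size_t num_topos)
{
    if ((!hnp_is_allocated || no_use_local) && num_topos > 0) {
        /* the HNP will not host application procs, so drop its topology
         * (the first list entry), mirroring opal_list_remove_first() */
        num_topos -= 1;
    }
    /* cases (a) and (b) from the comment reduce to the same test once the
     * unused HNP topology is gone: send only if more than one remains */
    return 1 < num_topos;
}

int main(void)
{
    /* HNP hosts procs, all nodes identical: nothing to send */
    printf("%d\n", send_topologies(true, false, 1));
    /* HNP hosts procs, compute nodes differ: send */
    printf("%d\n", send_topologies(true, false, 2));
    /* HNP excluded (the Cray SLURM no-local case), compute nodes identical:
     * only the HNP topology differs, so nothing to send */
    printf("%d\n", send_topologies(false, true, 2));
    return 0;
}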