diff --git a/config/orte_check_slurm.m4 b/config/orte_check_slurm.m4
index b59e5f5804..ee5cd02cce 100644
--- a/config/orte_check_slurm.m4
+++ b/config/orte_check_slurm.m4
@@ -13,6 +13,7 @@
 # Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2016      Los Alamos National Security, LLC. All rights
 #                         reserved.
+# Copyright (c) 2017      Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[
                            [orte_check_slurm_happy="yes"],
                            [orte_check_slurm_happy="no"])])

+        # check to see if this is a Cray nativized slurm env.
+
+        slurm_cray_env=0
+        OPAL_CHECK_ALPS([orte_slurm_cray],
+                        [slurm_cray_env=1])
+
+        AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
+                           [defined to 1 if slurm cray env, 0 otherwise])
+
         OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy])
     fi

diff --git a/orte/mca/plm/slurm/configure.m4 b/orte/mca/plm/slurm/configure.m4
index 6aabe47710..fa7267e531 100644
--- a/orte/mca/plm/slurm/configure.m4
+++ b/orte/mca/plm/slurm/configure.m4
@@ -13,6 +13,7 @@
 # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2011-2016 Los Alamos National Security, LLC.
 #                         All rights reserved.
+# Copyright (c) 2017      Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -38,12 +39,4 @@ AC_DEFUN([MCA_orte_plm_slurm_CONFIG],[
     AC_SUBST([plm_slurm_LDFLAGS])
     AC_SUBST([plm_slurm_LIBS])

-    # check to see if this is a Cray nativized slurm env.
-
-    slurm_cray_env=0
-    OPAL_CHECK_ALPS([plm_slurm_cray],
-                    [slurm_cray_env=1])
-
-    AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
-                       [defined to 1 if slurm cray env, 0 otherwise])
 ])dnl
diff --git a/orte/mca/plm/slurm/help-plm-slurm.txt b/orte/mca/plm/slurm/help-plm-slurm.txt
index 837c3e88a8..9cc5af5b44 100644
--- a/orte/mca/plm/slurm/help-plm-slurm.txt
+++ b/orte/mca/plm/slurm/help-plm-slurm.txt
@@ -49,18 +49,3 @@ are running.

 Please consult with your system administrator about obtaining
 such support.
-[no-local-support]
-The SLURM process starter cannot start processes local to
-mpirun when executing under a Cray environment. The problem
-is that mpirun is not itself a child of a slurmd daemon. Thus,
-any processes mpirun itself starts will inherit incorrect
-RDMA credentials.
-
-Your application will be mapped and run (assuming adequate
-resources) on the remaining allocated nodes. If adequate
-resources are not available, you will need to exit and obtain
-a larger allocation.
-
-This situation will be fixed in a future release. Meantime,
-you can turn "off" this warning by setting the plm_slurm_warning
-MCA param to 0.
diff --git a/orte/mca/plm/slurm/plm_slurm_module.c b/orte/mca/plm/slurm/plm_slurm_module.c
index fc62b057f3..4c5e7e1167 100644
--- a/orte/mca/plm/slurm/plm_slurm_module.c
+++ b/orte/mca/plm/slurm/plm_slurm_module.c
@@ -193,25 +193,6 @@ static void launch_daemons(int fd, short args, void *cbdata)
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

-#if SLURM_CRAY_ENV
-    /* if we are in a Cray-SLURM environment, then we cannot
-     * launch procs local to the HNP. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect. So warn the user and set
-     * the envar for no_schedule_local if mpirun is not on a
-     * system management node (i.e. is part of the allocation)
-     * and the "no_use_local" flag hasn't been set */
-    if (mca_plm_slurm_component.slurm_warning_msg &&
-        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
-        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
-        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
-        mca_plm_slurm_component.slurm_warning_msg = false;  // only do this once
-    }
-#endif
-
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched */
diff --git a/orte/mca/ras/base/ras_base_node.c b/orte/mca/ras/base/ras_base_node.c
index ae11c44db5..5fd3b3dda2 100644
--- a/orte/mca/ras/base/ras_base_node.c
+++ b/orte/mca/ras/base/ras_base_node.c
@@ -11,7 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  *                         reserved.
- * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2015      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * $COPYRIGHT$
@@ -30,6 +30,7 @@
 #include "opal/util/if.h"

 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/util/name_fns.h"
 #include "orte/runtime/orte_globals.h"

@@ -46,7 +47,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
     int rc, i;
     orte_node_t *node, *hnp_node, *nptr;
     char *ptr;
-    bool hnp_alone = true;
+    bool hnp_alone = true, skiphnp = false;
     orte_attribute_t *kv;
     char **alias=NULL, **nalias;

@@ -77,6 +78,33 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)

     /* get the hnp node's info */
     hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem
+     * is the MPI processes launched on the head node (where the
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect. Test for this condition. If
+     * found, then take steps to ensure we launch a daemon on
+     * the same node as mpirun and that it gets used to fork
+     * local procs instead of mpirun so they get the proper
+     * credential */
+    if (NULL != hnp_node) {
+        OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
+            if (orte_ifislocal(node->name)) {
+                orte_hnp_is_allocated = true;
+                break;
+            }
+        }
+        if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+            hnp_node->name = strdup("mpirun");
+            skiphnp = true;
+            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        }
+    }
+#endif
+
     /* cycle through the list */
     while (NULL != (item = opal_list_remove_first(nodes))) {
@@ -86,7 +114,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
          * first position since it is the first one entered. We need to check to see
          * if this node is the same as the HNP's node so we don't double-enter it
          */
-        if (NULL != hnp_node && orte_ifislocal(node->name)) {
+        if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) {
            OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                                 "%s ras:base:node_insert updating HNP [%s] info to %ld slots",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -189,7 +217,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
      * ensure we don't have any domain info in the node record
      * for the hnp
      */
-    if (!orte_have_fqdn_allocation && !hnp_alone) {
+    if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
         if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
             *ptr = '\0';
         }
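
Reviewer note: the sketch below restates, outside of ORTE, the decision that the relocated SLURM_CRAY_ENV block in orte_ras_base_node_insert() now makes at allocation time. It is a minimal, self-contained illustration under assumed stand-ins, not project code: is_local_node(), the static alloc[] list, and the no_use_local flag are hypothetical substitutes for orte_ifislocal(), the opal_list_t of orte_node_t, and the ORTE_MAPPING_NO_USE_LOCAL directive.

/* Illustrative sketch only: mirrors the skip-the-HNP decision added to
 * orte_ras_base_node_insert() for Cray-native SLURM builds.  All names
 * below are stand-ins, not ORTE APIs. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define SLURM_CRAY_ENV 1   /* normally 0/1 via AC_DEFINE_UNQUOTED in orte_check_slurm.m4 */

/* stand-in for orte_ifislocal(): here we just compare against the HNP's hostname */
static bool is_local_node(const char *node, const char *hnp_name)
{
    return 0 == strcmp(node, hnp_name);
}

int main(void)
{
    const char *hnp_name = "nid00042";                 /* node hosting mpirun (hypothetical) */
    const char *alloc[]  = { "nid00042", "nid00043" }; /* nodes reported by the allocation */
    bool no_use_local = false;   /* analogue of ORTE_MAPPING_NO_USE_LOCAL already being set */
    bool hnp_is_allocated = false;
    bool skiphnp = false;
    size_t i;

#if SLURM_CRAY_ENV
    /* If mpirun's node is part of the allocation and the user has not already
     * excluded it, keep a separate node entry so a slurmd-launched daemon forks
     * the local ranks (giving them valid RDMA credentials) instead of mpirun. */
    for (i = 0; i < sizeof(alloc) / sizeof(alloc[0]); i++) {
        if (is_local_node(alloc[i], hnp_name)) {
            hnp_is_allocated = true;
            break;
        }
    }
    if (hnp_is_allocated && !no_use_local) {
        skiphnp = true;       /* don't fold this node into the HNP's own entry */
        no_use_local = true;  /* and stop mapping application procs onto mpirun itself */
    }
#endif

    printf("hnp_is_allocated=%d skiphnp=%d no_use_local=%d\n",
           hnp_is_allocated, skiphnp, no_use_local);
    return 0;
}

The design point the patch appears to make: the detection moves from the plm/slurm launcher (where the old code could only warn) into the RAS node-insert path, so the allocation itself is shaped to place a daemon on mpirun's node rather than relying on a user-visible warning and the removed "no-local-support" help message.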