
Enable full operations under SLURM on Cray systems by co-locating a daemon with mpirun when mpirun is executing on a compute node in that environment. This allows local application procs to inherit their security credentials from the daemon, as it will have been launched via SLURM.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
Ralph Castain 2017-05-06 19:08:50 -07:00
parent 88948f752f
commit a143800bce
5 changed files with 43 additions and 46 deletions
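A condensed, hypothetical C sketch of the decision described in the commit message above; the flags below are illustrative stand-ins for SLURM_CRAY_ENV, orte_hnp_is_allocated, and ORTE_MAPPING_NO_USE_LOCAL rather than the real ORTE symbols, which appear in the diffs that follow.

/* Hypothetical sketch only: condensed from the logic added in this commit. */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    bool is_cray_slurm = true;          /* stand-in for SLURM_CRAY_ENV */
    bool mpirun_on_compute_node = true; /* stand-in for orte_hnp_is_allocated */
    bool no_use_local = false;          /* stand-in for ORTE_MAPPING_NO_USE_LOCAL */

    if (is_cray_slurm && mpirun_on_compute_node && !no_use_local) {
        /* co-locate an orted with mpirun; local procs are forked by that
         * daemon, which was launched via SLURM, so they inherit SLURM's
         * security credentials */
        puts("launch a daemon on mpirun's node and fork local procs from it");
    } else {
        puts("mpirun forks its local procs directly, as before");
    }
    return 0;
}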

View File

@@ -13,6 +13,7 @@
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[
[orte_check_slurm_happy="yes"],
[orte_check_slurm_happy="no"])])
# check to see if this is a Cray nativized slurm env.
slurm_cray_env=0
OPAL_CHECK_ALPS([orte_slurm_cray],
[slurm_cray_env=1])
AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
[defined to 1 if slurm cray env, 0 otherwise])
OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy])
fi
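Because the ALPS test now runs inside the framework-level ORTE_CHECK_SLURM macro, the SLURM_CRAY_ENV define (always set, to 0 or 1) is visible outside the plm/slurm component, which is what lets the ras/base change further down guard its Cray-specific path with a plain #if. A minimal stand-alone illustration of that pattern follows; the macro is defined by hand here only because this snippet is compiled outside the ORTE tree.

/* Minimal illustration: in the real build, AC_DEFINE_UNQUOTED sets
 * SLURM_CRAY_ENV from configure; it is defined manually here so the
 * snippet compiles on its own. */
#include <stdio.h>

#ifndef SLURM_CRAY_ENV
#define SLURM_CRAY_ENV 1
#endif

int main(void)
{
#if SLURM_CRAY_ENV
    puts("built for a Cray-nativized SLURM environment");
#else
    puts("built for a plain SLURM environment");
#endif
    return 0;
}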

View File

@@ -13,6 +13,7 @@
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2016 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -38,12 +39,4 @@ AC_DEFUN([MCA_orte_plm_slurm_CONFIG],[
AC_SUBST([plm_slurm_LDFLAGS])
AC_SUBST([plm_slurm_LIBS])
# check to see if this is a Cray nativized slurm env.
slurm_cray_env=0
OPAL_CHECK_ALPS([plm_slurm_cray],
[slurm_cray_env=1])
AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
[defined to 1 if slurm cray env, 0 otherwise])
])dnl

View File

@@ -49,18 +49,3 @@ are running.
Please consult with your system administrator about obtaining
such support.
[no-local-support]
The SLURM process starter cannot start processes local to
mpirun when executing under a Cray environment. The problem
is that mpirun is not itself a child of a slurmd daemon. Thus,
any processes mpirun itself starts will inherit incorrect
RDMA credentials.
Your application will be mapped and run (assuming adequate
resources) on the remaining allocated nodes. If adequate
resources are not available, you will need to exit and obtain
a larger allocation.
This situation will be fixed in a future release. Meantime,
you can turn "off" this warning by setting the plm_slurm_warning
MCA param to 0.

View File

@@ -193,25 +193,6 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:slurm: LAUNCH DAEMONS CALLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. So warn the user and set
* the envar for no_schedule_local if mpirun is not on a
* system management node (i.e. is part of the allocation)
* and the "no_use_local" flag hasn't been set */
if (mca_plm_slurm_component.slurm_warning_msg &&
(orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
orte_show_help("help-plm-slurm.txt", "no-local-support", true);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
mca_plm_slurm_component.slurm_warning_msg = false; // only do this once
}
#endif
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/

View File

@@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@@ -30,6 +30,7 @@
#include "opal/util/if.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
@@ -46,7 +47,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
int rc, i;
orte_node_t *node, *hnp_node, *nptr;
char *ptr;
bool hnp_alone = true;
bool hnp_alone = true, skiphnp = false;
orte_attribute_t *kv;
char **alias=NULL, **nalias;
@@ -77,6 +78,33 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
/* get the hnp node's info */
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. Test for this condition. If
* found, then take steps to ensure we launch a daemon on
* the same node as mpirun and that it gets used to fork
* local procs instead of mpirun so they get the proper
* credential */
if (NULL != hnp_node) {
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
if (orte_ifislocal(node->name)) {
orte_hnp_is_allocated = true;
break;
}
}
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
hnp_node->name = strdup("mpirun");
skiphnp = true;
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
}
}
#endif
/* cycle through the list */
while (NULL != (item = opal_list_remove_first(nodes))) {
@@ -86,7 +114,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* first position since it is the first one entered. We need to check to see
* if this node is the same as the HNP's node so we don't double-enter it
*/
if (NULL != hnp_node && orte_ifislocal(node->name)) {
if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:node_insert updating HNP [%s] info to %ld slots",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -189,7 +217,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* ensure we don't have any domain info in the node record
* for the hnp
*/
if (!orte_have_fqdn_allocation && !hnp_alone) {
if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
*ptr = '\0';
}