Enable full operations under SLURM on Cray systems by co-locating a daemon with mpirun when mpirun is executing on a compute node in that environment. This allows local application procs to inherit their security credentials from the daemon, since the daemon itself will have been launched via SLURM.
Signed-off-by: Ralph Castain <rhc@open-mpi.org>

commit a143800bce (parent 88948f752f)
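
Background for the change below: the fix relies on ordinary process inheritance. A proc forked by a daemon that slurmd launched sees whatever that daemon was given, while a proc forked directly by an mpirun that was not started by slurmd does not. The following standalone sketch is not Open MPI code; the environment variable name is made up, and it assumes, purely for illustration, that a credential travels with the parent's environment:

    /* Minimal sketch: a forked child inherits its parent's environment,
     * so anything the launcher handed the parent (faked here with a
     * hypothetical EXAMPLE_CREDENTIAL variable) is visible to the child. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        setenv("EXAMPLE_CREDENTIAL", "granted-to-parent", 1);
        pid_t pid = fork();
        if (0 == pid) {
            /* child: reads the value it inherited at fork time */
            printf("child sees: %s\n", getenv("EXAMPLE_CREDENTIAL"));
            _exit(0);
        }
        waitpid(pid, NULL, 0);
        return 0;
    }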
@@ -13,6 +13,7 @@
 # Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2016 Los Alamos National Security, LLC. All rights
 # reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow

@@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[
 [orte_check_slurm_happy="yes"],
 [orte_check_slurm_happy="no"])])
+
+# check to see if this is a Cray nativized slurm env.
+
+slurm_cray_env=0
+OPAL_CHECK_ALPS([orte_slurm_cray],
+                [slurm_cray_env=1])
+
+AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
+                   [defined to 1 if slurm cray env, 0 otherwise])
 
 OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy])
 fi
 
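The AC_DEFINE_UNQUOTED above turns the configure-time probe into a compile-time constant. A minimal sketch of how such a flag is consumed in C follows; the generated config header is not shown, and the fallback define exists only to keep the sketch self-contained:

    /* Sketch only: SLURM_CRAY_ENV normally comes from the generated
     * config header; the fallback below just lets this compile alone. */
    #ifndef SLURM_CRAY_ENV
    #define SLURM_CRAY_ENV 0
    #endif

    #include <stdio.h>

    int main(void)
    {
    #if SLURM_CRAY_ENV
        puts("Cray-nativized SLURM build: co-locate a daemon with mpirun");
    #else
        puts("conventional SLURM build: mpirun may fork local procs itself");
    #endif
        return 0;
    }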
@@ -13,6 +13,7 @@
 # Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
 # Copyright (c) 2011-2016 Los Alamos National Security, LLC.
 # All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow

@@ -38,12 +39,4 @@ AC_DEFUN([MCA_orte_plm_slurm_CONFIG],[
 AC_SUBST([plm_slurm_LDFLAGS])
 AC_SUBST([plm_slurm_LIBS])
 
-# check to see if this is a Cray nativized slurm env.
-
-slurm_cray_env=0
-OPAL_CHECK_ALPS([plm_slurm_cray],
-                [slurm_cray_env=1])
-
-AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
-                   [defined to 1 if slurm cray env, 0 otherwise])
 ])dnl

@@ -49,18 +49,3 @@ are running.
 
 Please consult with your system administrator about obtaining
 such support.
-[no-local-support]
-The SLURM process starter cannot start processes local to
-mpirun when executing under a Cray environment. The problem
-is that mpirun is not itself a child of a slurmd daemon. Thus,
-any processes mpirun itself starts will inherit incorrect
-RDMA credentials.
-
-Your application will be mapped and run (assuming adequate
-resources) on the remaining allocated nodes. If adequate
-resources are not available, you will need to exit and obtain
-a larger allocation.
-
-This situation will be fixed in a future release. Meantime,
-you can turn "off" this warning by setting the plm_slurm_warning
-MCA param to 0.

@@ -193,25 +193,6 @@ static void launch_daemons(int fd, short args, void *cbdata)
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 
-#if SLURM_CRAY_ENV
-    /* if we are in a Cray-SLURM environment, then we cannot
-     * launch procs local to the HNP. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect. So warn the user and set
-     * the envar for no_schedule_local if mpirun is not on a
-     * system management node (i.e. is part of the allocation)
-     * and the "no_use_local" flag hasn't been set */
-    if (mca_plm_slurm_component.slurm_warning_msg &&
-        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
-        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
-        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
-        mca_plm_slurm_component.slurm_warning_msg = false; // only do this once
-    }
-#endif
-
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */

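The block removed above gated its warning on a mapping-directive bit. For readers unfamiliar with the ORTE_GET_MAPPING_DIRECTIVE / ORTE_SET_MAPPING_DIRECTIVE macros, here is a simplified stand-alone sketch of the bitmask pattern they express; the names and bit value below are hypothetical, not the real ORTE definitions:

    #include <stdint.h>
    #include <stdio.h>

    /* hypothetical stand-in for ORTE_MAPPING_NO_USE_LOCAL */
    #define MAPPING_NO_USE_LOCAL (1u << 0)

    int main(void)
    {
        uint16_t mapping = 0;

        /* "get": test whether the directive is already set */
        if (!(mapping & MAPPING_NO_USE_LOCAL)) {
            /* "set": exclude mpirun's own node from mapping */
            mapping |= MAPPING_NO_USE_LOCAL;
        }
        printf("mapping directives: 0x%x\n", (unsigned)mapping);
        return 0;
    }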
@@ -11,7 +11,7 @@
  * All rights reserved.
  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
  * reserved.
- * Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
+ * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
  * Copyright (c) 2015 Research Organization for Information Science
  * and Technology (RIST). All rights reserved.
  * $COPYRIGHT$

@@ -30,6 +30,7 @@
 #include "opal/util/if.h"
 
 #include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/util/name_fns.h"
 #include "orte/runtime/orte_globals.h"
 
@@ -46,7 +47,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
     int rc, i;
     orte_node_t *node, *hnp_node, *nptr;
     char *ptr;
-    bool hnp_alone = true;
+    bool hnp_alone = true, skiphnp = false;
     orte_attribute_t *kv;
     char **alias=NULL, **nalias;
 
@@ -77,6 +78,33 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
 
     /* get the hnp node's info */
     hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem
+     * is the MPI processes launched on the head node (where the
+     * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect. Test for this condition. If
+     * found, then take steps to ensure we launch a daemon on
+     * the same node as mpirun and that it gets used to fork
+     * local procs instead of mpirun so they get the proper
+     * credential */
+    if (NULL != hnp_node) {
+        OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
+            if (orte_ifislocal(node->name)) {
+                orte_hnp_is_allocated = true;
+                break;
+            }
+        }
+        if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+            hnp_node->name = strdup("mpirun");
+            skiphnp = true;
+            ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        }
+    }
+#endif
+
 
     /* cycle through the list */
     while (NULL != (item = opal_list_remove_first(nodes))) {

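The heart of the change is the detection logic added above: scan the allocation for mpirun's own host and, if it is present and local use has not been disallowed, arrange for a separate daemon on that node. A condensed sketch of that control flow, with the ORTE types and globals replaced by plain C stand-ins (all names below are hypothetical, not the ORTE API), might look like:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct node { const char *name; struct node *next; };

    /* stand-in for orte_ifislocal(): is this hostname mpirun's own node? */
    static bool is_local(const char *name, const char *mpirun_host)
    {
        return 0 == strcmp(name, mpirun_host);
    }

    /* return true when a daemon should be co-located with mpirun: the host
     * running mpirun appears in the allocation and local launch by mpirun
     * itself has not been explicitly disallowed */
    static bool colocate_daemon(const struct node *allocation,
                                const char *mpirun_host, bool no_use_local)
    {
        for (const struct node *n = allocation; NULL != n; n = n->next) {
            if (is_local(n->name, mpirun_host)) {
                return !no_use_local;      /* hnp node is part of the allocation */
            }
        }
        return false;  /* mpirun sits outside the allocation, e.g. a login node */
    }

    int main(void)
    {
        struct node n2 = { "nid00002", NULL };
        struct node n1 = { "nid00001", &n2 };
        printf("co-locate: %s\n",
               colocate_daemon(&n1, "nid00001", false) ? "yes" : "no");
        return 0;
    }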
@@ -86,7 +114,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
      * first position since it is the first one entered. We need to check to see
      * if this node is the same as the HNP's node so we don't double-enter it
      */
-    if (NULL != hnp_node && orte_ifislocal(node->name)) {
+    if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) {
         OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
                              "%s ras:base:node_insert updating HNP [%s] info to %ld slots",
                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),

@@ -189,7 +217,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
      * ensure we don't have any domain info in the node record
      * for the hnp
      */
-    if (!orte_have_fqdn_allocation && !hnp_alone) {
+    if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
         if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
             *ptr = '\0';
         }