1
1

SLURM: launch all processes via slurmd

It turns out that the approach of having the HNP do the
fork/exec of MPI ranks on the head node in a SLURM environment
introduces problems when users/sysadmins want to use the SLURM
scancel tool or sbatch --signal option to signal a job.

This commit disables use of the HNP fork/exec procedure when
a job is launched into a SLURM controlled allocation.

Update NEWS with a blurb about the new ras framework MCA parameter.

related to #3998

Signed-off-by: Howard Pritchard <hppritcha@gmail.com>
Этот коммит содержится в:
Howard Pritchard 2017-08-01 03:04:02 -06:00
родитель a7a30424cb
Коммит d08be74573
4 изменённых файлов: 47 добавлений и 25 удалений

5
NEWS
Просмотреть файл

@ -65,6 +65,11 @@ Master (not on release branches yet)
via --enable-mpi-cxx. via --enable-mpi-cxx.
- Removed embedded VampirTrace. It is in maintenance mode since 2013. - Removed embedded VampirTrace. It is in maintenance mode since 2013.
Please consider Score-P (score-p.org) as an external replacement. Please consider Score-P (score-p.org) as an external replacement.
- Add an MCA parameter ras_base_launch_orted_on_hn to allow for launching
MPI processes on the same node where mpirun is executing using a separate
orte daemon, rather than the mpirun process. This may be useful to set to
true when using SLURM, as it improves interoperability with SLURM's signal
propagation tools. By default it is set to false, except for Cray XC systems.
3.0.0 -- July, 2017 3.0.0 -- July, 2017
------------------- -------------------

Просмотреть файл

@ -51,6 +51,7 @@ typedef struct orte_ras_base_t {
orte_ras_base_module_t *active_module; orte_ras_base_module_t *active_module;
int total_slots_alloc; int total_slots_alloc;
int multiplier; int multiplier;
bool launch_orted_on_hn;
} orte_ras_base_t; } orte_ras_base_t;
ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base; ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;

Просмотреть файл

@ -59,6 +59,31 @@ static int ras_register(mca_base_register_flag_t flags)
NULL, 0, 0, NULL, 0, 0,
OPAL_INFO_LVL_9, OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier); MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier);
#if SLURM_CRAY_ENV
/*
* If we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. Test for this condition. If
* found, then take steps to ensure we launch a daemon on
* the same node as mpirun and that it gets used to fork
* local procs instead of mpirun so they get the proper
* credential */
orte_ras_base.launch_orted_on_hn = true;
#else
orte_ras_base.launch_orted_on_hn = false;
#endif
mca_base_var_register("orte", "ras", "base", "launch_orted_on_hn",
"Launch an orte daemon on the head node",
MCA_BASE_VAR_TYPE_BOOL,
NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.launch_orted_on_hn);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }

Просмотреть файл

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved. * University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved. * All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights * Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science * Copyright (c) 2015 Research Organization for Information Science
@ -78,18 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
/* get the hnp node's info */ /* get the hnp node's info */
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0); hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot if ((orte_ras_base.launch_orted_on_hn == true) &&
* launch procs local to the HNP. The problem (orte_managed_allocation)) {
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. Test for this condition. If
* found, then take steps to ensure we launch a daemon on
* the same node as mpirun and that it gets used to fork
* local procs instead of mpirun so they get the proper
* credential */
if (NULL != hnp_node) { if (NULL != hnp_node) {
OPAL_LIST_FOREACH(node, nodes, orte_node_t) { OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
if (orte_ifislocal(node->name)) { if (orte_ifislocal(node->name)) {
@ -97,14 +88,14 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
break; break;
} }
} }
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) { if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) &
ORTE_MAPPING_NO_USE_LOCAL)) {
hnp_node->name = strdup("mpirun"); hnp_node->name = strdup("mpirun");
skiphnp = true; skiphnp = true;
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL); ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
} }
} }
#endif }
/* cycle through the list */ /* cycle through the list */
while (NULL != (item = opal_list_remove_first(nodes))) { while (NULL != (item = opal_list_remove_first(nodes))) {