SLURM: launch all processes via slurmd
It turns out that the approach of having the HNP do the fork/exec of MPI ranks on the head node in a SLURM environment introduces problems when users/sysadmins want to use the SLURM scancel tool or the sbatch --signal option to signal a job. This commit disables use of the HNP fork/exec procedure when a job is launched into a SLURM-controlled allocation. Update NEWS with a blurb about the new ras framework MCA parameter. Related to #3998. Signed-off-by: Howard Pritchard <hppritcha@gmail.com>
Этот коммит содержится в:
родитель
a7a30424cb
Коммит
d08be74573
5
NEWS
5
NEWS
@ -65,6 +65,11 @@ Master (not on release branches yet)
|
|||||||
via --enable-mpi-cxx.
|
via --enable-mpi-cxx.
|
||||||
- Removed embedded VampirTrace. It is in maintenance mode since 2013.
|
- Removed embedded VampirTrace. It is in maintenance mode since 2013.
|
||||||
Please consider Score-P (score-p.org) as an external replacement.
|
Please consider Score-P (score-p.org) as an external replacement.
|
||||||
|
- Add an MCA parameter ras_base_launch_orted_on_hn to allow for launching
|
||||||
|
MPI processes on the same node where mpirun is executing using a separate
|
||||||
|
orte daemon, rather than the mpirun process. This may be useful to set to
|
||||||
|
true when using SLURM, as it improves interoperability with SLURM's signal
|
||||||
|
propagation tools. By default it is set to false, except for Cray XC systems.
|
||||||
|
|
||||||
3.0.0 -- July, 2017
|
3.0.0 -- July, 2017
|
||||||
-------------------
|
-------------------
|
||||||
|
@ -51,6 +51,7 @@ typedef struct orte_ras_base_t {
|
|||||||
orte_ras_base_module_t *active_module;
|
orte_ras_base_module_t *active_module;
|
||||||
int total_slots_alloc;
|
int total_slots_alloc;
|
||||||
int multiplier;
|
int multiplier;
|
||||||
|
bool launch_orted_on_hn;
|
||||||
} orte_ras_base_t;
|
} orte_ras_base_t;
|
||||||
|
|
||||||
ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;
|
ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;
|
||||||
|
@ -59,6 +59,31 @@ static int ras_register(mca_base_register_flag_t flags)
|
|||||||
NULL, 0, 0,
|
NULL, 0, 0,
|
||||||
OPAL_INFO_LVL_9,
|
OPAL_INFO_LVL_9,
|
||||||
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier);
|
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier);
|
||||||
|
#if SLURM_CRAY_ENV
|
||||||
|
/*
|
||||||
|
* If we are in a Cray-SLURM environment, then we cannot
|
||||||
|
* launch procs local to the HNP. The problem
|
||||||
|
* is the MPI processes launched on the head node (where the
|
||||||
|
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
|
||||||
|
* (mpirun) which is not a child of a slurmd daemon. This
|
||||||
|
* means that any RDMA credentials obtained via the odls/alps
|
||||||
|
* local launcher are incorrect. Test for this condition. If
|
||||||
|
* found, then take steps to ensure we launch a daemon on
|
||||||
|
* the same node as mpirun and that it gets used to fork
|
||||||
|
* local procs instead of mpirun so they get the proper
|
||||||
|
* credential */
|
||||||
|
|
||||||
|
orte_ras_base.launch_orted_on_hn = true;
|
||||||
|
#else
|
||||||
|
orte_ras_base.launch_orted_on_hn = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
mca_base_var_register("orte", "ras", "base", "launch_orted_on_hn",
|
||||||
|
"Launch an orte daemon on the head node",
|
||||||
|
MCA_BASE_VAR_TYPE_BOOL,
|
||||||
|
NULL, 0, 0,
|
||||||
|
OPAL_INFO_LVL_9,
|
||||||
|
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.launch_orted_on_hn);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
* University of Stuttgart. All rights reserved.
|
* University of Stuttgart. All rights reserved.
|
||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||||
* Copyright (c) 2015 Research Organization for Information Science
|
* Copyright (c) 2015 Research Organization for Information Science
|
||||||
@ -78,18 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
|||||||
|
|
||||||
/* get the hnp node's info */
|
/* get the hnp node's info */
|
||||||
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||||
#if SLURM_CRAY_ENV
|
|
||||||
/* if we are in a Cray-SLURM environment, then we cannot
|
if ((orte_ras_base.launch_orted_on_hn == true) &&
|
||||||
* launch procs local to the HNP. The problem
|
(orte_managed_allocation)) {
|
||||||
* is the MPI processes launched on the head node (where the
|
|
||||||
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
|
|
||||||
* (mpirun) which is not a child of a slurmd daemon. This
|
|
||||||
* means that any RDMA credentials obtained via the odls/alps
|
|
||||||
* local launcher are incorrect. Test for this condition. If
|
|
||||||
* found, then take steps to ensure we launch a daemon on
|
|
||||||
* the same node as mpirun and that it gets used to fork
|
|
||||||
* local procs instead of mpirun so they get the proper
|
|
||||||
* credential */
|
|
||||||
if (NULL != hnp_node) {
|
if (NULL != hnp_node) {
|
||||||
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
|
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
|
||||||
if (orte_ifislocal(node->name)) {
|
if (orte_ifislocal(node->name)) {
|
||||||
@ -97,14 +88,14 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
|
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) &
|
||||||
|
ORTE_MAPPING_NO_USE_LOCAL)) {
|
||||||
hnp_node->name = strdup("mpirun");
|
hnp_node->name = strdup("mpirun");
|
||||||
skiphnp = true;
|
skiphnp = true;
|
||||||
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
|
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
}
|
||||||
|
|
||||||
|
|
||||||
/* cycle through the list */
|
/* cycle through the list */
|
||||||
while (NULL != (item = opal_list_remove_first(nodes))) {
|
while (NULL != (item = opal_list_remove_first(nodes))) {
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user