SLURM: launch all processes via slurmd
It turns out that the approach of having the HNP do the fork/exec of MPI ranks on the head node in a SLURM environment introduces problems when users/sysadmins want to use the SLURM scancel tool or sbatch --signal option to signal a job. This commit disables use of the HNP fork/exec procedure when a job is launched into a SLURM controlled allocation. update NEWS with a blurb about new ras framework mca parameter. related to #3998 Signed-off-by: Howard Pritchard <hppritcha@gmail.com>
Этот коммит содержится в:
родитель
a7a30424cb
Коммит
d08be74573
5
NEWS
5
NEWS
@ -65,6 +65,11 @@ Master (not on release branches yet)
|
||||
via --enable-mpi-cxx.
|
||||
- Removed embedded VampirTrace. It is in maintenance mode since 2013.
|
||||
Please consider Score-P (score-p.org) as an external replacement.
|
||||
- Add an mca parameter ras_base_launch_orted_on_hn to allow for launching
|
||||
MPI processes on the same node where mpirun is executing using a separate
|
||||
orte daemon, rather than the mpirun process. This may be useful to set to
|
||||
true when using SLURM, as it improves interoperability with SLURM's signal
|
||||
propagation tools. By default it is set to false, except for Cray XC systems.
|
||||
|
||||
3.0.0 -- July, 2017
|
||||
-------------------
|
||||
|
@ -51,6 +51,7 @@ typedef struct orte_ras_base_t {
|
||||
orte_ras_base_module_t *active_module;
|
||||
int total_slots_alloc;
|
||||
int multiplier;
|
||||
bool launch_orted_on_hn;
|
||||
} orte_ras_base_t;
|
||||
|
||||
ORTE_DECLSPEC extern orte_ras_base_t orte_ras_base;
|
||||
|
@ -59,6 +59,31 @@ static int ras_register(mca_base_register_flag_t flags)
|
||||
NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.multiplier);
|
||||
#if SLURM_CRAY_ENV
|
||||
/*
|
||||
* If we are in a Cray-SLURM environment, then we cannot
|
||||
* launch procs local to the HNP. The problem
|
||||
* is the MPI processes launched on the head node (where the
|
||||
 * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
|
||||
* (mpirun) which is not a child of a slurmd daemon. This
|
||||
* means that any RDMA credentials obtained via the odls/alps
|
||||
* local launcher are incorrect. Test for this condition. If
|
||||
* found, then take steps to ensure we launch a daemon on
|
||||
* the same node as mpirun and that it gets used to fork
|
||||
* local procs instead of mpirun so they get the proper
|
||||
* credential */
|
||||
|
||||
orte_ras_base.launch_orted_on_hn = true;
|
||||
#else
|
||||
orte_ras_base.launch_orted_on_hn = false;
|
||||
#endif
|
||||
|
||||
mca_base_var_register("orte", "ras", "base", "launch_orted_on_hn",
|
||||
"Launch an orte daemon on the head node",
|
||||
MCA_BASE_VAR_TYPE_BOOL,
|
||||
NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &orte_ras_base.launch_orted_on_hn);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,7 @@
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
|
||||
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Research Organization for Information Science
|
||||
@ -78,18 +78,9 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
|
||||
/* get the hnp node's info */
|
||||
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
|
||||
#if SLURM_CRAY_ENV
|
||||
/* if we are in a Cray-SLURM environment, then we cannot
|
||||
* launch procs local to the HNP. The problem
|
||||
* is the MPI processes launched on the head node (where the
|
||||
 * ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
|
||||
* (mpirun) which is not a child of a slurmd daemon. This
|
||||
* means that any RDMA credentials obtained via the odls/alps
|
||||
* local launcher are incorrect. Test for this condition. If
|
||||
* found, then take steps to ensure we launch a daemon on
|
||||
* the same node as mpirun and that it gets used to fork
|
||||
* local procs instead of mpirun so they get the proper
|
||||
* credential */
|
||||
|
||||
if ((orte_ras_base.launch_orted_on_hn == true) &&
|
||||
(orte_managed_allocation)) {
|
||||
if (NULL != hnp_node) {
|
||||
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
|
||||
if (orte_ifislocal(node->name)) {
|
||||
@ -97,14 +88,14 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
|
||||
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) &
|
||||
ORTE_MAPPING_NO_USE_LOCAL)) {
|
||||
hnp_node->name = strdup("mpirun");
|
||||
skiphnp = true;
|
||||
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/* cycle through the list */
|
||||
while (NULL != (item = opal_list_remove_first(nodes))) {
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user