Enable slurm operations on Cray with constraints
Clean up some errors in the nidmap code that caused us to send unnecessary topologies.

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
parent bf668ad1e9
commit a29ca2bb0d
@@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
an allocation, or by an error in the Open MPI code. Please check
to ensure you have a ALPS allocation. If you do, then please pass
the error to the Open MPI user's mailing list for assistance.
#
[slurm-not-supported]
mpirun is not a supported launcher on Cray XC using Native SLURM.
srun must be used to launch jobs on these systems.
@@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

@@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
mca_plm_alps_component;
ORTE_DECLSPEC extern orte_plm_base_module_t
orte_plm_alps_module;
extern bool mca_plm_alps_using_aprun;

END_C_DECLS
@@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

@@ -43,7 +44,6 @@
*/
const char *mca_plm_alps_component_version_string =
"Open MPI alps plm MCA component version " ORTE_VERSION;
bool mca_plm_alps_using_aprun = {true};


/*

@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori
}

if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
mca_plm_alps_using_aprun = false;
/* we are in a Cray SLURM environment, so we don't want
* this plm component */
*priority = 0;
*module = NULL;
return ORTE_ERROR;
}
#endif
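
The hunk above follows the usual MCA selection pattern: a component that detects it cannot run reports a NULL module and zero priority from its query hook, so the PLM framework picks a different component (here, the slurm PLM) instead of alps. A minimal, self-contained sketch of that pattern, assuming a hypothetical wlm_is_native_slurm() detector in place of the Cray wlm_detect check shown in the diff:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define SKETCH_SUCCESS  0
    #define SKETCH_ERROR   -1

    typedef struct { const char *name; } sketch_module_t;

    static sketch_module_t sketch_alps_module = { "alps" };

    /* Hypothetical stand-in for the Cray wlm_detect check in the diff:
     * the sketch simply consults an environment variable. */
    static int wlm_is_native_slurm(void)
    {
        const char *wlm = getenv("SKETCH_WLM");
        return (NULL != wlm) && (0 == strcmp(wlm, "SLURM"));
    }

    /* Query hook: opt out (zero priority, no module, error return) when
     * native SLURM is managing the nodes, so another PLM gets selected. */
    static int sketch_component_query(sketch_module_t **module, int *priority)
    {
        if (wlm_is_native_slurm()) {
            *priority = 0;
            *module = NULL;
            return SKETCH_ERROR;
        }
        *priority = 10;                 /* arbitrary positive priority */
        *module = &sketch_alps_module;
        return SKETCH_SUCCESS;
    }

    int main(void)
    {
        sketch_module_t *mod = NULL;
        int pri = -1;
        int rc = sketch_component_query(&mod, &pri);
        printf("rc=%d priority=%d module=%s\n", rc, pri, mod ? mod->name : "(none)");
        return 0;
    }

With the alps component declining selection on native-SLURM Cray systems, the slurm PLM (modified in the hunks below) handles the launch instead.
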
@@ -121,23 +121,6 @@ static int plm_alps_init(void)
return rc;
}

/*
* owing to way the SLURM PLM component works, we can't use
* it on Cray XC systems as currently designed. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect.
*
* So for now, we just don't support mpirun launched jobs
* on Cray XC systems using Native SLURM.
*/
if (false == mca_plm_alps_using_aprun) {
orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
exit(-1);
}

if (orte_do_not_launch) {
/* must map daemons since we won't be launching them */
orte_plm_globals.daemon_nodes_assigned_at_launch = true;
@@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow

@@ -49,3 +49,18 @@ are running.

Please consult with your system administrator about obtaining
such support.
[no-local-support]
The SLURM process starter cannot start processes local to
mpirun when executing under a Cray environment. The problem
is that mpirun is not itself a child of a slurmd daemon. Thus,
any processes mpirun itself starts will inherit incorrect
RDMA credentials.

Your application will be mapped and run (assuming adequate
resources) on the remaining allocated nodes. If adequate
resources are not available, you will need to exit and obtain
a larger allocation.

This situation will be fixed in a future release. Meantime,
you can turn "off" this warning by setting the plm_slurm_warning
MCA param to 0.
@@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

@@ -29,6 +30,7 @@ BEGIN_C_DECLS
struct orte_plm_slurm_component_t {
orte_plm_base_component_t super;
char *custom_args;
bool slurm_warning_msg;
};
typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;
@@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow

@@ -28,7 +29,9 @@
#include "orte_config.h"
#include "orte/constants.h"

#include "opal/util/opal_environ.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"

#include "orte/mca/plm/plm.h"

@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_plm_slurm_component.custom_args);

mca_plm_slurm_component.slurm_warning_msg = true;
(void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_plm_slurm_component.slurm_warning_msg);

return ORTE_SUCCESS;
}
@@ -65,7 +65,7 @@
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_quit.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/state/state.h"

#include "orte/orted/orted.h"

@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:slurm: LAUNCH DAEMONS CALLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. So warn the user and set
* the envar for no_schedule_local if mpirun is not on a
* system management node (i.e. is part of the allocation)
* and the "no_use_local" flag hasn't been set */
if (mca_plm_slurm_component.slurm_warning_msg &&
(orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
orte_show_help("help-plm-slurm.txt", "no-local-support", true);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
mca_plm_slurm_component.slurm_warning_msg = false; // only do this once
}
#endif

/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/
@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)

/* handle the topologies - as the most common case by far
* is to have homogeneous topologies, we only send them
* if something is different */
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
ui8 = 2;
} else {
ui8 = 1;
* if something is different. We know that the HNP is
* the first topology, and that any differing topology
* on the compute nodes must follow. So send the topologies
* if and only if:
*
* (a) the HNP is being used to house application procs and
* there is more than one topology on our list; or
*
* (b) the HNP is not being used, but there are more than
* two topologies on our list, thus indicating that
* there are multiple topologies on the compute nodes
*/
if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
/* remove the first topo on the list */
item = opal_list_remove_first(&topos);
OBJ_RELEASE(item);
}
tmp = NULL;
if (ui8 < opal_list_get_size(&topos)) {
if (1 < opal_list_get_size(&topos)) {
opal_buffer_t bucket, *bptr;
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
while (NULL != (item = opal_list_remove_first(&topos))) {
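
The new send-topologies test above can be read as a single predicate: drop the HNP's topology from consideration when the HNP hosts no application procs, then send only if more than one topology remains. A minimal sketch of that reasoning (not the ORTE code; hnp_hosts_procs and num_topos are illustrative stand-ins for the mapping-directive check and the opal_list length used in the hunk):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Sketch of the "send topologies?" decision described in the comment
     * above: the HNP topology is always first on the list, so cases (a)
     * and (b) both reduce to "more than one topology remains once the
     * irrelevant entry is dropped". */
    static bool send_topologies(bool hnp_hosts_procs, size_t num_topos)
    {
        if (!hnp_hosts_procs && num_topos > 0) {
            /* HNP topology is irrelevant; drop it, mirroring the
             * opal_list_remove_first() call in the hunk above */
            --num_topos;
        }
        return num_topos > 1;
    }

    int main(void)
    {
        /* homogeneous compute nodes, HNP not used: 2 topos listed, none sent */
        printf("%d\n", send_topologies(false, 2));   /* 0 */
        /* heterogeneous compute nodes, HNP not used: 3 topos -> send */
        printf("%d\n", send_topologies(false, 3));   /* 1 */
        /* HNP hosts procs and differs from the compute nodes: 2 -> send */
        printf("%d\n", send_topologies(true, 2));    /* 1 */
        return 0;
    }

This is what keeps the topology payload out of the launch message on homogeneous clusters, which is the "unnecessary topologies" cleanup named in the commit message.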