Enable slurm operations on Cray with constraints

Cleanup some errors in the nidmap code that caused us to send
unnecessary topologies

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
parent bf668ad1e9
commit a29ca2bb0d
@@ -10,6 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
+# Copyright (c) 2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
 an allocation, or by an error in the Open MPI code. Please check
 to ensure you have a ALPS allocation. If you do, then please pass
 the error to the Open MPI user's mailing list for assistance.
-#
-[slurm-not-supported]
-mpirun is not a supported launcher on Cray XC using Native SLURM.
-srun must be used to launch jobs on these systems.
@@ -9,6 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
     mca_plm_alps_component;
 ORTE_DECLSPEC extern orte_plm_base_module_t
     orte_plm_alps_module;
-extern bool mca_plm_alps_using_aprun;

 END_C_DECLS

@@ -12,6 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
  *                    reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -43,7 +44,6 @@
  */
 const char *mca_plm_alps_component_version_string =
     "Open MPI alps plm MCA component version " ORTE_VERSION;
-bool mca_plm_alps_using_aprun = {true};


 /*
@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priority
     }

     if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
-        mca_plm_alps_using_aprun = false;
+        /* we are in a Cray SLURM environment, so we don't want
+         * this plm component */
+        *priority = 0;
+        *module = NULL;
+        return ORTE_ERROR;
     }
 #endif

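Editor's note on the hunk above: the alps PLM now steps aside under Cray's native SLURM rather than tracking an aprun flag. Below is a minimal compilable sketch of the MCA selection contract this relies on, with stand-in types and a hypothetical in_cray_slurm_env flag (not the real ORTE headers): a component declines selection by zeroing the priority, handing back no module, and returning an error, so the framework falls through to another component (here, the slurm PLM).

    #include <stdbool.h>
    #include <stddef.h>

    /* stand-ins for the ORTE/MCA types and return codes used below */
    typedef struct { int dummy; } mca_base_module_t;
    enum { ORTE_SUCCESS = 0, ORTE_ERROR = -1 };

    static bool in_cray_slurm_env;           /* would come from wlm_detect */
    static mca_base_module_t example_module;

    static int example_component_query(mca_base_module_t **module, int *priority)
    {
        if (in_cray_slurm_env) {
            /* decline selection: no module, zero priority, error return */
            *priority = 0;
            *module = NULL;
            return ORTE_ERROR;
        }
        *priority = 75;                      /* illustrative value only */
        *module = &example_module;
        return ORTE_SUCCESS;
    }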
@@ -121,23 +121,6 @@ static int plm_alps_init(void)
         return rc;
     }

-    /*
-     * owing to way the SLURM PLM component works, we can't use
-     * it on Cray XC systems as currently designed. The problem
-     * is the MPI processes launched on the head node (where the
-     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
-     * (mpirun) which is not a child of a slurmd daemon. This
-     * means that any RDMA credentials obtained via the odls/alps
-     * local launcher are incorrect.
-     *
-     * So for now, we just don't support mpirun launched jobs
-     * on Cray XC systems using Native SLURM.
-     */
-    if (false == mca_plm_alps_using_aprun) {
-        orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
-        exit(-1);
-    }
-
     if (orte_do_not_launch) {
         /* must map daemons since we won't be launching them */
         orte_plm_globals.daemon_nodes_assigned_at_launch = true;
@@ -10,7 +10,7 @@
 # University of Stuttgart. All rights reserved.
 # Copyright (c) 2004-2005 The Regents of the University of California.
 # All rights reserved.
-# Copyright (c) 2014      Intel, Inc. All rights reserved.
+# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -49,3 +49,18 @@ are running.

 Please consult with your system administrator about obtaining
 such support.
+[no-local-support]
+The SLURM process starter cannot start processes local to
+mpirun when executing under a Cray environment. The problem
+is that mpirun is not itself a child of a slurmd daemon. Thus,
+any processes mpirun itself starts will inherit incorrect
+RDMA credentials.
+
+Your application will be mapped and run (assuming adequate
+resources) on the remaining allocated nodes. If adequate
+resources are not available, you will need to exit and obtain
+a larger allocation.
+
+This situation will be fixed in a future release. Meantime,
+you can turn "off" this warning by setting the plm_slurm_warning
+MCA param to 0.
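For completeness, an editor's usage sketch (not part of the commit): plm_slurm_warning is set like any other MCA parameter, either on the mpirun command line or through Open MPI's standard environment-variable form; the application name and process count below are placeholders.

    mpirun --mca plm_slurm_warning 0 -n 16 ./my_app
    # or, equivalently:
    export OMPI_MCA_plm_slurm_warning=0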
@@ -9,6 +9,7 @@
  * University of Stuttgart. All rights reserved.
  * Copyright (c) 2004-2005 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -29,6 +30,7 @@ BEGIN_C_DECLS
 struct orte_plm_slurm_component_t {
     orte_plm_base_component_t super;
     char *custom_args;
+    bool slurm_warning_msg;
 };
 typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;

@@ -12,6 +12,7 @@
  * All rights reserved.
  * Copyright (c) 2015 Los Alamos National Security, LLC. All rights
  *                    reserved.
+ * Copyright (c) 2017 Intel, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -28,7 +29,9 @@
 #include "orte_config.h"
 #include "orte/constants.h"

+#include "opal/util/opal_environ.h"
 #include "orte/util/name_fns.h"
+#include "orte/util/show_help.h"
 #include "orte/runtime/orte_globals.h"

 #include "orte/mca/plm/plm.h"
@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_plm_slurm_component.custom_args);

+    mca_plm_slurm_component.slurm_warning_msg = true;
+    (void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
+                                            MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
+                                            OPAL_INFO_LVL_9,
+                                            MCA_BASE_VAR_SCOPE_READONLY,
+                                            &mca_plm_slurm_component.slurm_warning_msg);
+
     return ORTE_SUCCESS;
 }

@@ -65,7 +65,7 @@
 #include "orte/runtime/orte_wait.h"
 #include "orte/runtime/orte_quit.h"
 #include "orte/mca/errmgr/errmgr.h"
-#include "orte/mca/rmaps/rmaps.h"
+#include "orte/mca/rmaps/base/base.h"
 #include "orte/mca/state/state.h"

 #include "orte/orted/orted.h"
@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
                          "%s plm:slurm: LAUNCH DAEMONS CALLED",
                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

+#if SLURM_CRAY_ENV
+    /* if we are in a Cray-SLURM environment, then we cannot
+     * launch procs local to the HNP. The problem
+     * is the MPI processes launched on the head node (where the
+     * ORTE_PROC_IS_HNP evalues to true) get launched by a daemon
+     * (mpirun) which is not a child of a slurmd daemon. This
+     * means that any RDMA credentials obtained via the odls/alps
+     * local launcher are incorrect. So warn the user and set
+     * the envar for no_schedule_local if mpirun is not on a
+     * system management node (i.e. is part of the allocation)
+     * and the "no_use_local" flag hasn't been set */
+    if (mca_plm_slurm_component.slurm_warning_msg &&
+        (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
+        orte_show_help("help-plm-slurm.txt", "no-local-support", true);
+        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
+        mca_plm_slurm_component.slurm_warning_msg = false;  // only do this once
+    }
+#endif
+
     /* if we are launching debugger daemons, then just go
      * do it - no new daemons will be launched
      */
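Editor's note, an assumption based on the directive used above rather than text from the commit: ORTE_MAPPING_NO_USE_LOCAL is the same mapping directive that mpirun's --nolocal option requests up front, so a launch such as

    mpirun --nolocal -n 16 ./my_app

should sidestep both the bad local RDMA credentials and the warning entirely (my_app and the process count are placeholders).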
@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)

     /* handle the topologies - as the most common case by far
      * is to have homogeneous topologies, we only send them
-     * if something is different */
-    if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
-        ui8 = 2;
-    } else {
-        ui8 = 1;
+     * if something is different. We know that the HNP is
+     * the first topology, and that any differing topology
+     * on the compute nodes must follow. So send the topologies
+     * if and only if:
+     *
+     * (a) the HNP is being used to house application procs and
+     *     there is more than one topology on our list; or
+     *
+     * (b) the HNP is not being used, but there are more than
+     *     two topologies on our list, thus indicating that
+     *     there are multiple topologies on the compute nodes
+     */
+    if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
+        /* remove the first topo on the list */
+        item = opal_list_remove_first(&topos);
+        OBJ_RELEASE(item);
     }
     tmp = NULL;
-    if (ui8 < opal_list_get_size(&topos)) {
+    if (1 < opal_list_get_size(&topos)) {
         opal_buffer_t bucket, *bptr;
         OBJ_CONSTRUCT(&bucket, opal_buffer_t);
         while (NULL != (item = opal_list_remove_first(&topos))) {
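The new nidmap decision can be restated in isolation. Below is a minimal sketch with hypothetical names (must_send_topologies, hnp_hosts_procs — not code from the commit) of the test that replaces the old ui8 counter: because the HNP's topology always sits first on the list, dropping it when the HNP hosts no procs reduces both cases (a) and (b) to the same "more than one left" check the commit uses.

    #include <stdbool.h>
    #include <stddef.h>

    /* Decide whether node topologies must accompany the nodemap.
     * num_topos is the length of the topology list, whose first
     * entry is always the HNP's own topology. */
    static bool must_send_topologies(bool hnp_hosts_procs, size_t num_topos)
    {
        if (hnp_hosts_procs) {
            /* case (a): procs may land on the HNP itself, so any
             * second topology on the list already matters */
            return num_topos > 1;
        }
        /* case (b): the HNP hosts no procs, so its leading entry is
         * dropped before sending; topologies are needed only if the
         * compute nodes still differ among themselves */
        return num_topos > 2;
    }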