
Enable slurm operations on Cray with constraints

Clean up some errors in the nidmap code that caused us to send unnecessary topologies

Signed-off-by: Ralph Castain <rhc@open-mpi.org>
This commit is contained in:
Ralph Castain 2017-04-05 17:32:39 -07:00
parent bf668ad1e9
commit a29ca2bb0d
9 changed files with 73 additions and 32 deletions

View file

@@ -10,6 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -39,7 +40,3 @@ the map for this application. This can be caused by a lack of
an allocation, or by an error in the Open MPI code. Please check
to ensure you have a ALPS allocation. If you do, then please pass
the error to the Open MPI user's mailing list for assistance.
#
[slurm-not-supported]
mpirun is not a supported launcher on Cray XC using Native SLURM.
srun must be used to launch jobs on these systems.

View file

@@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -46,7 +47,6 @@ ORTE_MODULE_DECLSPEC extern orte_plm_alps_component_t
mca_plm_alps_component;
ORTE_DECLSPEC extern orte_plm_base_module_t
orte_plm_alps_module;
extern bool mca_plm_alps_using_aprun;
END_C_DECLS

View file

@@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -43,7 +44,6 @@
*/
const char *mca_plm_alps_component_version_string =
"Open MPI alps plm MCA component version " ORTE_VERSION;
bool mca_plm_alps_using_aprun = {true};
/*
@@ -158,7 +158,11 @@ static int orte_plm_alps_component_query(mca_base_module_t **module, int *priori
}
if((NULL != wlm_detected) && !strcmp(slurm, wlm_detected)) {
mca_plm_alps_using_aprun = false;
/* we are in a Cray SLURM environment, so we don't want
* this plm component */
*priority = 0;
*module = NULL;
return ORTE_ERROR;
}
#endif
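For readers unfamiliar with the MCA selection convention used in the hunk above, here is a minimal sketch (not part of this commit; the "example_" names, the stub predicate, and the priority value are illustrative). A component that does not want to be selected reports a NULL module, a zero priority, and an error return:

/* illustrative stand-in for the wlm_detect-based check shown above */
static bool example_under_cray_native_slurm(void)
{
    return false;   /* placeholder result */
}

static int example_component_query(mca_base_module_t **module, int *priority)
{
    if (example_under_cray_native_slurm()) {
        /* decline selection: no module, zero priority, error return */
        *priority = 0;
        *module = NULL;
        return ORTE_ERROR;
    }
    /* otherwise offer the module as usual */
    *priority = 10;   /* illustrative value */
    *module = (mca_base_module_t *) &orte_plm_alps_module;
    return ORTE_SUCCESS;
}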

View file

@@ -121,23 +121,6 @@ static int plm_alps_init(void)
return rc;
}
/*
* owing to way the SLURM PLM component works, we can't use
* it on Cray XC systems as currently designed. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect.
*
* So for now, we just don't support mpirun launched jobs
* on Cray XC systems using Native SLURM.
*/
if (false == mca_plm_alps_using_aprun) {
orte_show_help("help-plm-alps.txt", "slurm-not-supported", true);
exit(-1);
}
if (orte_do_not_launch) {
/* must map daemons since we won't be launching them */
orte_plm_globals.daemon_nodes_assigned_at_launch = true;

View file

@@ -10,7 +10,7 @@
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2014 Intel, Inc. All rights reserved.
# Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -49,3 +49,18 @@ are running.
Please consult with your system administrator about obtaining
such support.
[no-local-support]
The SLURM process starter cannot start processes local to
mpirun when executing under a Cray environment. The problem
is that mpirun is not itself a child of a slurmd daemon. Thus,
any processes mpirun itself starts will inherit incorrect
RDMA credentials.
Your application will be mapped and run (assuming adequate
resources) on the remaining allocated nodes. If adequate
resources are not available, you will need to exit and obtain
a larger allocation.
This situation will be fixed in a future release. Meantime,
you can turn "off" this warning by setting the plm_slurm_warning
MCA param to 0.
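A usage note, assuming the standard Open MPI command-line syntax for setting MCA parameters (the application name below is a placeholder): the warning described above would be suppressed with

    mpirun --mca plm_slurm_warning 0 ./my_app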

View file

@@ -9,6 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -29,6 +30,7 @@ BEGIN_C_DECLS
struct orte_plm_slurm_component_t {
orte_plm_base_component_t super;
char *custom_args;
bool slurm_warning_msg;
};
typedef struct orte_plm_slurm_component_t orte_plm_slurm_component_t;

View file

@@ -12,6 +12,7 @@
* All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2017 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -28,7 +29,9 @@
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/util/opal_environ.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/plm/plm.h"
@@ -99,6 +102,13 @@ static int plm_slurm_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_plm_slurm_component.custom_args);
mca_plm_slurm_component.slurm_warning_msg = true;
(void) mca_base_component_var_register (comp, "warning", "Turn off warning message",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY,
&mca_plm_slurm_component.slurm_warning_msg);
return ORTE_SUCCESS;
}

View file

@@ -65,7 +65,7 @@
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_quit.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/state/state.h"
#include "orte/orted/orted.h"
@@ -193,6 +193,25 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:slurm: LAUNCH DAEMONS CALLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. So warn the user and set
* the envar for no_schedule_local if mpirun is not on a
* system management node (i.e. is part of the allocation)
* and the "no_use_local" flag hasn't been set */
if (mca_plm_slurm_component.slurm_warning_msg &&
(orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
orte_show_help("help-plm-slurm.txt", "no-local-support", true);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
mca_plm_slurm_component.slurm_warning_msg = false; // only do this once
}
#endif
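The GET/SET mapping-directive macros used in this hunk operate on bit flags carried in orte_rmaps_base.mapping. A simplified stand-in follows (the flag value, types, and names are illustrative, not the actual ORTE definitions), just to show the pattern of testing and setting the "no use local" directive:

#include <stdbool.h>
#include <stdint.h>

#define EX_MAPPING_NO_USE_LOCAL 0x0100   /* hypothetical bit value */

/* test whether the "do not place procs on the local node" directive is set */
static bool ex_no_use_local(uint16_t mapping)
{
    return (mapping & EX_MAPPING_NO_USE_LOCAL) != 0;
}

/* set the directive so the mappers skip the node mpirun is running on */
static void ex_set_no_use_local(uint16_t *mapping)
{
    *mapping |= EX_MAPPING_NO_USE_LOCAL;
}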
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/

View file

@@ -618,14 +618,25 @@ int orte_util_encode_nodemap(opal_buffer_t *buffer)
/* handle the topologies - as the most common case by far
* is to have homogeneous topologies, we only send them
* if something is different */
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
ui8 = 2;
} else {
ui8 = 1;
* if something is different. We know that the HNP is
* the first topology, and that any differing topology
* on the compute nodes must follow. So send the topologies
* if and only if:
*
* (a) the HNP is being used to house application procs and
* there is more than one topology on our list; or
*
* (b) the HNP is not being used, but there are more than
* two topologies on our list, thus indicating that
* there are multiple topologies on the compute nodes
*/
if (!orte_hnp_is_allocated || (ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
/* remove the first topo on the list */
item = opal_list_remove_first(&topos);
OBJ_RELEASE(item);
}
tmp = NULL;
if (ui8 < opal_list_get_size(&topos)) {
if (1 < opal_list_get_size(&topos)) {
opal_buffer_t bucket, *bptr;
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
while (NULL != (item = opal_list_remove_first(&topos))) {
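To make the (a)/(b) rule in the new comment concrete, here is a condensed sketch of the decision being encoded (not part of this commit; the helper name and the boolean parameter are illustrative). The list is assumed to hold one entry per distinct topology, with the HNP's topology first:

/* illustrative helper - not the actual nidmap code */
static bool example_need_to_send_topos(opal_list_t *topos, bool hnp_hosts_procs)
{
    if (!hnp_hosts_procs) {
        /* the HNP will not host application procs, so its topology
         * is irrelevant - drop it from consideration (case (b)) */
        opal_list_item_t *item = opal_list_remove_first(topos);
        OBJ_RELEASE(item);
    }
    /* if more than one topology remains, the compute nodes are not
     * homogeneous and the topologies must be sent to the daemons */
    return 1 < opal_list_get_size(topos);
}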