
Merge pull request #3467 from rhc54/topic/slurm

Enable full operations under SLURM on Cray systems
This commit is contained in:
Ralph Castain 2017-05-07 06:38:27 -07:00 committed by GitHub
parent 88948f752f a143800bce
commit ee4ce13e16
5 changed files with 43 additions and 46 deletions

View File

@@ -13,6 +13,7 @@
# Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2016 Los Alamos National Security, LLC. All rights
# reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -68,6 +69,15 @@ AC_DEFUN([ORTE_CHECK_SLURM],[
[orte_check_slurm_happy="yes"],
[orte_check_slurm_happy="no"])])
# check to see if this is a Cray nativized slurm env.
slurm_cray_env=0
OPAL_CHECK_ALPS([orte_slurm_cray],
[slurm_cray_env=1])
AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
[defined to 1 if slurm cray env, 0 otherwise])
OPAL_SUMMARY_ADD([[Resource Managers]],[[Slurm]],[$1],[$orte_check_slurm_happy])
fi
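
The AC_DEFINE_UNQUOTED above turns the ALPS probe into a compile-time constant, so C sources can branch on it with an ordinary preprocessor guard. A minimal sketch of how such a generated define is consumed, assuming a stand-alone translation unit (the function name and messages are illustrative, not part of this commit):

#include <stdio.h>

/* Normally supplied by the generated config header; stubbed here so the
 * sketch compiles on its own. */
#ifndef SLURM_CRAY_ENV
#define SLURM_CRAY_ENV 0
#endif

static void report_slurm_flavor(void)
{
#if SLURM_CRAY_ENV
    /* Cray-nativized SLURM: local launches need ALPS-aware handling */
    printf("running under Cray-nativized SLURM\n");
#else
    printf("running under plain SLURM\n");
#endif
}

int main(void)
{
    report_slurm_flavor();
    return 0;
}

The same guard is what the plm/slurm and ras/base changes below rely on.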

View File

@@ -13,6 +13,7 @@
# Copyright (c) 2009-2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2011-2016 Los Alamos National Security, LLC.
# All rights reserved.
# Copyright (c) 2017 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@@ -38,12 +39,4 @@ AC_DEFUN([MCA_orte_plm_slurm_CONFIG],[
AC_SUBST([plm_slurm_LDFLAGS])
AC_SUBST([plm_slurm_LIBS])
# check to see if this is a Cray nativized slurm env.
slurm_cray_env=0
OPAL_CHECK_ALPS([plm_slurm_cray],
[slurm_cray_env=1])
AC_DEFINE_UNQUOTED([SLURM_CRAY_ENV],[$slurm_cray_env],
[defined to 1 if slurm cray env, 0 otherwise])
])dnl

View File

@@ -49,18 +49,3 @@ are running.
Please consult with your system administrator about obtaining
such support.
[no-local-support]
The SLURM process starter cannot start processes local to
mpirun when executing under a Cray environment. The problem
is that mpirun is not itself a child of a slurmd daemon. Thus,
any processes mpirun itself starts will inherit incorrect
RDMA credentials.
Your application will be mapped and run (assuming adequate
resources) on the remaining allocated nodes. If adequate
resources are not available, you will need to exit and obtain
a larger allocation.
This situation will be fixed in a future release. Meantime,
you can turn "off" this warning by setting the plm_slurm_warning
MCA param to 0.

View File

@@ -193,25 +193,6 @@ static void launch_daemons(int fd, short args, void *cbdata)
"%s plm:slurm: LAUNCH DAEMONS CALLED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. So warn the user and set
* the envar for no_schedule_local if mpirun is not on a
* system management node (i.e. is part of the allocation)
* and the "no_use_local" flag hasn't been set */
if (mca_plm_slurm_component.slurm_warning_msg &&
(orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL))) {
orte_show_help("help-plm-slurm.txt", "no-local-support", true);
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
mca_plm_slurm_component.slurm_warning_msg = false; // only do this once
}
#endif
/* if we are launching debugger daemons, then just go
* do it - no new daemons will be launched
*/

View File

@@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@@ -30,6 +30,7 @@
#include "opal/util/if.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
@@ -46,7 +47,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
int rc, i;
orte_node_t *node, *hnp_node, *nptr;
char *ptr;
bool hnp_alone = true;
bool hnp_alone = true, skiphnp = false;
orte_attribute_t *kv;
char **alias=NULL, **nalias;
@@ -77,6 +78,33 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
/* get the hnp node's info */
hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
#if SLURM_CRAY_ENV
/* if we are in a Cray-SLURM environment, then we cannot
* launch procs local to the HNP. The problem
* is the MPI processes launched on the head node (where the
* ORTE_PROC_IS_HNP evaluates to true) get launched by a daemon
* (mpirun) which is not a child of a slurmd daemon. This
* means that any RDMA credentials obtained via the odls/alps
* local launcher are incorrect. Test for this condition. If
* found, then take steps to ensure we launch a daemon on
* the same node as mpirun and that it gets used to fork
* local procs instead of mpirun so they get the proper
* credential */
if (NULL != hnp_node) {
OPAL_LIST_FOREACH(node, nodes, orte_node_t) {
if (orte_ifislocal(node->name)) {
orte_hnp_is_allocated = true;
break;
}
}
if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping) & ORTE_MAPPING_NO_USE_LOCAL)) {
hnp_node->name = strdup("mpirun");
skiphnp = true;
ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_USE_LOCAL);
}
}
#endif
/* cycle through the list */
while (NULL != (item = opal_list_remove_first(nodes))) {
@@ -86,7 +114,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* first position since it is the first one entered. We need to check to see
* if this node is the same as the HNP's node so we don't double-enter it
*/
if (NULL != hnp_node && orte_ifislocal(node->name)) {
if (!skiphnp && NULL != hnp_node && orte_ifislocal(node->name)) {
OPAL_OUTPUT_VERBOSE((5, orte_ras_base_framework.framework_output,
"%s ras:base:node_insert updating HNP [%s] info to %ld slots",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -189,7 +217,7 @@ int orte_ras_base_node_insert(opal_list_t* nodes, orte_job_t *jdata)
* ensure we don't have any domain info in the node record
* for the hnp
*/
if (!orte_have_fqdn_allocation && !hnp_alone) {
if (NULL != hnp_node && !orte_have_fqdn_allocation && !hnp_alone) {
if (NULL != (ptr = strchr(hnp_node->name, '.'))) {
*ptr = '\0';
}
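
A minimal, self-contained sketch of the domain-stripping step guarded above, assuming an ordinary heap-allocated hostname (the example name is illustrative, not from the commit):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    /* HNP-style node name that may carry a domain suffix */
    char *name = strdup("login01.cluster.example.org");
    char *ptr;

    /* same truncation as above: cut the name at the first '.' */
    if (NULL != (ptr = strchr(name, '.'))) {
        *ptr = '\0';
    }

    printf("%s\n", name);   /* prints "login01" */
    free(name);
    return 0;
}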