Now that we have PMI support, this is no longer needed
This commit was SVN r26254.
Этот коммит содержится в:
родитель
71f9e69c62
Коммит
a34be856aa
@ -1,47 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Help text shipped with this component
dist_pkgdata_DATA = help-ess-slurmd.txt

# Files that make up the component, whichever way it is built
sources = \
        ess_slurmd.h \
        ess_slurmd_component.c \
        ess_slurmd_module.c

# The library is built in this directory.  A DSO build produces
# mca_ess_slurmd.la (installed); a static build produces
# libmca_ess_slurmd.la (not installed).  Exactly one of the two
# variables below is non-empty.

if MCA_BUILD_orte_ess_slurmd_DSO
component_noinst =
component_install = mca_ess_slurmd.la
else
component_noinst = libmca_ess_slurmd.la
component_install =
endif

# DSO flavour
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_ess_slurmd_la_SOURCES = $(sources)
mca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS)
mca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS)

# Static flavour
noinst_LTLIBRARIES = $(component_noinst)
libmca_ess_slurmd_la_SOURCES = $(sources)
libmca_ess_slurmd_la_LDFLAGS = -module -avoid-version $(ess_slurmd_LDFLAGS)
libmca_ess_slurmd_la_LIBADD = $(ess_slurmd_LIBS)
|
@ -1,44 +0,0 @@
|
||||
# -*- shell-script -*-
|
||||
#
|
||||
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2011 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# Configure-time priority of this component
AC_DEFUN([MCA_orte_ess_slurmd_PRIORITY], [10])

# MCA_orte_ess_slurmd_CONFIG([action-if-found], [action-if-not-found])
# --------------------------------------------------------------------
# Runs the SLURM check with the ess_slurmd prefix.  When the check
# succeeds and orte_without_full_support is 0, the wrapper flags are
# set and action-if-found is run; otherwise action-if-not-found is run.
AC_DEFUN([MCA_orte_ess_slurmd_CONFIG],[
    AC_CONFIG_FILES([orte/mca/ess/slurmd/Makefile])

    ORTE_CHECK_SLURM([ess_slurmd], [ess_slurmd_good=1], [ess_slurmd_good=0])

    # If the check worked, set the wrapper flags.
    # Two separate test invocations joined by the shell are used
    # because the -a operator of test is not portable.
    AS_IF([test "$ess_slurmd_good" = "1" && test "$orte_without_full_support" = 0],
          [ess_slurmd_WRAPPER_EXTRA_LDFLAGS="$ess_slurmd_LDFLAGS"
           ess_slurmd_WRAPPER_EXTRA_LIBS="$ess_slurmd_LIBS"
           $1],
          [$2])

    # set build flags to use in makefile
    AC_SUBST([ess_slurmd_CPPFLAGS])
    AC_SUBST([ess_slurmd_LDFLAGS])
    AC_SUBST([ess_slurmd_LIBS])

])dnl
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef ORTE_ESS_SLURMD_H
|
||||
#define ORTE_ESS_SLURMD_H
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_ess_base_component_t mca_ess_slurmd_component;
|
||||
|
||||
/*
|
||||
* Module open / close
|
||||
*/
|
||||
int orte_ess_slurmd_component_open(void);
|
||||
int orte_ess_slurmd_component_close(void);
|
||||
int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* ORTE_ESS_SLURMD_H */
|
@ -1,99 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
* These symbols are in a file by themselves to provide nice linker
|
||||
* semantics. Since linkers generally pull in symbols by object
|
||||
* files, keeping these symbols as the only symbols in this file
|
||||
* prevents utility programs such as "ompi_info" from having to import
|
||||
* entire components just to query their version and parameters.
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/slurmd/ess_slurmd.h"
|
||||
|
||||
extern orte_ess_base_module_t orte_ess_slurmd_module;
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointers to our public functions in it
|
||||
*/
|
||||
orte_ess_base_component_t mca_ess_slurmd_component = {
|
||||
{
|
||||
ORTE_ESS_BASE_VERSION_2_0_0,
|
||||
|
||||
/* Component name and version */
|
||||
"slurmd",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
orte_ess_slurmd_component_open,
|
||||
orte_ess_slurmd_component_close,
|
||||
orte_ess_slurmd_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
int
|
||||
orte_ess_slurmd_component_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
 * Decide whether this component should be used.
 *
 * The module is offered (priority 30) only when all of the following
 * hold: the process is an MPI proc, both SLURM_JOBID and SLURM_STEPID
 * are present in the environment, and no HNP URI was provided.  That
 * combination means we are inside a SLURM job but were not launched
 * by mpirun.  In every other case the module is withheld, priority
 * is set to -1 and ORTE_ERROR is returned.
 */
int orte_ess_slurmd_component_query(mca_base_module_t **module, int *priority)
{
    /* Reject as soon as any one of the required conditions fails */
    if (!ORTE_PROC_IS_MPI ||
        NULL == getenv("SLURM_JOBID") ||
        NULL == getenv("SLURM_STEPID") ||
        NULL != orte_process_info.my_hnp_uri) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* Direct-launched MPI proc in a SLURM world: offer the module */
    *priority = 30;
    *module = (mca_base_module_t *)&orte_ess_slurmd_module;
    return ORTE_SUCCESS;
}
|
||||
|
||||
|
||||
int
|
||||
orte_ess_slurmd_component_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,730 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
|
||||
* University Research and Technology
|
||||
* Corporation. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
* University of Stuttgart. All rights reserved.
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif /* HAVE_STRING_H */
|
||||
#include <ctype.h>
|
||||
#ifdef HAVE_NETDB_H
|
||||
#include <netdb.h>
|
||||
#endif
|
||||
#ifdef HAVE_IFADDRS_H
|
||||
#include <ifaddrs.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/mca/base/mca_base_param.h"
|
||||
#include "opal/util/argv.h"
|
||||
#include "opal/class/opal_pointer_array.h"
|
||||
#include "opal/mca/hwloc/base/base.h"
|
||||
#include "opal/util/printf.h"
|
||||
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/util/pre_condition_transports.h"
|
||||
#include "orte/util/regex.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/ess/base/base.h"
|
||||
#include "orte/mca/ess/slurmd/ess_slurmd.h"
|
||||
|
||||
static int rte_init(void);
|
||||
static int rte_finalize(void);
|
||||
static void rte_abort(int error_code, bool report);
|
||||
|
||||
orte_ess_base_module_t orte_ess_slurmd_module = {
|
||||
rte_init,
|
||||
rte_finalize,
|
||||
rte_abort,
|
||||
orte_ess_base_proc_get_locality,
|
||||
orte_ess_base_proc_get_daemon,
|
||||
orte_ess_base_proc_get_hostname,
|
||||
orte_ess_base_proc_get_local_rank,
|
||||
orte_ess_base_proc_get_node_rank,
|
||||
orte_ess_base_update_pidmap,
|
||||
orte_ess_base_update_nidmap,
|
||||
NULL /* ft_event */
|
||||
};
|
||||
|
||||
/* Local globals */
/* Set to true by rte_init once setup finished; checked by rte_finalize */
static bool app_init_complete;
/* Set to true by rte_init when SLURM_STEP_RESV_PORTS is present; checked by rte_abort */
static bool slurm20;

/* Local functions: node-list parsing helpers defined later in this file */
static int discover_nodes(char *regexp, char*** names);
static int parse_ranges(char *base, char *ranges, char ***names);
static int parse_range(char *base, char *range, char ***names);
|
||||
|
||||
/**** MODULE FUNCTIONS ****/
|
||||
|
||||
static int rte_init(void)
|
||||
{
|
||||
int ret;
|
||||
char *error = NULL;
|
||||
int32_t jobfam, stepid;
|
||||
char **nodes = NULL;
|
||||
char *envar;
|
||||
int i, j;
|
||||
orte_nid_t *node;
|
||||
orte_jmap_t *jmap;
|
||||
orte_pmap_t *pmap;
|
||||
orte_vpid_t vpid;
|
||||
int local_rank;
|
||||
int nodeid;
|
||||
int num_nodes;
|
||||
int cpus_per_task;
|
||||
char *regexp, *tasks_per_node;
|
||||
int *ppn;
|
||||
bool block=false, cyclic=false;
|
||||
uint64_t unique_key[2];
|
||||
char *cs_env, *string_key;
|
||||
|
||||
/* init flag */
|
||||
app_init_complete = false;
|
||||
slurm20 = false;
|
||||
|
||||
/* run the prolog */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
|
||||
error = "orte_ess_base_std_prolog";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Only application procs can use this module. Since we
|
||||
* were directly launched by srun, we need to bootstrap
|
||||
* our own global info so we can startup. Srun will have
|
||||
* provided that info in our environment, so get it from there
|
||||
*/
|
||||
|
||||
/* declare ourselves to be standalone - i.e., not launched by orted */
|
||||
orte_standalone_operation = true;
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
/* get the topology */
|
||||
if (NULL == opal_hwloc_topology) {
|
||||
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
|
||||
error = "topology discovery";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* get the slurm jobid - this will be our job family */
|
||||
envar = getenv("SLURM_JOBID");
|
||||
/* don't need to check this for NULL - if it was, we would
|
||||
* never have been selected anyway
|
||||
*/
|
||||
jobfam = strtol(envar, NULL, 10);
|
||||
/* get the slurm stepid - this will be our local jobid */
|
||||
if (NULL == (envar = getenv("SLURM_STEPID"))) {
|
||||
error = "could not get SLURM_STEPID";
|
||||
goto error;
|
||||
}
|
||||
/* because the stepid could be zero, and we want the local
|
||||
* jobid to be unique, increment it by one so the system
|
||||
* doesn't think that we are a bunch of daemons!
|
||||
*/
|
||||
stepid = strtol(envar, NULL, 10) + 1;
|
||||
/* now build the jobid */
|
||||
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
|
||||
|
||||
/* setup transport keys in case the MPI layer needs them -
|
||||
* we can use the SLURM jobid and stepid as unique keys
|
||||
* because they are unique values assigned by the RM
|
||||
*/
|
||||
unique_key[0] = (uint64_t)jobfam;
|
||||
unique_key[1] = (uint64_t)stepid;
|
||||
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
asprintf(&envar, "%s=%s", cs_env, string_key);
|
||||
putenv(envar);
|
||||
/* cannot free the envar as that messes of our environ */
|
||||
free(cs_env);
|
||||
free(string_key);
|
||||
|
||||
/* get my local nodeid */
|
||||
if (NULL == (envar = getenv("SLURM_NODEID"))) {
|
||||
error = "could not get SLURM_NODEID";
|
||||
goto error;
|
||||
}
|
||||
nodeid = strtol(envar, NULL, 10);
|
||||
ORTE_PROC_MY_DAEMON->jobid = 0;
|
||||
ORTE_PROC_MY_DAEMON->vpid = nodeid;
|
||||
|
||||
/* get the node list */
|
||||
if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) {
|
||||
error = "could not get SLURM_STEP_NODELIST";
|
||||
goto error;
|
||||
}
|
||||
/* break that down into a list of nodes */
|
||||
if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) {
|
||||
error = "could not parse node list";
|
||||
goto error;
|
||||
}
|
||||
num_nodes = opal_argv_count(nodes);
|
||||
orte_process_info.num_nodes = num_nodes;
|
||||
|
||||
/* setup the nidmap arrays */
|
||||
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_util_nidmap_init";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* set the size of the nidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
|
||||
error = "could not set pointer array size for nidmap";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* get the slurm procid - this will be our vpid */
|
||||
if (NULL == (envar = getenv("SLURM_PROCID"))) {
|
||||
error = "could not get SLURM_PROCID";
|
||||
goto error;
|
||||
}
|
||||
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
|
||||
/* get the number of procs in this job */
|
||||
if (NULL == (envar = getenv("SLURM_STEP_NUM_TASKS"))) {
|
||||
error = "could not get SLURM_STEP_NUM_TASKS";
|
||||
goto error;
|
||||
}
|
||||
orte_process_info.num_procs = strtol(envar, NULL, 10);
|
||||
/* set the app_num so that MPI attributes get set correctly */
|
||||
orte_process_info.app_num = 1;
|
||||
/* if this is SLURM 2.0 or above, get our port
|
||||
* assignments for use in the OOB
|
||||
*/
|
||||
if (NULL != (envar = getenv("SLURM_STEP_RESV_PORTS"))) {
|
||||
/* convert this to an MCA param that will be
|
||||
* picked up by the OOB
|
||||
*/
|
||||
orte_oob_static_ports = strdup(envar);
|
||||
slurm20 = true;
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s using SLURM-reserved ports %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
envar));
|
||||
}
|
||||
|
||||
/* get the number of tasks/node */
|
||||
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
|
||||
error = "could not get SLURM_STEP_TASKS_PER_NODE";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* get the number of CPUs per task that the user provided to slurm */
|
||||
if (NULL != (envar = getenv("SLURM_CPUS_PER_TASK"))) {
|
||||
cpus_per_task = strtol(envar, NULL, 10);
|
||||
if(0 >= cpus_per_task) {
|
||||
error = "got bad value from SLURM_CPUS_PER_TASK";
|
||||
goto error;
|
||||
}
|
||||
} else {
|
||||
cpus_per_task = 1;
|
||||
}
|
||||
|
||||
/* compute the ppn */
|
||||
if (ORTE_SUCCESS != (ret = orte_regex_extract_ppn(num_nodes, tasks_per_node, &ppn))) {
|
||||
error = "could not determine #procs on each node";
|
||||
goto error;
|
||||
}
|
||||
/* for slurm, we have to normalize the ppn by the cpus_per_task */
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
ppn[i] /= cpus_per_task;
|
||||
}
|
||||
|
||||
/* get the distribution (i.e., mapping) mode */
|
||||
if (NULL == (envar = getenv("SLURM_DISTRIBUTION")) ||
|
||||
0 == strcmp(envar, "block")) {
|
||||
/* assume byslot mapping */
|
||||
block = true;
|
||||
} else if (0 == strcmp(envar, "cyclic")) {
|
||||
/* bynode mapping */
|
||||
cyclic = true;
|
||||
} else {
|
||||
/* cannot currently support other mapping modes */
|
||||
error = "distribution/mapping mode not supported";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* construct the nidmap */
|
||||
for (i=0; i < num_nodes; i++) {
|
||||
node = OBJ_NEW(orte_nid_t);
|
||||
node->name = strdup(nodes[i]);
|
||||
node->daemon = i;
|
||||
node->index = i;
|
||||
opal_pointer_array_set_item(&orte_nidmap, i, node);
|
||||
}
|
||||
opal_argv_free(nodes);
|
||||
|
||||
/* create a job map for this job */
|
||||
jmap = OBJ_NEW(orte_jmap_t);
|
||||
jmap->job = ORTE_PROC_MY_NAME->jobid;
|
||||
opal_pointer_array_add(&orte_jobmap, jmap);
|
||||
/* update the num procs */
|
||||
jmap->num_procs = orte_process_info.num_procs;
|
||||
|
||||
/* set the size of the pidmap storage so we minimize realloc's */
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "could not set array size for pidmap";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* construct the pidmap */
|
||||
if (block) {
|
||||
/* for each node, cycle through the ppn */
|
||||
vpid = 0;
|
||||
for (i=0; i < orte_nidmap.size; i++) {
|
||||
if (NULL == (node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
|
||||
continue;
|
||||
}
|
||||
/* compute the vpid for each proc on this node
|
||||
* and add a pmap entry for it
|
||||
*/
|
||||
for (j=0; j < ppn[i]; j++) {
|
||||
pmap = OBJ_NEW(orte_pmap_t);
|
||||
pmap->node = node->index;
|
||||
pmap->local_rank = j;
|
||||
pmap->node_rank = j;
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "could not set pmap values";
|
||||
goto error;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s node %d name %s rank %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int) node->index, node->name, (int)vpid));
|
||||
vpid++;
|
||||
}
|
||||
}
|
||||
} else if (cyclic) {
|
||||
/* cycle across the nodes */
|
||||
vpid = 0;
|
||||
while (vpid < orte_process_info.num_procs) {
|
||||
for (i=0; i < num_nodes && vpid < orte_process_info.num_procs; i++) {
|
||||
if (0 < ppn[i]) {
|
||||
if (NULL == (node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
|
||||
/* this is an error */
|
||||
error = "error initializing process map";
|
||||
goto error;
|
||||
}
|
||||
pmap = OBJ_NEW(orte_pmap_t);
|
||||
pmap->node = node->index;
|
||||
pmap->local_rank = ppn[i]-1;
|
||||
pmap->node_rank = ppn[i]-1;
|
||||
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "could not set pmap values";
|
||||
goto error;
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s node %d name %s rank %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
(int) node->index, node->name, (int)vpid));
|
||||
vpid++;
|
||||
--ppn[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
free(ppn);
|
||||
/* ensure we pick the correct critical components */
|
||||
putenv("OMPI_MCA_grpcomm=hier");
|
||||
putenv("OMPI_MCA_routed=direct");
|
||||
|
||||
/* get our local rank */
|
||||
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
|
||||
error = "could not get SLURM_LOCALID";
|
||||
goto error;
|
||||
}
|
||||
local_rank = strtol(envar, NULL, 10);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s local rank %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
local_rank));
|
||||
|
||||
/* set max procs */
|
||||
if (orte_process_info.max_procs < orte_process_info.num_procs) {
|
||||
orte_process_info.max_procs = orte_process_info.num_procs;
|
||||
}
|
||||
|
||||
/* now use the default procedure to finish my setup */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_ess_base_app_setup";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* flag that we completed init */
|
||||
app_init_complete = true;
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
||||
error:
|
||||
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
|
||||
orte_show_help("help-orte-runtime.txt",
|
||||
"orte_init:startup:internal-failure",
|
||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int rte_finalize(void)
|
||||
{
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
if (app_init_complete) {
|
||||
/* use the default procedure to finish */
|
||||
if (ORTE_SUCCESS != (ret = orte_ess_base_app_finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
}
|
||||
}
|
||||
|
||||
/* remove the envars that we pushed into environ
|
||||
* so we leave that structure intact
|
||||
*/
|
||||
unsetenv("OMPI_MCA_grpcomm");
|
||||
unsetenv("OMPI_MCA_routed");
|
||||
unsetenv("OMPI_MCA_orte_precondition_transports");
|
||||
|
||||
/* deconstruct my nidmap and jobmap arrays - this
|
||||
* function protects itself from being called
|
||||
* before things were initialized
|
||||
*/
|
||||
orte_util_nidmap_finalize();
|
||||
|
||||
#if OPAL_HAVE_HWLOC
|
||||
if (NULL != opal_hwloc_topology) {
|
||||
opal_hwloc_base_free_topology(opal_hwloc_topology);
|
||||
opal_hwloc_topology = NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void rte_abort(int error_code, bool report)
|
||||
{
|
||||
if (ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code && slurm20) {
|
||||
/* exit silently with a special error code for slurm 2.0 */
|
||||
orte_ess_base_app_abort(108, false);
|
||||
} else {
|
||||
orte_ess_base_app_abort(error_code, report);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover the available resources.
|
||||
*
|
||||
* In order to fully support slurm, we need to be able to handle
|
||||
* node regexp/task_per_node strings such as:
|
||||
* foo,bar 5,3
|
||||
* foo 5
|
||||
* foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
|
||||
*
|
||||
* @param *regexp A node regular expression from SLURM (i.e. SLURM_NODELIST)
|
||||
* @param **nodelist argv array to return the found nodes in
|
||||
*/
|
||||
static int discover_nodes(char *regexp, char*** names)
|
||||
{
|
||||
int i, j, len, ret;
|
||||
char *base;
|
||||
char *orig;
|
||||
bool found_range = false;
|
||||
bool more_to_come = false;
|
||||
|
||||
orig = base = strdup(regexp);
|
||||
if (NULL == base) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s ess:slurmd:discover: checking nodelist: %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
regexp));
|
||||
|
||||
do {
|
||||
/* Find the base */
|
||||
len = strlen(base);
|
||||
for (i = 0; i <= len; ++i) {
|
||||
if (base[i] == '[') {
|
||||
/* we found a range. this gets dealt with below */
|
||||
base[i] = '\0';
|
||||
found_range = true;
|
||||
break;
|
||||
}
|
||||
if (base[i] == ',') {
|
||||
/* we found a singleton node, and there are more to come */
|
||||
base[i] = '\0';
|
||||
found_range = false;
|
||||
more_to_come = true;
|
||||
break;
|
||||
}
|
||||
if (base[i] == '\0') {
|
||||
/* we found a singleton node */
|
||||
found_range = false;
|
||||
more_to_come = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(i == 0) {
|
||||
/* we found a special character at the beginning of the string */
|
||||
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(orig);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
if (found_range) {
|
||||
/* If we found a range, now find the end of the range */
|
||||
for (j = i; j < len; ++j) {
|
||||
if (base[j] == ']') {
|
||||
base[j] = '\0';
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (j >= len) {
|
||||
/* we didn't find the end of the range */
|
||||
orte_show_help("help-ess-slurdm.txt", "slurm-env-var-bad-value", 1, regexp);
|
||||
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
|
||||
free(orig);
|
||||
return ORTE_ERR_BAD_PARAM;
|
||||
}
|
||||
|
||||
ret = parse_ranges(base, base + i + 1, names);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
|
||||
ORTE_ERROR_LOG(ret);
|
||||
free(orig);
|
||||
return ret;
|
||||
}
|
||||
if(base[j + 1] == ',') {
|
||||
more_to_come = true;
|
||||
base = &base[j + 2];
|
||||
} else {
|
||||
more_to_come = false;
|
||||
}
|
||||
} else {
|
||||
/* If we didn't find a range, just add the node */
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s ess:slurmd:discover: found node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
base));
|
||||
|
||||
if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
free(orig);
|
||||
return ret;
|
||||
}
|
||||
/* set base equal to the (possible) next base to look at */
|
||||
base = &base[i + 1];
|
||||
}
|
||||
} while(more_to_come);
|
||||
|
||||
free(orig);
|
||||
|
||||
/* All done */
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Parse one or more ranges in a set
|
||||
*
|
||||
* @param base The base text of the node name
|
||||
* @param *ranges A pointer to a range. This can contain multiple ranges
|
||||
* (i.e. "1-3,10" or "5" or "9,0100-0130,250")
|
||||
* @param ***names An argv array to add the newly discovered nodes to
|
||||
*/
|
||||
static int parse_ranges(char *base, char *ranges, char ***names)
|
||||
{
|
||||
int i, len, ret;
|
||||
char *start, *orig;
|
||||
|
||||
/* Look for commas, the separator between ranges */
|
||||
|
||||
len = strlen(ranges);
|
||||
for (orig = start = ranges, i = 0; i < len; ++i) {
|
||||
if (',' == ranges[i]) {
|
||||
ranges[i] = '\0';
|
||||
ret = parse_range(base, start, names);
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
start = ranges + i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Pick up the last range, if it exists */
|
||||
|
||||
if (start < orig + len) {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
|
||||
"%s ess:slurmd:discover: parse range %s (2)",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
start));
|
||||
|
||||
ret = parse_range(base, start, names);
|
||||
if (ORTE_SUCCESS != ret) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Parse a single range in a set and add the full names of the nodes
|
||||
* found to the names argv
|
||||
*
|
||||
* @param base The base text of the node name
|
||||
* @param *ranges A pointer to a single range. (i.e. "1-3" or "5")
|
||||
* @param ***names An argv array to add the newly discovered nodes to
|
||||
*/
|
||||
static int parse_range(char *base, char *range, char ***names)
|
||||
{
|
||||
char *str, temp1[BUFSIZ];
|
||||
size_t i, j, start, end;
|
||||
size_t base_len, len, num_len;
|
||||
size_t str_start, str_end;
|
||||
size_t num_str_len;
|
||||
bool found;
|
||||
int ret;
|
||||
|
||||
len = strlen(range);
|
||||
base_len = strlen(base);
|
||||
/* Silence compiler warnings; start and end are always assigned
|
||||
properly, below */
|
||||
start = end = 0;
|
||||
|
||||
/* Look for the beginning of the first number */
|
||||
|
||||
for (found = false, i = 0; i < len; ++i) {
|
||||
if (isdigit((int) range[i])) {
|
||||
if (!found) {
|
||||
str_start = i;
|
||||
start = atoi(range + i);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Look for the end of the first number */
|
||||
|
||||
for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
|
||||
if (!isdigit((int) range[i])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Was there no range, just a single number? */
|
||||
|
||||
if (i >= len) {
|
||||
str_end = len;
|
||||
end = start;
|
||||
found = true;
|
||||
}
|
||||
|
||||
/* Nope, there was a range. Look for the beginning of the second
|
||||
number */
|
||||
|
||||
else {
|
||||
str_end = i - 1;
|
||||
for (; i < len; ++i) {
|
||||
if (isdigit((int) range[i])) {
|
||||
end = atoi(range + i);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Make strings for all values in the range */
|
||||
|
||||
len = base_len + num_str_len + 32;
|
||||
str = malloc(len);
|
||||
if (NULL == str) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
return ORTE_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
strcpy(str, base);
|
||||
for (i = start; i <= end; ++i) {
|
||||
str[base_len] = '\0';
|
||||
snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);
|
||||
|
||||
/* Do we need zero pading? */
|
||||
|
||||
if ((num_len = strlen(temp1)) < num_str_len) {
|
||||
for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
|
||||
str[j] = '0';
|
||||
}
|
||||
str[j] = '\0';
|
||||
}
|
||||
strcat(str, temp1);
|
||||
ret = opal_argv_append_nosize(names, str);
|
||||
if(ORTE_SUCCESS != ret) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
free(str);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
free(str);
|
||||
|
||||
/* All done */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
# -*- text -*-
|
||||
#
|
||||
# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The University of Tennessee and The University
|
||||
# of Tennessee Research Foundation. All rights
|
||||
# reserved.
|
||||
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English help file for Open MPI MCA error messages.
|
||||
#
|
||||
[slurm-env-var-not-found]
|
||||
While trying to determine what resources are available, ORTE
|
||||
expects to find the following environment variables:
|
||||
|
||||
SLURM_NODELIST
|
||||
SLURM_TASKS_PER_NODE
|
||||
|
||||
However, it was unable to find the following environment variable:
|
||||
|
||||
%s
|
||||
|
||||
#This is a fatal error.
|
||||
[slurm-env-var-bad-value]
|
||||
While trying to determine what nodes are being used, ORTE
|
||||
uses the following environment variable:
|
||||
|
||||
SLURM_STEP_NODELIST value: %s
|
||||
|
||||
However, an error was encountered when trying to parse it
|
||||
|
||||
This is a fatal error.
|
Загрузка…
x
Ссылка в новой задаче
Block a user