1
1
openmpi/orte/mca/ess/slurmd/ess_slurmd_module.c
Ralph Castain 9b59d8de6f This is actually a much smaller commit than it appears at first glance - it just touches a lot of files. The --without-rte-support configuration option has never really been implemented completely. The option caused various objects not to be defined and conditionally compiled some base functions, but did nothing to prevent build of the component libraries. Unfortunately, since many of those components use objects covered by the option, it caused builds to break if those components were allowed to build.
Brian dealt with this in the past by creating platform files and using "no-build" to block the components. This was clunky, but acceptable when only one organization was using that option. However, that number has now expanded to at least two more locations.

Accordingly, make --without-rte-support actually work by adding appropriate configury to prevent components from building when they shouldn't. While doing so, remove two frameworks (db and rmcast) that are no longer used as ORCM comes to a close (besides, they belonged in ORCM now anyway). Do some minor cleanups along the way.

This commit was SVN r25497.
2011-11-22 21:24:35 +00:00

734 lines
23 KiB
C

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include <ctype.h>
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#endif
#include "opal/util/opal_environ.h"
#include "opal/util/output.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/argv.h"
#include "opal/class/opal_pointer_array.h"
#include "opal/mca/hwloc/base/base.h"
#include "opal/util/printf.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/nidmap.h"
#include "orte/util/pre_condition_transports.h"
#include "orte/util/regex.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/ess/base/base.h"
#include "orte/mca/ess/slurmd/ess_slurmd.h"
/* Forward declarations of this module's entry points.  The definitions
 * appear later in this file and are placed into orte_ess_slurmd_module.
 */
static int rte_init(void);
static int rte_finalize(void);
/* rte_abort is declared with the project's noreturn attribute macro */
static void rte_abort(int error_code, bool report) __opal_attribute_noreturn__;
/* Function table exported by this component.  Only the first three
 * entries are implemented in this file; every other populated slot
 * reuses a shared orte_ess_base_* implementation and the final slot
 * is left NULL.  The initializer is positional, so the entry order
 * has to match the member order of orte_ess_base_module_t (declared
 * elsewhere - not visible from this file).
 */
orte_ess_base_module_t orte_ess_slurmd_module = {
rte_init,
rte_finalize,
rte_abort,
orte_ess_base_proc_get_locality,
orte_ess_base_proc_get_daemon,
orte_ess_base_proc_get_hostname,
orte_ess_base_proc_get_local_rank,
orte_ess_base_proc_get_node_rank,
orte_ess_base_proc_get_epoch, /* proc_get_epoch */
orte_ess_base_update_pidmap,
orte_ess_base_update_nidmap,
NULL /* ft_event */
};
/* Local globals */
/* Set to true as the last step of a successful rte_init();
 * rte_finalize() only runs the base application teardown when
 * this flag is set.
 */
static bool app_init_complete;
/* Set in rte_init() when SLURM_STEP_RESV_PORTS is present in the
 * environment; rte_abort() consults it to pick the exit code.
 */
static bool slurm20;
/* Local functions */
/* Expand a SLURM node list expression into an argv array of node names */
static int discover_nodes(char *regexp, char*** nodelist);
/* Split a comma-separated set of ranges and hand each piece to parse_range */
static int parse_ranges(char *base, char *ranges, char ***names);
/* Expand a single "N" or "N-M" range into full node names */
static int parse_range(char *base, char *range, char ***names);
/**** MODULE FUNCTIONS ****/
static int rte_init(void)
{
int ret;
char *error = NULL;
int32_t jobfam, stepid;
char **nodes = NULL;
char *envar;
int i, j;
orte_nid_t *node;
orte_jmap_t *jmap;
orte_pmap_t *pmap;
orte_vpid_t vpid;
int local_rank;
int nodeid;
int num_nodes;
int cpus_per_task;
char *regexp, *tasks_per_node;
int *ppn;
bool block=false, cyclic=false;
uint64_t unique_key[2];
char *cs_env, *string_key;
/* init flag */
app_init_complete = false;
slurm20 = false;
/* run the prolog */
if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
error = "orte_ess_base_std_prolog";
goto error;
}
/* Only application procs can use this module. Since we
* were directly launched by srun, we need to bootstrap
* our own global info so we can startup. Srun will have
* provided that info in our environment, so get it from there
*/
/* declare ourselves to be standalone - i.e., not launched by orted */
orte_standalone_operation = true;
#if OPAL_HAVE_HWLOC
/* get the topology */
if (NULL == opal_hwloc_topology) {
if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
error = "topology discovery";
goto error;
}
}
#endif
/* get the slurm jobid - this will be our job family */
envar = getenv("SLURM_JOBID");
/* don't need to check this for NULL - if it was, we would
* never have been selected anyway
*/
jobfam = strtol(envar, NULL, 10);
/* get the slurm stepid - this will be our local jobid */
if (NULL == (envar = getenv("SLURM_STEPID"))) {
error = "could not get SLURM_STEPID";
goto error;
}
/* because the stepid could be zero, and we want the local
* jobid to be unique, increment it by one so the system
* doesn't think that we are a bunch of daemons!
*/
stepid = strtol(envar, NULL, 10) + 1;
/* now build the jobid */
ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);
/* setup transport keys in case the MPI layer needs them -
* we can use the SLURM jobid and stepid as unique keys
* because they are unique values assigned by the RM
*/
unique_key[0] = (uint64_t)jobfam;
unique_key[1] = (uint64_t)stepid;
if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
asprintf(&envar, "%s=%s", cs_env, string_key);
putenv(envar);
/* cannot free the envar as that messes of our environ */
free(cs_env);
free(string_key);
/* get my local nodeid */
if (NULL == (envar = getenv("SLURM_NODEID"))) {
error = "could not get SLURM_NODEID";
goto error;
}
nodeid = strtol(envar, NULL, 10);
ORTE_PROC_MY_DAEMON->jobid = 0;
ORTE_PROC_MY_DAEMON->vpid = nodeid;
ORTE_EPOCH_SET(ORTE_PROC_MY_DAEMON->epoch,ORTE_PROC_MY_NAME->epoch);
/* get the node list */
if (NULL == (regexp = getenv("SLURM_STEP_NODELIST"))) {
error = "could not get SLURM_STEP_NODELIST";
goto error;
}
/* break that down into a list of nodes */
if (ORTE_SUCCESS != (ret = discover_nodes(regexp, &nodes))) {
error = "could not parse node list";
goto error;
}
num_nodes = opal_argv_count(nodes);
orte_process_info.num_nodes = num_nodes;
/* setup the nidmap arrays */
if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_util_nidmap_init";
goto error;
}
/* set the size of the nidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&orte_nidmap, orte_process_info.num_nodes))) {
error = "could not set pointer array size for nidmap";
goto error;
}
/* get the slurm procid - this will be our vpid */
if (NULL == (envar = getenv("SLURM_PROCID"))) {
error = "could not get SLURM_PROCID";
goto error;
}
ORTE_PROC_MY_NAME->vpid = strtol(envar, NULL, 10);
/* get the number of procs in this job */
if (NULL == (envar = getenv("SLURM_STEP_NUM_TASKS"))) {
error = "could not get SLURM_STEP_NUM_TASKS";
goto error;
}
orte_process_info.num_procs = strtol(envar, NULL, 10);
/* set the app_num so that MPI attributes get set correctly */
orte_process_info.app_num = 1;
/* if this is SLURM 2.0 or above, get our port
* assignments for use in the OOB
*/
if (NULL != (envar = getenv("SLURM_STEP_RESV_PORTS"))) {
/* convert this to an MCA param that will be
* picked up by the OOB
*/
orte_oob_static_ports = strdup(envar);
slurm20 = true;
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s using SLURM-reserved ports %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
envar));
}
/* get the number of tasks/node */
if (NULL == (tasks_per_node = getenv("SLURM_STEP_TASKS_PER_NODE"))) {
error = "could not get SLURM_STEP_TASKS_PER_NODE";
goto error;
}
/* get the number of CPUs per task that the user provided to slurm */
if (NULL != (envar = getenv("SLURM_CPUS_PER_TASK"))) {
cpus_per_task = strtol(envar, NULL, 10);
if(0 >= cpus_per_task) {
error = "got bad value from SLURM_CPUS_PER_TASK";
goto error;
}
} else {
cpus_per_task = 1;
}
/* compute the ppn */
if (ORTE_SUCCESS != (ret = orte_regex_extract_ppn(num_nodes, tasks_per_node, &ppn))) {
error = "could not determine #procs on each node";
goto error;
}
/* for slurm, we have to normalize the ppn by the cpus_per_task */
for (i=0; i < num_nodes; i++) {
ppn[i] /= cpus_per_task;
}
/* get the distribution (i.e., mapping) mode */
if (NULL == (envar = getenv("SLURM_DISTRIBUTION")) ||
0 == strcmp(envar, "block")) {
/* assume byslot mapping */
block = true;
} else if (0 == strcmp(envar, "cyclic")) {
/* bynode mapping */
cyclic = true;
} else {
/* cannot currently support other mapping modes */
error = "distribution/mapping mode not supported";
goto error;
}
/* construct the nidmap */
for (i=0; i < num_nodes; i++) {
node = OBJ_NEW(orte_nid_t);
node->name = strdup(nodes[i]);
node->daemon = i;
node->index = i;
opal_pointer_array_set_item(&orte_nidmap, i, node);
}
opal_argv_free(nodes);
/* create a job map for this job */
jmap = OBJ_NEW(orte_jmap_t);
jmap->job = ORTE_PROC_MY_NAME->jobid;
opal_pointer_array_add(&orte_jobmap, jmap);
/* update the num procs */
jmap->num_procs = orte_process_info.num_procs;
/* set the size of the pidmap storage so we minimize realloc's */
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_size(&jmap->pmap, jmap->num_procs))) {
ORTE_ERROR_LOG(ret);
error = "could not set array size for pidmap";
goto error;
}
/* construct the pidmap */
if (block) {
/* for each node, cycle through the ppn */
vpid = 0;
for (i=0; i < orte_nidmap.size; i++) {
if (NULL == (node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
continue;
}
/* compute the vpid for each proc on this node
* and add a pmap entry for it
*/
for (j=0; j < ppn[i]; j++) {
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = node->index;
pmap->local_rank = j;
pmap->node_rank = j;
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
ORTE_ERROR_LOG(ret);
error = "could not set pmap values";
goto error;
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int) node->index, node->name, (int)vpid));
vpid++;
}
}
} else if (cyclic) {
/* cycle across the nodes */
vpid = 0;
while (vpid < orte_process_info.num_procs) {
for (i=0; i < num_nodes && vpid < orte_process_info.num_procs; i++) {
if (0 < ppn[i]) {
if (NULL == (node = (orte_nid_t*)opal_pointer_array_get_item(&orte_nidmap, i))) {
/* this is an error */
error = "error initializing process map";
goto error;
}
pmap = OBJ_NEW(orte_pmap_t);
pmap->node = node->index;
pmap->local_rank = ppn[i]-1;
pmap->node_rank = ppn[i]-1;
if (ORTE_SUCCESS != (ret = opal_pointer_array_set_item(&jmap->pmap, vpid, pmap))) {
ORTE_ERROR_LOG(ret);
error = "could not set pmap values";
goto error;
}
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s node %d name %s rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(int) node->index, node->name, (int)vpid));
vpid++;
--ppn[i];
}
}
}
}
free(ppn);
/* ensure we pick the correct critical components */
putenv("OMPI_MCA_grpcomm=hier");
putenv("OMPI_MCA_routed=direct");
/* complete definition of process name */
ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_MIN);
/* get our local rank */
if (NULL == (envar = getenv("SLURM_LOCALID"))) {
error = "could not get SLURM_LOCALID";
goto error;
}
local_rank = strtol(envar, NULL, 10);
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
"%s local rank %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
local_rank));
/* set max procs */
if (orte_process_info.max_procs < orte_process_info.num_procs) {
orte_process_info.max_procs = orte_process_info.num_procs;
}
/* now use the default procedure to finish my setup */
if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
ORTE_ERROR_LOG(ret);
error = "orte_ess_base_app_setup";
goto error;
}
/* flag that we completed init */
app_init_complete = true;
return ORTE_SUCCESS;
error:
if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
}
return ret;
}
/*
 * Undo what rte_init() set up.
 *
 * The common application teardown only runs when rte_init() got all
 * the way to the end; the environment, nidmap/jobmap and topology
 * cleanup below always runs.
 *
 * @return ORTE_SUCCESS, or the status reported by
 *         orte_ess_base_app_finalize() if it failed.
 */
static int rte_finalize(void)
{
    /* envars pushed into environ during init; take them back out
     * so we leave that structure intact
     */
    static const char *const pushed_envars[] = {
        "OMPI_MCA_grpcomm",
        "OMPI_MCA_routed",
        "OMPI_MCA_orte_precondition_transports"
    };
    size_t k;
    int status = ORTE_SUCCESS;

    if (app_init_complete) {
        /* use the default procedure to finish */
        status = orte_ess_base_app_finalize();
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
        }
    }

    for (k = 0; k < sizeof(pushed_envars) / sizeof(pushed_envars[0]); ++k) {
        unsetenv(pushed_envars[k]);
    }

    /* take down the nidmap and jobmap arrays; per the original note
     * this call protects itself against being invoked before the
     * arrays were initialized
     */
    orte_util_nidmap_finalize();

#if OPAL_HAVE_HWLOC
    if (NULL != opal_hwloc_topology) {
        opal_hwloc_base_free_topology(opal_hwloc_topology);
        opal_hwloc_topology = NULL;
    }
#endif

    return status;
}
/*
 * Abort this process through the common application abort routine.
 *
 * When init detected SLURM-reserved ports (slurm20) and the failure
 * is ORTE_ERR_SOCKET_NOT_AVAILABLE, exit code 108 is used instead and
 * reporting is suppressed; otherwise the caller's values go through
 * untouched.
 *
 * @param error_code  status to abort with
 * @param report      forwarded to orte_ess_base_app_abort()
 */
static void rte_abort(int error_code, bool report)
{
    int exit_code = error_code;
    bool do_report = report;

    if (slurm20 && ORTE_ERR_SOCKET_NOT_AVAILABLE == error_code) {
        /* exit silently with a special error code for slurm 2.0 */
        exit_code = 108;
        do_report = false;
    }

    orte_ess_base_app_abort(exit_code, do_report);
}
/**
 * Discover the available resources.
 *
 * In order to fully support slurm, we need to be able to handle
 * node regexp/task_per_node strings such as:
 * foo,bar 5,3
 * foo 5
 * foo[2-10,12,99-105],bar,foobar[3-11] 2(x10),5,100(x16)
 *
 * Only the node-list half is handled here.  The expression is parsed
 * on a private copy, so the caller's string is left untouched.
 *
 * @param regexp  A node list expression from SLURM (e.g. the value of
 *                SLURM_STEP_NODELIST)
 * @param names   argv array that every discovered node name is
 *                appended to; on failure it may already hold the
 *                names found before the error
 *
 * @return ORTE_SUCCESS on success; ORTE_ERR_OUT_OF_RESOURCE if the
 *         working copy cannot be allocated; ORTE_ERR_BAD_PARAM if an
 *         entry is empty or a '[' has no matching ']'; otherwise the
 *         status reported by parse_ranges() or
 *         opal_argv_append_nosize().
 */
static int discover_nodes(char *regexp, char*** names)
{
    int i, j, len;
    int ret = ORTE_SUCCESS;
    char *base;
    char *orig;
    bool found_range = false;
    bool more_to_come = false;

    orig = base = strdup(regexp);
    if (NULL == base) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "%s ess:slurmd:discover: checking nodelist: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         regexp));

    do {
        /* Find the base */
        len = strlen(base);
        for (i = 0; i <= len; ++i) {
            if (base[i] == '[') {
                /* we found a range. this gets dealt with below */
                base[i] = '\0';
                found_range = true;
                break;
            }
            if (base[i] == ',') {
                /* we found a singleton node, and there are more to come */
                base[i] = '\0';
                found_range = false;
                more_to_come = true;
                break;
            }
            if (base[i] == '\0') {
                /* we found a singleton node */
                found_range = false;
                more_to_come = false;
                break;
            }
        }
        if(i == 0) {
            /* we found a special character at the beginning of the string */
            orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }

        if (found_range) {
            /* If we found a range, now find the end of the range */
            for (j = i; j < len; ++j) {
                if (base[j] == ']') {
                    base[j] = '\0';
                    break;
                }
            }
            if (j >= len) {
                /* we didn't find the end of the range - report it from
                 * the same help file as the other bad-value errors here
                 */
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                free(orig);
                return ORTE_ERR_BAD_PARAM;
            }

            /* base is now the name prefix; the text between the
             * brackets starts one past the '[' we overwrote
             */
            ret = parse_ranges(base, base + i + 1, names);
            if(ORTE_SUCCESS != ret) {
                orte_show_help("help-ras-slurm.txt", "slurm-env-var-bad-value", 1, regexp);
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            if(base[j + 1] == ',') {
                more_to_come = true;
                base = &base[j + 2];
            } else {
                more_to_come = false;
            }
        } else {
            /* If we didn't find a range, just add the node */
            OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                 "%s ess:slurmd:discover: found node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 base));
            if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) {
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            /* set base equal to the (possible) next base to look at */
            base = &base[i + 1];
        }
    } while(more_to_come);

    free(orig);

    /* All done */
    return ret;
}
/*
 * Split a bracket expression into its comma-separated pieces and
 * expand each piece with parse_range().
 *
 * @param base     The base text of the node name
 * @param ranges   The text between the brackets; may hold several
 *                 ranges (i.e. "1-3,10" or "5" or "9,0100-0130,250").
 *                 It is modified in place: every comma is overwritten
 *                 with a terminator.
 * @param names    An argv array to add the newly discovered nodes to
 *
 * @return ORTE_SUCCESS, or the first failure reported by parse_range()
 */
static int parse_ranges(char *base, char *ranges, char ***names)
{
    char *const stop = ranges + strlen(ranges);
    char *piece = ranges;
    char *cursor;
    int rc;

    /* Commas separate the individual ranges */
    for (cursor = ranges; cursor < stop; ++cursor) {
        if (',' != *cursor) {
            continue;
        }
        *cursor = '\0';
        rc = parse_range(base, piece, names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        piece = cursor + 1;
    }

    /* Whatever follows the final comma (or the whole string when
     * there was no comma) is the last range, if it exists
     */
    if (piece < stop) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                             "%s ess:slurmd:discover: parse range %s (2)",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             piece));
        rc = parse_range(base, piece, names);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* All done */
    return ORTE_SUCCESS;
}
/*
* Parse a single range in a set and add the full names of the nodes
* found to the names argv
*
* @param base The base text of the node name
* @param *ranges A pointer to a single range. (i.e. "1-3" or "5")
* @param ***names An argv array to add the newly discovered nodes to
*/
static int parse_range(char *base, char *range, char ***names)
{
char *str, temp1[BUFSIZ];
size_t i, j, start, end;
size_t base_len, len, num_len;
size_t str_start, str_end;
size_t num_str_len;
bool found;
int ret;
len = strlen(range);
base_len = strlen(base);
/* Silence compiler warnings; start and end are always assigned
properly, below */
start = end = 0;
/* Look for the beginning of the first number */
for (found = false, i = 0; i < len; ++i) {
if (isdigit((int) range[i])) {
if (!found) {
str_start = i;
start = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Look for the end of the first number */
for (found = false, num_str_len = 0; i < len; ++i, ++num_str_len) {
if (!isdigit((int) range[i])) {
break;
}
}
/* Was there no range, just a single number? */
if (i >= len) {
str_end = len;
end = start;
found = true;
}
/* Nope, there was a range. Look for the beginning of the second
number */
else {
str_end = i - 1;
for (; i < len; ++i) {
if (isdigit((int) range[i])) {
end = atoi(range + i);
found = true;
break;
}
}
}
if (!found) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
return ORTE_ERR_NOT_FOUND;
}
/* Make strings for all values in the range */
len = base_len + num_str_len + 32;
str = malloc(len);
if (NULL == str) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
return ORTE_ERR_OUT_OF_RESOURCE;
}
strcpy(str, base);
for (i = start; i <= end; ++i) {
str[base_len] = '\0';
snprintf(temp1, BUFSIZ - 1, "%lu", (long) i);
/* Do we need zero pading? */
if ((num_len = strlen(temp1)) < num_str_len) {
for (j = base_len; j < base_len + (num_str_len - num_len); ++j) {
str[j] = '0';
}
str[j] = '\0';
}
strcat(str, temp1);
ret = opal_argv_append_nosize(names, str);
if(ORTE_SUCCESS != ret) {
ORTE_ERROR_LOG(ret);
free(str);
return ret;
}
}
free(str);
/* All done */
return ORTE_SUCCESS;
}