1
1
openmpi/orte/mca/plm/slurmd/plm_slurmd_module.c
Ralph Castain 2ed0e60321 Bring some sanity to the exit code returned by mpirun. Ensure that we provide a non-zero code if something goes wrong, including someone exiting after calling mpi_init without calling mpi_finalize.
Jeff is preparing an (undoubtedly lengthy) explanation/matrix of how these codes are determined for the OMPI FAQ.

This commit was SVN r17879.
2008-03-19 19:00:51 +00:00

628 строки
20 KiB
C

/*
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*
* These symbols are in a file by themselves to provide nice linker
* semantics. Since linkers generally pull in symbols by object
* files, keeping these symbols as the only symbols in this file
* prevents utility programs such as "ompi_info" from having to import
* entire components just to query their version and parameters.
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <signal.h>
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#include "opal/mca/installdirs/installdirs.h"
#include "opal/util/argv.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/util/show_help.h"
#include "opal/util/basename.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/runtime/opal_progress.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/plm/base/plm_private.h"
#include "plm_slurmd.h"
/*
* Local functions
*/
static int plm_slurmd_init(void);
static int plm_slurmd_launch_job(orte_job_t *jdata);
static int plm_slurmd_terminate_job(orte_jobid_t jobid);
static int plm_slurmd_terminate_orteds(void);
static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal);
static int plm_slurmd_finalize(void);
static int plm_slurmd_start_proc(int argc, char **argv, char **env,
char *prefix);
/*
* Global variable
*/
orte_plm_base_module_1_0_0_t orte_plm_slurmd_module = {
plm_slurmd_init,
orte_plm_base_set_hnp_name,
plm_slurmd_launch_job,
plm_slurmd_terminate_job,
plm_slurmd_terminate_orteds,
plm_slurmd_signal_job,
plm_slurmd_finalize
};
/*
* Local variables
*/
static pid_t srun_pid = 0;
static orte_jobid_t active_job = ORTE_JOBID_INVALID;
static bool failed_launch;
/**
* Init the module
*/
static int plm_slurmd_init(void)
{
int rc;
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
ORTE_ERROR_LOG(rc);
}
return rc;
}
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
static int plm_slurmd_launch_job(orte_job_t *jdata)
{
orte_app_context_t **apps;
orte_node_t **nodes;
orte_std_cntr_t n;
orte_job_map_t *map;
char *param;
char **argv = NULL;
int argc;
int rc;
char *tmp;
char** env = NULL;
char* var;
char *nodelist_flat;
char **nodelist_argv;
char *rml_uri;
int nodelist_argc;
char **custom_strings;
int num_args, i;
char *cur_prefix;
struct timeval launchstart, launchstop;
if (orte_timing) {
if (0 != gettimeofday(&launchstart, NULL)) {
opal_output(0, "plm_slurmd: could not obtain job start time");
launchstart.tv_sec = 0;
launchstart.tv_usec = 0;
}
}
/* indicate the state of the launch */
failed_launch = true;
/* create a jobid for this job */
if (ORTE_SUCCESS != (rc = orte_plm_base_create_jobid(&jdata->jobid))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: launching job %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jdata->jobid)));
/* setup the job */
if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set the active jobid */
active_job = jdata->jobid;
/* Get the map for this job */
if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
rc = ORTE_ERR_NOT_FOUND;
goto cleanup;
}
apps = (orte_app_context_t**)jdata->apps->addr;
nodes = (orte_node_t**)map->nodes->addr;
/* we are going to ignore the daemons and just launch
* the apps directly. This means we cannot use the
* base function to launch_apps as that function assumes
* we are doing it via our daemons
*/
/* are there any procs to actually launch? */
if (0 == jdata->num_procs) {
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: no procs to launch!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
goto cleanup;
}
/*
* start building argv array
*/
argv = NULL;
argc = 0;
/*
* SLURM srun OPTIONS
*/
/* add the srun command */
opal_argv_append(&argc, &argv, "srun");
/* Append user defined arguments to srun */
if ( NULL != mca_plm_slurmd_component.custom_args ) {
custom_strings = opal_argv_split(mca_plm_slurmd_component.custom_args, ' ');
num_args = opal_argv_count(custom_strings);
for (i = 0; i < num_args; ++i) {
opal_argv_append(&argc, &argv, custom_strings[i]);
}
opal_argv_free(custom_strings);
}
asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
opal_argv_append(&argc, &argv, tmp);
free(tmp);
asprintf(&tmp, "--ntasks=%lu", (unsigned long) jdata->num_procs);
opal_argv_append(&argc, &argv, tmp);
free(tmp);
/* create nodelist */
nodelist_argv = NULL;
nodelist_argc = 0;
for (n=0; n < map->num_nodes; n++ ) {
/* add it to the list of nodes upon which
* we need to launch a proc
*/
opal_argv_append(&nodelist_argc, &nodelist_argv, nodes[n]->name);
}
if (0 == opal_argv_count(nodelist_argv)) {
opal_show_help("help-plm-slurmd.txt", "no-hosts-in-list", true);
rc = ORTE_ERR_FAILED_TO_START;
goto cleanup;
}
nodelist_flat = opal_argv_join(nodelist_argv, ',');
opal_argv_free(nodelist_argv);
asprintf(&tmp, "--nodelist=%s", nodelist_flat);
opal_argv_append(&argc, &argv, tmp);
free(tmp);
OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
"%s plm:slurmd: launching on nodes %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
/*
* APP OPTIONS
*/
/* add the executable (as specified by user) */
opal_argv_append(&argc, &argv, apps[0]->app);
/* add the user's cmd args */
for (i=1; NULL != apps[0]->argv[i]; i++) {
opal_argv_append(&argc, &argv, apps[0]->argv[i]);
}
/* setup the environment - since there will be no daemon, we have
* tell the the apps everything they need to know via the environment,
* not command line args. Hence, we cannot use the plm base function
* to setup the info
*/
if (NULL != apps[0]->env) {
env = opal_environ_merge(environ, apps[0]->env);
} else {
env = opal_argv_copy(environ);
}
/* add the nodelist */
var = mca_base_param_environ_variable("orte", "slurm", "nodelist");
opal_setenv(var, nodelist_flat, true, &env);
free(nodelist_flat);
free(var);
/* tell the proc what SDS component to use */
var = mca_base_param_environ_variable("sds", NULL, NULL);
opal_setenv(var, "slurmd", true, &env);
free(var);
/* pass the jobid */
var = mca_base_param_environ_variable("orte", "sds", "jobid");
orte_util_convert_jobid_to_string(&param, jdata->jobid);
opal_setenv(var, param, true, &env);
free(var);
free(param);
/* pass the starting vpid */
var = mca_base_param_environ_variable("orte", "sds", "vpid");
opal_setenv(var, "0", true, &env);
free(var);
/* pass the total number of procs that will be in the system */
var = mca_base_param_environ_variable("orte", "sds", "num_procs");
asprintf(&param, "%lu", (unsigned long)(jdata->num_procs));
opal_setenv(var, param, true, &env);
free(var);
free(param);
/* pass the uri of the hnp */
var = mca_base_param_environ_variable("orte", "hnp", "uri");
rml_uri = orte_rml.get_contact_info();
opal_setenv(var, rml_uri, true, &env);
free(var);
/* make the proc think the local daemon is the hnp */
var = mca_base_param_environ_variable("orte", "local_daemon", "uri");
opal_setenv(var, rml_uri, true, &env);
free(var);
free(rml_uri);
/* order the proc to use the unity routed component */
var = mca_base_param_environ_variable("routed", NULL, NULL);
opal_setenv(var, "unity", true, &env);
free(var);
if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
param = opal_argv_join(argv, ' ');
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: final top-level argv:\n\t%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
(NULL == param) ? "NULL" : param));
if (NULL != param) free(param);
}
/* Copy the prefix-directory specified in the
corresponding app_context. If there are multiple,
different prefix's in the app context, complain (i.e., only
allow one --prefix option for the entire slurmd run -- we
don't support different --prefix'es for different nodes in
the SLURM plm) */
cur_prefix = NULL;
for (n=0; n < jdata->num_apps; n++) {
char * app_prefix_dir = apps[n]->prefix_dir;
/* Check for already set cur_prefix_dir -- if different,
complain */
if (NULL != app_prefix_dir) {
if (NULL != cur_prefix &&
0 != strcmp (cur_prefix, app_prefix_dir)) {
opal_show_help("help-plm-slurmd.txt", "multiple-prefixes",
true, cur_prefix, app_prefix_dir);
return ORTE_ERR_FATAL;
}
/* If not yet set, copy it; iff set, then it's the
same anyway */
if (NULL == cur_prefix) {
cur_prefix = strdup(app_prefix_dir);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: Set prefix:%s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
cur_prefix));
}
}
}
/* exec the proc(s) */
if (ORTE_SUCCESS != (rc = plm_slurmd_start_proc(argc, argv, env, cur_prefix))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* do NOT wait for srun to complete. Srun only completes when the processes
* it starts complete. Instead, we'll catch
* any srun failures and deal with them elsewhere
*/
/* declare the launch a success */
failed_launch = false;
if (orte_timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "plm_slurmd: could not obtain stop time");
} else {
opal_output(0, "plm_slurmd: total job launch time is %ld usec",
(launchstop.tv_sec - launchstart.tv_sec)*1000000 +
(launchstop.tv_usec - launchstart.tv_usec));
}
}
cleanup:
if (NULL != argv) {
opal_argv_free(argv);
}
if (NULL != env) {
opal_argv_free(env);
}
/* check for failed launch - if so, force terminate */
if (failed_launch) {
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
orte_wakeup();
}
return rc;
}
static int plm_slurmd_terminate_job(orte_jobid_t jobid)
{
if (0 != srun_pid) {
kill(srun_pid, -9);
}
return ORTE_SUCCESS;
}
/**
* Terminate the orteds for a given job
*/
static int plm_slurmd_terminate_orteds(void)
{
orte_job_t *daemons;
int data=1;
/* get the daemon job object */
if (NULL != (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
/* record all daemons as terminated */
daemons->num_terminated = daemons->num_procs;
daemons->state = ORTE_JOB_STATE_TERMINATED;
}
/* trip the orted exit trigger */
write(orteds_exit, &data, sizeof(int));
opal_progress();
return ORTE_SUCCESS;
}
/**
* Signal all the processes in the child srun by sending the signal directly to it
*/
static int plm_slurmd_signal_job(orte_jobid_t jobid, int32_t signal)
{
if (0 != srun_pid) {
kill(srun_pid, (int)signal);
}
return ORTE_SUCCESS;
}
static int plm_slurmd_finalize(void)
{
int rc;
/* cleanup any pending recvs */
if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
ORTE_ERROR_LOG(rc);
}
return ORTE_SUCCESS;
}
static void srun_wait_cb(pid_t pid, int status, void* cbdata){
/* According to the SLURM folks, srun always returns the highest exit
code of our remote processes. Thus, a non-zero exit status doesn't
necessarily mean that srun failed - it could be that a proc returned
a non-zero exit status. Of course, that means the job failed(!), so
the end result is the same.
As a result, we really can't do much with the exit status itself - it
could be something in errno (if srun itself failed), or it could be
something returned by a proc, or it could be something returned by
the OS (e.g., couldn't find the executable). Somebody is welcome
to sort out all the options and pretty-print a better error message. For
now, though, the only thing that really matters is that
srun terminated. Report the error (if there is one) and make sure that orterun
wakes up!
*/
orte_job_t *jdata;
if (NULL == (jdata = orte_get_job_data_object(active_job))) {
/* bad jobid */
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
orte_wakeup();
return;
}
if (0 != status) {
if (failed_launch) {
/* we have a problem during launch */
opal_output(0, "ERROR: srun failed to start the specified procs.");
opal_output(0, "ERROR: This could be due to an inability to find the executable");
opal_output(0, "ERROR: on one or more remote nodes, lack of authority to execute");
opal_output(0, "ERROR: on one or more specified nodes, or other factors.");
/* report that the launch has failed so we break out of the daemon
* callback receive and exit
*/
jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
} else {
/* a proc must have died unexpectedly after launch - report
* that the job has failed so we exit
*/
jdata->state = ORTE_JOB_STATE_ABORTED;
}
} else {
jdata->state = ORTE_JOB_STATE_TERMINATED;
}
orte_wakeup();
}
static int plm_slurmd_start_proc(int argc, char **argv, char **env,
char *prefix)
{
int fd;
char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
if (NULL == exec_argv) {
return ORTE_ERR_NOT_FOUND;
}
srun_pid = fork();
if (-1 == srun_pid) {
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
return ORTE_ERR_SYS_LIMITS_CHILDREN;
}
if (0 == srun_pid) { /* child */
char *bin_base = NULL, *lib_base = NULL;
/* Figure out the basenames for the libdir and bindir. There
is a lengthy comment about this in plm_rsh_module.c
explaining all the rationale for how / why we're doing
this. */
lib_base = opal_basename(opal_install_dirs.libdir);
bin_base = opal_basename(opal_install_dirs.bindir);
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. */
if (NULL != prefix) {
char *oldenv, *newenv;
/* Reset PATH */
oldenv = getenv("PATH");
if (NULL != oldenv) {
asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
} else {
asprintf(&newenv, "%s/%s", prefix, bin_base);
}
opal_setenv("PATH", newenv, true, &env);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: reset PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
free(newenv);
/* Reset LD_LIBRARY_PATH */
oldenv = getenv("LD_LIBRARY_PATH");
if (NULL != oldenv) {
asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
} else {
asprintf(&newenv, "%s/%s", prefix, lib_base);
}
opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
"%s plm:slurmd: reset LD_LIBRARY_PATH: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
newenv));
free(newenv);
}
fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
if(fd > 0) {
dup2(fd, 0);
}
/* When not in debug mode and --debug-daemons was not passed,
* tie stdout/stderr to dev null so we don't see messages from orted */
if (0 >= opal_output_get_verbosity(orte_plm_globals.output) &&
!orte_debug_daemons_flag) {
if (fd >= 0) {
if (fd != 1) {
dup2(fd,1);
}
if (fd != 2) {
dup2(fd,2);
}
}
}
if (fd > 2) {
close(fd);
}
/* get the srun process out of orterun's process group so that
signals sent from the shell (like those resulting from
cntl-c) don't get sent to srun */
setpgid(0, 0);
execve(exec_argv, argv, env);
opal_output(0, "plm:slurmd:start_proc: exec failed");
/* don't return - need to exit - returning would be bad -
we're not in the calling process anymore */
exit(1);
} else { /* parent */
/* just in case, make sure that the srun process is not in our
process group any more. Stevens says always do this on both
sides of the fork... */
setpgid(srun_pid, srun_pid);
/* setup the waitpid so we can find out if srun succeeds! */
orte_wait_cb(srun_pid, srun_wait_cb, NULL);
free(exec_argv);
}
return ORTE_SUCCESS;
}