
Complete the update of the rsh pls to handle failed-to-start

This commit was SVN r14531.
This commit is contained in:
Ralph Castain 2007-04-26 15:07:40 +00:00
parent ce32b918da
commit bca2de3a57
2 changed files with 109 additions and 196 deletions
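In outline, the fix gives orte_pls_rsh_launch() a single failure path: a failed_launch flag starts out true, every error jumps to a shared cleanup label, and only a successful launch clears the flag before falling through. On failure, the cleanup block marks the job ORTE_JOB_STATE_FAILED_TO_START and wakes orterun so it can exit with a non-zero status. A minimal standalone sketch of that control-flow pattern; set_job_state and wakeup_job are hypothetical stand-ins, not the real orte_smr/orte_wakeup API:

    #include <stdbool.h>
    #include <stdio.h>

    #define MY_SUCCESS     0
    #define MY_ERR_FATAL (-1)

    /* hypothetical stand-ins for orte_smr.set_job_state() / orte_wakeup() */
    static int set_job_state(int jobid, const char *state)
    {
        printf("job %d -> %s\n", jobid, state);
        return MY_SUCCESS;
    }

    static int wakeup_job(int jobid)
    {
        printf("waking waiters on job %d\n", jobid);
        return MY_SUCCESS;
    }

    static int launch(int jobid, const char *daemon_path)
    {
        int rc;
        bool failed_launch = true;    /* assume failure until proven otherwise */

        if (NULL == daemon_path) {    /* every error path jumps to cleanup */
            rc = MY_ERR_FATAL;
            goto cleanup;
        }

        /* ... fork/exec one daemon per node here ... */

        /* get here only if the launch went okay */
        failed_launch = false;
        rc = MY_SUCCESS;

    cleanup:
        /* release resources unconditionally, then force termination on failure */
        if (failed_launch) {
            set_job_state(jobid, "FAILED_TO_START"); /* non-zero orterun status */
            wakeup_job(jobid);                       /* let orterun exit */
        }
        return rc;
    }

    int main(void)
    {
        return (MY_SUCCESS == launch(1, NULL)) ? 0 : 1;
    }

The same flag-plus-label idiom lets the cleanup block free the map, argv, and strings unconditionally, whether the function fails early or late.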

View file

@@ -20,7 +20,7 @@
#
[no-local-orted]
The rsh PLS component was not able to find the executable "orted" in
your PATH or in the directory where Open MPI was initially installed,
your PATH or in the directory where Open MPI/OpenRTE was initially installed,
and therefore cannot continue.
For reference, your current PATH is:

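The module change in the second file picks this wording up: the launcher first looks for orted with opal_path_findv(), and only falls back to orte_pls_rsh_fill_exec_path() (which prints the help text above on failure) when the PATH search comes up empty. A rough standalone sketch of that kind of PATH lookup; find_on_path is a hypothetical helper, not the opal_path_findv API:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /* hypothetical helper: walk $PATH for an executable; caller frees result */
    static char *find_on_path(const char *prog)
    {
        const char *path = getenv("PATH");
        if (NULL == path) return NULL;

        char *copy = strdup(path);
        char *save = NULL;
        for (char *dir = strtok_r(copy, ":", &save); NULL != dir;
             dir = strtok_r(NULL, ":", &save)) {
            char candidate[4096];
            snprintf(candidate, sizeof(candidate), "%s/%s", dir, prog);
            if (0 == access(candidate, X_OK)) {
                char *found = strdup(candidate);
                free(copy);
                return found;
            }
        }
        free(copy);
        return NULL;   /* caller falls back to the install bindir, or errors */
    }

    int main(void)
    {
        char *p = find_on_path("orted");
        printf("orted: %s\n", p ? p : "not found on PATH");
        free(p);
        return 0;
    }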
View file

@@ -74,6 +74,7 @@
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_wakeup.h"
#include "orte/dss/dss.h"
#include "orte/mca/ns/ns.h"
@@ -135,13 +136,10 @@ static const char * orte_pls_rsh_shell_name[] = {
};
/* local global storage of timing variables */
static unsigned long mintime=999999999, miniter, maxtime=0, maxiter;
static float avgtime=0.0;
static struct timeval *launchstart;
static struct timeval joblaunchstart, joblaunchstop;
/* local global storage of the list of active daemons */
static opal_list_t active_daemons;
/* global storage of active jobid being launched */
static orte_jobid_t active_job=ORTE_JOBID_INVALID;
/**
@@ -265,63 +263,18 @@ static int orte_pls_rsh_fill_exec_path ( char ** exec_path)
static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
{
orte_pls_daemon_info_t *info = (orte_pls_daemon_info_t*) cbdata;
orte_mapped_node_t *node;
orte_mapped_proc_t *proc;
opal_list_item_t *item;
int rc;
unsigned long deltat;
struct timeval launchstop;
/* if ssh exited abnormally, set the child processes to aborted
and print something useful to the user. The usual reasons for
ssh to exit abnormally all are a pretty good indication that
the child processes aren't going to start up properly.
This should somehow be pushed up to the calling level, but we
don't really have a way to do that just yet.
*/
if (!WIFEXITED(status) || 0 != WEXITSTATUS(status)) {
/* get the mapping for our node so we can cancel the right things */
rc = orte_rmaps.get_node_map(&node, info->cell,
info->nodename, info->active_job);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* set state of all processes associated with the daemon as
terminated */
for(item = opal_list_get_first(&node->procs);
item != opal_list_get_end(&node->procs);
item = opal_list_get_next(item)) {
proc = (orte_mapped_proc_t*) item;
rc = orte_smr.set_proc_state(&(proc->name),
ORTE_PROC_STATE_ABORTED, status);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
}
/* Clean up the session directory as if we were the
process itself. This covers the case where the
process died abnormally and didn't cleanup its own
session directory. */
orte_session_dir_finalize(&(proc->name));
}
OBJ_RELEASE(node);
cleanup:
/* tell the user something went wrong */
opal_output(0, "ERROR: A daemon on node %s failed to start as expected.",
info->nodename);
opal_output(0, "ERROR: A daemon failed to start as expected.");
opal_output(0, "ERROR: There may be more information available from");
opal_output(0, "ERROR: the remote shell (see above).");
if (WIFEXITED(status)) {
opal_output(0, "ERROR: The daemon exited unexpectedly with status %d.",
WEXITSTATUS(status));
} else if (WIFSIGNALED(status)) {
#ifdef WCOREDUMP
if (WCOREDUMP(status)) {
@@ -336,38 +289,23 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
} else {
opal_output(0, "No extra status information is available: %d.", status);
}
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
/* tell the system that this daemon is gone */
if (ORTE_SUCCESS != (rc = orte_pls_base_remove_daemon(info))) {
/* The usual reasons for ssh to exit abnormally all are a pretty good
indication that the child processes aren't going to start up properly.
Set the job state to indicate we failed to launch so orterun's exit status
will be non-zero and forcibly terminate the job so orterun can exit
*/
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(active_job, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(active_job))) {
ORTE_ERROR_LOG(rc);
}
/* remove the daemon from our local list */
opal_list_remove_item(&active_daemons, &info->super);
OBJ_RELEASE(info);
OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
} /* if abnormal exit */
/* release any waiting threads */
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
/* first check timing request */
if (mca_pls_rsh_component.timing) {
if (0 != gettimeofday(&launchstop, NULL)) {
opal_output(0, "pls_rsh: could not obtain stop time");
} else {
deltat = (launchstop.tv_sec - launchstart[info->name->vpid].tv_sec)*1000000 +
(launchstop.tv_usec - launchstart[info->name->vpid].tv_usec);
avgtime = avgtime + deltat;
if (deltat < mintime) {
mintime = deltat;
miniter = (unsigned long)info->name->vpid;
}
if (deltat > maxtime) {
maxtime = deltat;
maxiter = (unsigned long)info->name->vpid;
}
}
}
if (mca_pls_rsh_component.num_children-- >=
mca_pls_rsh_component.num_concurrent ||
@@ -380,19 +318,9 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
opal_output(0, "pls_rsh: could not obtain job launch stop time");
} else {
deltat = (joblaunchstop.tv_sec - joblaunchstart.tv_sec)*1000000 +
(joblaunchstop.tv_usec - joblaunchstart.tv_usec);
opal_output(0, "pls_rsh: total time to launch job is %lu usec", deltat);
if (mintime < 999999999) {
/* had at least one non-local node */
avgtime = avgtime/opal_list_get_size(&active_daemons);
opal_output(0, "pls_rsh: average time to launch one daemon %f usec", avgtime);
opal_output(0, "pls_rsh: min time to launch a daemon was %lu usec for iter %lu", mintime, miniter);
opal_output(0, "pls_rsh: max time to launch a daemon was %lu usec for iter %lu", maxtime, maxiter);
} else {
opal_output(0, "No nonlocal launches to report for timing info");
}
}
free(launchstart);
}
OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock);
@@ -404,9 +332,13 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
* for launching the application.
*/
/* When working in this function, ALWAYS jump to "cleanup" if
* you encounter an error so that orterun will be woken up and
* the job can cleanly terminate
*/
int orte_pls_rsh_launch(orte_jobid_t jobid)
{
orte_job_map_t *map;
orte_job_map_t *map=NULL;
opal_list_item_t *n_item;
orte_mapped_node_t *rmaps_node;
orte_std_cntr_t num_nodes;
@@ -426,7 +358,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
bool remote_sh = false, remote_csh = false;
bool local_sh = false, local_csh = false;
char *lib_base = NULL, *bin_base = NULL;
orte_pls_daemon_info_t *dmn;
bool failed_launch = true;
if (mca_pls_rsh_component.timing) {
if (0 != gettimeofday(&joblaunchstart, NULL)) {
@@ -436,12 +368,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
}
/* setup a list that will contain the info for all the daemons
* so we can store it on the registry when done and use it
* locally to track their state
*/
OBJ_CONSTRUCT(&active_daemons, opal_list_t);
/* set the active jobid */
active_job = jobid;
/* Get the map for this job
* We need the entire mapping for a couple of reasons:
* - need the prefix to start with.
@@ -451,8 +380,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
rc = orte_rmaps.get_job_map(&map, jobid);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
OBJ_DESTRUCT(&active_daemons);
return rc;
goto cleanup;
}
/* if the user requested that we re-use daemons,
@@ -461,18 +389,16 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
if (orte_pls_base.reuse_daemons) {
if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return rc;
goto cleanup;
}
}
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (0 == num_nodes) {
/* nothing left to do - just return */
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_SUCCESS;
failed_launch = false;
rc = ORTE_SUCCESS;
goto cleanup;
}
if (mca_pls_rsh_component.debug_daemons &&
@@ -492,9 +418,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
*/
opal_show_help("help-pls-rsh.txt", "deadlock-params",
true, mca_pls_rsh_component.num_concurrent, num_nodes);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_ERR_FATAL;
rc = ORTE_ERR_FATAL;
goto cleanup;
}
/*
@@ -521,7 +446,8 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
*/
if (num_nodes == 0) {
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
return ORTE_ERR_BAD_PARAM;
rc = ORTE_ERR_BAD_PARAM;
goto cleanup;
}
rc = orte_ns.reserve_range(0, num_nodes, &vpid);
if (ORTE_SUCCESS != rc) {
@@ -599,7 +525,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
return rc;
goto cleanup;
}
switch (shell) {
@@ -683,10 +609,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
/*
* Iterate through each of the nodes
*/
if (mca_pls_rsh_component.timing) {
/* allocate space to track the start times */
launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval));
}
for(n_item = opal_list_get_first(&map->nodes);
n_item != opal_list_get_end(&map->nodes);
@@ -698,17 +620,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
rmaps_node = (orte_mapped_node_t*)n_item;
if (mca_pls_rsh_component.timing) {
if (0 != gettimeofday(&launchstart[vpid], NULL)) {
opal_output(0, "pls_rsh: could not obtain start time");
}
}
/* new daemon - setup to record its info */
dmn = OBJ_NEW(orte_pls_daemon_info_t);
dmn->active_job = jobid;
opal_list_append(&active_daemons, &dmn->super);
/* setup node name */
free(argv[node_name_index1]);
if (NULL != rmaps_node->username &&
@@ -722,9 +633,6 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
free(argv[node_name_index2]);
argv[node_name_index2] = strdup(rmaps_node->nodename);
/* save it in the daemon info */
dmn->nodename = strdup(rmaps_node->nodename);
/* initialize daemons process name */
rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid);
if (ORTE_SUCCESS != rc) {
@@ -732,25 +640,12 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
goto cleanup;
}
/* save it in the daemon info */
dmn->cell = rmaps_node->cell;
if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
/* fork a child to exec the rsh/ssh session */
/* set the process state to "launched" */
if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) {
ORTE_ERROR_LOG(rc);
goto cleanup;
}
pid = fork();
if (pid < 0) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
rc = ORTE_ERR_SYS_LIMITS_CHILDREN;
goto cleanup;
}
@@ -760,18 +655,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
char** env;
char* var;
long fd, fdmax = sysconf(_SC_OPEN_MAX);
if (mca_pls_rsh_component.debug) {
opal_output(0, "pls:rsh: launching on node %s\n",
rmaps_node->nodename);
}
/* We don't need to sense an oversubscribed condition and set the sched_yield
* for the node as we are only launching the daemons at this time. The daemons
* are now smart enough to set the oversubscribed condition themselves when
* they launch the local procs.
*/
/* Is this a local launch?
*
* Not all node names may be resolvable (if we found
@@ -787,22 +682,20 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
opal_output(0, "pls:rsh: %s is a LOCAL node\n",
rmaps_node->nodename);
}
if (mca_pls_rsh_component.timing) {
/* since this is a local launch, the daemon will never reach
* the waitpid callback - so set the start value to
* something nonsensical
*/
launchstart[vpid].tv_sec = 0;
launchstart[vpid].tv_usec = 0;
}
exec_path = opal_path_findv(argv[local_exec_index], 0, environ, NULL);
if (NULL == exec_path && NULL == prefix_dir) {
rc = orte_pls_rsh_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
exit(-1); /* the forked process MUST exit */
/* don't normally ERROR_LOG this problem as the function has already
* printed out a nice error message for us - do the ERROR_LOG only
* when we are in debug mode so we can see where it occurred
*/
if (mca_pls_rsh_component.debug) {
ORTE_ERROR_LOG(rc);
}
exit(-1); /* the forked process MUST exit */
}
} else {
if (NULL != prefix_dir) {
@@ -812,19 +705,25 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
if (NULL == exec_path) {
rc = orte_pls_rsh_fill_exec_path (&exec_path);
if (ORTE_SUCCESS != rc) {
ORTE_ERROR_LOG(rc);
/* don't normally ERROR_LOG this problem as the function has already
* printed out a nice error message for us - do the ERROR_LOG only
* when we are in debug mode so we can see where it occurred
*/
if (mca_pls_rsh_component.debug) {
ORTE_ERROR_LOG(rc);
}
exit(-1); /* the forked process MUST exit */
}
}
}
/* If we have a prefix, then modify the PATH and
LD_LIBRARY_PATH environment variables. We're
already in the child process, so it's ok to modify
environ. */
if (NULL != prefix_dir) {
char *oldenv, *newenv;
/* Reset PATH */
newenv = opal_os_path( false, prefix_dir, bin_base, NULL );
oldenv = getenv("PATH");
@@ -839,7 +738,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
opal_output(0, "pls:rsh: reset PATH: %s", newenv);
}
free(newenv);
/* Reset LD_LIBRARY_PATH */
newenv = opal_os_path( false, prefix_dir, lib_base, NULL );
oldenv = getenv("LD_LIBRARY_PATH");
@@ -856,40 +755,40 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
free(newenv);
}
/* Since this is a local execution, we need to
potentially whack the final ")" in the argv (if
sh/csh conditionals, from above). Note that we're
modifying the argv[] in the child process, so
there's no need to save this and restore it
afterward -- the parent's argv[] is unmodified. */
if (NULL != argv[local_exec_index_end]) {
free(argv[local_exec_index_end]);
argv[local_exec_index_end] = NULL;
}
/* tell the daemon to setup its own process session/group */
opal_argv_append(&argc, &argv, "--set-sid");
exec_argv = &argv[local_exec_index];
/* Finally, chdir($HOME) because we're making the
assumption that this is what will happen on
remote nodes (via rsh/ssh). This allows a user
to specify a path that is relative to $HOME for
both the cwd and argv[0] and it will work on
all nodes -- including the local node.
Otherwise, it would work on remote nodes and
not the local node. If the user does not start
in $HOME on the remote nodes... well... let's
hope they start in $HOME. :-) */
var = getenv("HOME");
if (NULL != var) {
if (mca_pls_rsh_component.debug) {
opal_output(0, "pls:rsh: changing to directory %s", var);
}
/* Ignore errors -- what are we going to do?
(and we ignore errors on the remote nodes
in the fork pls, so this is consistent) */
chdir(var);
}
} else {
@@ -899,7 +798,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
exec_argv = argv;
exec_path = strdup(mca_pls_rsh_component.agent_path);
if (NULL != prefix_dir) {
char *opal_prefix = getenv("OPAL_PREFIX");
if (remote_sh) {
@@ -921,7 +820,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
Must be very careful about obeying
[t]csh's order of evaluation and not
using a variable before it is defined.
See this thread for more details:
http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */
asprintf (&argv[local_exec_index],
"%s%s%s set path = ( %s/%s $path ) ; "
@@ -943,7 +842,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
}
}
/* setup process name */
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
@@ -952,18 +851,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
free(argv[proc_name_index]);
argv[proc_name_index] = strdup(name_string);
if (!mca_pls_rsh_component.debug) {
/* setup stdin */
int fd = open("/dev/null", O_RDWR);
dup2(fd, 0);
close(fd);
}
/* close all file descriptors w/ exception of stdin/stdout/stderr */
for(fd=3; fd<fdmax; fd++)
close(fd);
/* Set signal handlers back to the default. Do this close
to the execve() because the event library may (and likely
will) reset them. If we don't do this, the event
@@ -1020,7 +919,7 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
/* setup callback on sigchild - wait until setup above is complete
* as the callback can occur in the call to orte_wait_cb
*/
orte_wait_cb(pid, orte_pls_rsh_wait_daemon, dmn);
orte_wait_cb(pid, orte_pls_rsh_wait_daemon, NULL);
/* if required - add delay to avoid problems w/ X11 authentication */
if (mca_pls_rsh_component.debug && mca_pls_rsh_component.delay) {
@@ -1030,14 +929,13 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
}
free(name);
}
/* all done, so store the daemon info on the registry */
if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&active_daemons))) {
ORTE_ERROR_LOG(rc);
}
/* get here if launch went okay */
failed_launch = false;
cleanup:
OBJ_RELEASE(map);
if (NULL != map) {
OBJ_RELEASE(map);
}
if (NULL != lib_base) {
free(lib_base);
@@ -1046,8 +944,23 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
free(bin_base);
}
if (NULL != jobid_string) free(jobid_string); /* done with this variable */
if (NULL != argv) opal_argv_free(argv);
if (NULL != jobid_string) {
free(jobid_string); /* done with this variable */
}
if (NULL != argv) {
opal_argv_free(argv);
}
/* check for failed launch - if so, force terminate */
if (failed_launch) {
if (ORTE_SUCCESS != (rc = orte_smr.set_job_state(jobid, ORTE_JOB_STATE_FAILED_TO_START))) {
ORTE_ERROR_LOG(rc);
}
if (ORTE_SUCCESS != (rc = orte_wakeup(jobid))) {
ORTE_ERROR_LOG(rc);
}
}
return rc;
}
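For reference, the status decoding in orte_pls_rsh_wait_daemon() follows the standard POSIX waitpid() macros seen in the diff: WIFEXITED/WEXITSTATUS for a normal exit, WIFSIGNALED (plus WCOREDUMP where the platform defines it) for death by signal. A self-contained sketch of the same decoding, using /bin/false as a stand-in for a daemon that fails to start:

    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        pid_t pid = fork();
        if (pid < 0) { perror("fork"); return 1; }
        if (0 == pid) {
            execlp("false", "false", (char *)NULL);  /* exits with status 1 */
            _exit(127);                              /* exec itself failed */
        }

        int status;
        if (waitpid(pid, &status, 0) < 0) { perror("waitpid"); return 1; }

        if (WIFEXITED(status)) {
            printf("daemon exited unexpectedly with status %d\n",
                   WEXITSTATUS(status));
        } else if (WIFSIGNALED(status)) {
    #ifdef WCOREDUMP
            if (WCOREDUMP(status)) printf("daemon dumped core\n");
    #endif
            printf("daemon killed by signal %d\n", WTERMSIG(status));
        } else {
            printf("no extra status information is available: %d\n", status);
        }
        return 0;
    }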
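Similarly, the child branch of the fork performs the usual pre-exec hygiene before running the rsh/ssh agent: stdin is redirected to /dev/null unless debugging, every descriptor above stderr is closed, and the working directory is reset to $HOME so that $HOME-relative paths behave the same on local and remote nodes. A condensed sketch under those assumptions:

    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* pre-exec setup for the rsh/ssh child, condensed from the diff above */
    static void child_setup(int debug)
    {
        if (!debug) {
            int fd = open("/dev/null", O_RDWR);  /* detach stdin from the tty */
            if (fd >= 0) {
                dup2(fd, 0);
                close(fd);
            }
        }

        /* close all file descriptors except stdin/stdout/stderr */
        long fdmax = sysconf(_SC_OPEN_MAX);
        for (long fd = 3; fd < fdmax; fd++) {
            close(fd);
        }

        /* chdir($HOME); errors are deliberately ignored, as in the diff */
        const char *home = getenv("HOME");
        if (NULL != home) {
            (void)chdir(home);
        }
    }

    int main(void)
    {
        child_setup(0);
        execlp("true", "true", (char *)NULL);  /* stand-in for the real exec */
        _exit(127);
    }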