1
1

Fully restore fault recovery, both at the individual process and daemon level.

NOTE: MPI fault recovery remains unavailable pending merge from Josh. This only covers ORTE-level processes.

This commit was SVN r23335.
Этот коммит содержится в:
Ralph Castain 2010-07-01 19:45:43 +00:00
родитель 7190415977
Коммит f3d90dfb8d
5 изменённых файлов: 445 добавлений и 266 удалений

Просмотреть файл

@ -38,6 +38,7 @@
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h"
#include "orte/tools/orterun/debuggers.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
@ -55,8 +56,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
orte_proc_state_t state, orte_exit_code_t exit_code);
/*
* Module functions: Global
@ -308,17 +312,22 @@ static int update_state(orte_jobid_t job,
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS;
}
/* reset the child's state as restart_proc would
* have cleared it
*/
child->state = state;
ORTE_ERROR_LOG(rc);
/* let it fall thru to abort */
} else {
/* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
return ORTE_SUCCESS;
}
/* let it fall thru to abort */
}
} else {
/* this is a remote process - see if we can relocate it */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
return ORTE_SUCCESS;
}
/* guess not - let it fall thru to abort */
@ -378,10 +387,26 @@ static int update_state(orte_jobid_t job,
break;
case ORTE_PROC_STATE_COMM_FAILED:
/* delete the route */
orte_routed.delete_route(proc);
/* purge the oob */
orte_rml.purge(proc);
/* is this to a daemon? */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
/* if we have ordered orteds to terminate, ignore this */
/* if we have ordered orteds to terminate, see if this one failed to tell
* us it had terminated
*/
if (orte_orteds_term_ordered) {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
check_job_complete(jdata);
break;
}
/* if abort is in progress, see if this one failed to tell
* us it had terminated
*/
if (orte_abnormal_term_ordered) {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
check_job_complete(jdata);
break;
}
/* if this is my own connection, ignore it */
@ -390,33 +415,37 @@ static int update_state(orte_jobid_t job,
}
if (orte_enable_recovery) {
/* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) {
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
/* unable to relocate for some reason */
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
} else {
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else {
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid),
(NULL == pdat->node) ? "Unknown" :
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
}
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
/* remove this proc from the daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
} else {
/* delete the route */
orte_routed.delete_route(proc);
}
break;
@ -425,6 +454,7 @@ static int update_state(orte_jobid_t job,
if (orte_enable_recovery) {
/* relocate its processes */
} else {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
@ -482,18 +512,26 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code));
orte_job_term_ordered = true;
/* if debuggers are running, clean up */
orte_debugger_finalize();
/* tell the plm to terminate all jobs */
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
ORTE_ERROR_LOG(rc);
}
/* set control params to indicate we are terminating */
orte_job_term_ordered = true;
orte_abnormal_term_ordered = true;
orte_enable_recovery = false;
/* set the exit status, just in case whomever called us failed
* to do so - it can only be done once, so we are protected
* from overwriting it
*/
ORTE_UPDATE_EXIT_STATUS(exit_code);
/* tell the plm to terminate the orteds - they will automatically
* kill their local procs
*/
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
ORTE_ERROR_LOG(rc);
}
}
static void failed_start(orte_job_t *jdata)
@ -1088,12 +1126,13 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
OBJ_DESTRUCT(&proc);
}
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code)
{
orte_proc_t *pdata, *pdt;
orte_node_t *node;
orte_job_t *jdat;
orte_proc_t *pdata, *pdt, *pdt2;
orte_node_t *node, *nd;
orte_app_context_t *app;
orte_job_map_t *map;
char *app_name;
int rc, i, n;
@ -1103,64 +1142,114 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
return ORTE_ERR_NOT_FOUND;
}
/* track that we are attempting to relocate */
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
if (app->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
}
/* set the state */
pdata->state = state;
/* retain the node id */
node = pdata->node;
/* if it is a daemon that died, we need to flag all of its procs
* to be relocated
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
map = jdata->map;
/* remove this proc from the daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check to see if any other nodes are "alive" */
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
return ORTE_ERR_FATAL;
}
app_name = "orted";
for (n=0; n < map->nodes->size; n++) {
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
continue;
}
if (node->daemon->name.vpid != proc->vpid) {
continue;
}
/* found the node - now flag the procs */
/* scan the procs looking for each unique jobid on the node */
for (i=0; i < node->procs->size; i++) {
if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (ORTE_PROC_STATE_TERMINATED < pdt->state) {
/* get the job data object for this process */
if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) {
/* major problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
continue;
}
/* if the proc hasn't already terminated, then mark
* it as aborted so it will be restarted
*/
pdt->state = ORTE_PROC_STATE_ABORTED;
}
/* mark the node as "down" */
node->state = ORTE_NODE_STATE_DOWN;
/* remove it from the map */
opal_pointer_array_set_item(map->nodes, n, NULL);
/* do a release to maintain accounting - won't actually
* remove the node object from memory
/* since the node was used in this job's map, release
* it so that accounting is maintained
*/
OBJ_RELEASE(node);
/* mark this proc as dead so it will be restarted */
pdt->state = ORTE_PROC_STATE_ABORTED;
/* remove this proc from the node */
OBJ_RELEASE(pdt); /* maintains accounting */
opal_pointer_array_set_item(node->procs, i, NULL);
/* maintain accounting on num procs alive in case this can't restart */
jdat->num_terminated++;
/* look for all other procs on this node from the same job */
for (n=0; n < node->procs->size; n++) {
if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
continue;
}
if (pdt2->name.jobid == pdt->name.jobid) {
/* mark this proc as having aborted */
pdt2->state = ORTE_PROC_STATE_ABORTED;
/* remove it from the node */
OBJ_RELEASE(pdt2);
opal_pointer_array_set_item(node->procs, n, NULL);
/* maintain accounting on num procs alive */
jdat->num_terminated++;
}
}
/* and remove the node from the map */
for (n=0; n < jdat->map->nodes->size; n++) {
if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) {
continue;
}
if (nd->index == node->index) {
opal_pointer_array_set_item(jdat->map->nodes, n, NULL);
OBJ_RELEASE(node); /* maintain accounting */
break;
}
} else {
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
app_name = app->app;
}
/* reset the job params for this job */
orte_plm_base_reset_job(jdat);
/* relaunch the job */
opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name);
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) {
opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc));
return rc;
}
}
return ORTE_SUCCESS;
}
/* otherwise, we are an app - try to relocate us to another node */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
if (NULL == app) {
/* no way to restart this job */
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true,
ORTE_NAME_PRINT(proc));
return ORTE_ERR_NOT_FOUND;
}
app_name = app->app;
/* track that we are attempting to relocate */
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
if (app->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
}
/* reset the job params for restart */
orte_plm_base_reset_job(jdata);
/* flag the current node as not-to-be-used */
pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE;
/* restart the job - the spawn function will remap and
* launch the replacement proc(s)
*/
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
"%s RESTARTING APP: %s",
"%s RELOCATING APP %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
@ -1189,3 +1278,45 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
}
return NULL;
}
/*
 * Record that the daemon stored at index "vpid" of jdat->procs has died.
 *
 * If that entry exists and is not already in ORTE_PROC_STATE_TERMINATED:
 *   - store the supplied state and exit code on the daemon's proc object
 *     and pass the exit code to ORTE_UPDATE_EXIT_STATUS
 *   - remove the daemon from jdat->procs and decrement both
 *     orte_process_info.num_procs and jdat->num_procs
 *   - mark the daemon's node as ORTE_NODE_STATE_DOWN and clear its
 *     daemon pointer
 *   - mark every proc found on that node as ORTE_PROC_STATE_ABORTED and
 *     bump num_terminated on the job that owns it
 *
 * Nothing is done when the entry is missing or already terminated.
 *
 * jdat       job object whose procs array holds the daemons
 * vpid       index of the failed daemon within jdat->procs
 * state      state to record for the failed daemon
 * exit_code  exit code to record for the failed daemon
 */
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
                               orte_proc_state_t state, orte_exit_code_t exit_code)
{
    orte_job_t *jdt;
    orte_proc_t *pdat;
    orte_node_t *node;
    int i;

    if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) &&
        ORTE_PROC_STATE_TERMINATED != pdat->state) {
        /* need to record that this one died */
        pdat->state = state;
        pdat->exit_code = exit_code;
        ORTE_UPDATE_EXIT_STATUS(exit_code);

        /* remove it from the job array */
        opal_pointer_array_set_item(jdat->procs, vpid, NULL);
        orte_process_info.num_procs--;
        jdat->num_procs--;

        /* grab the node before dropping our reference to the proc - the
         * proc pointer must not be used after the release
         */
        node = pdat->node;
        if (NULL != node) {
            /* mark the node as down so it won't be used in mapping
             * procs to be relaunched
             */
            node->state = ORTE_NODE_STATE_DOWN;
            node->daemon = NULL;
        }
        OBJ_RELEASE(pdat);  /* maintain accounting */

        /* a daemon proc is not guaranteed to have a node attached (other
         * code in this file checks for that), so there may be nothing
         * further to update
         */
        if (NULL == node) {
            return;
        }

        /* mark all procs on this node as having terminated */
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            /* get the job data object for this process */
            if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
                /* major problem */
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                continue;
            }
            pdat->state = ORTE_PROC_STATE_ABORTED;
            jdt->num_terminated++;
        }
    }
}

Просмотреть файл

@ -31,3 +31,12 @@ check with your system administrator to try and determine the
source of the problem.
Your job is being terminated as a result.
#
[errmgr-hnp:cannot-relocate]
The system is unable to relocate the specified process:
Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.

Просмотреть файл

@ -246,7 +246,9 @@ static int update_state(orte_jobid_t job,
}
/* delete the route */
orte_routed.delete_route(proc);
/* see is this was a lifeline */
/* purge the oob */
orte_rml.purge(proc);
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
@ -322,16 +324,18 @@ static int update_state(orte_jobid_t job,
child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */
app = jobdat->apps[child->app_idx];
if (child->restarts == app->max_local_restarts ) {
goto REPORT_ABORT;
}
/* otherwise, attempt to restart it locally */
if (child->restarts < app->max_local_restarts ) {
/* attempt to restart it locally */
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), child->restarts));
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
/* reset the child's state as restart_proc would
* have cleared it
*/
child->state = state;
ORTE_ERROR_LOG(rc);
goto REPORT_ABORT;
}
@ -339,6 +343,7 @@ static int update_state(orte_jobid_t job,
}
}
}
}
REPORT_ABORT:
/* if the job hasn't completed and the state is abnormally

Просмотреть файл

@ -1188,12 +1188,27 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
{
char *vpid_str, *param, *value;
char *param, *value;
orte_node_rank_t node_rank;
orte_local_rank_t local_rank;
int rc;
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
/* setup the jobid */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
return rc;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
/* setup the vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
@ -1202,7 +1217,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
rc = ORTE_ERR_OUT_OF_RESOURCE;
return rc;
}
opal_setenv(param, vpid_str, true, env);
opal_setenv(param, value, true, env);
free(param);
/* although the vpid IS the process' rank within the job, users
@ -1213,8 +1228,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it
*/
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env);
free(vpid_str); /* done with this now */
opal_setenv("OMPI_COMM_WORLD_RANK", value, true, env);
free(value); /* done with this now */
/* users would appreciate being given a public environmental variable
* that also represents the local rank value - something MPI specific - so
@ -1291,6 +1306,78 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
return ORTE_SUCCESS;
}
/*
 * Move to the app's working directory and verify that its executable
 * can be found.
 *
 * Steps:
 *   1. orte_util_check_context_cwd() changes to the app's working
 *      directory (it reports its own errors).
 *   2. PWD in app->env is set to match the directory we are now in.
 *   3. If the app's environment carries OMPI_exec_path (i.e. --path was
 *      used), a temporary environment whose PATH is
 *      "<OMPI_exec_path>[:<PATH>]" is built and used for the executable
 *      search; otherwise app->env itself is used.
 *   4. orte_util_check_context_app() checks the executable against that
 *      environment (it reports its own errors).
 *
 * NOTE: this function leaves the process in the app's working directory;
 * the callers in this file restore their own base directory afterwards.
 *
 * Returns ORTE_SUCCESS, the error returned by either check function, or
 * ORTE_ERR_OUT_OF_RESOURCE if the temporary PATH string cannot be built.
 */
static int setup_path(orte_app_context_t *app)
{
    int rc;
    int len;
    char dir[MAXPATHLEN];
    char **argvptr;
    char *pathenv = NULL, *mpiexec_pathenv = NULL;
    char *full_search = NULL;

    /* Try to change to the app's cwd and check that the app
     * exists and is executable. The function will take care of
     * outputting a pretty error message, if required
     */
    if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
        /* do not ERROR_LOG - it will be reported elsewhere */
        goto CLEANUP;
    }

    /* The prior function will have done a chdir() to jump us to
     * wherever the app is to be executed. This could be either where
     * the user specified (via -wdir), or to the user's home directory
     * on this node if nothing was provided. It seems that chdir doesn't
     * adjust the $PWD enviro variable when it changes the directory. This
     * can cause a user to get a different response when doing getcwd vs
     * looking at the enviro variable. To keep this consistent, we explicitly
     * ensure that the PWD enviro variable matches the CWD we moved to.
     *
     * NOTE: if a user's program does a chdir(), then $PWD will once
     * again not match getcwd! This is beyond our control - we are only
     * ensuring they start out matching.
     *
     * If getcwd() fails the buffer contents are undefined, so leave
     * PWD untouched rather than exporting garbage.
     */
    if (NULL != getcwd(dir, sizeof(dir))) {
        opal_setenv("PWD", dir, true, &app->env);
    }

    /* Search for the OMPI_exec_path and PATH settings in the environment.
     * The environment array may not exist if PWD could not be set above.
     */
    for (argvptr = app->env; NULL != argvptr && NULL != *argvptr; argvptr++) {
        if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
            mpiexec_pathenv = *argvptr + 15;
        }
        if (0 == strncmp("PATH=", *argvptr, 5)) {
            pathenv = *argvptr + 5;
        }
    }

    /* If OMPI_exec_path is set (meaning --path was used), then create a
     * temporary environment to be used in the search for the executable.
     * The PATH setting in this temporary environment is a combination of
     * the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
     * then just use existing environment with PATH in it.
     */
    if (NULL != mpiexec_pathenv) {
        argvptr = NULL;
        if (NULL != pathenv) {
            len = asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
        } else {
            len = asprintf(&full_search, "%s", mpiexec_pathenv);
        }
        if (len < 0) {
            /* on failure the contents of full_search are undefined, so
             * it must be neither used nor freed
             */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto CLEANUP;
        }
        opal_setenv("PATH", full_search, true, &argvptr);
        free(full_search);
    } else {
        argvptr = app->env;
    }

    rc = orte_util_check_context_app(app, argvptr);
    /* do not ERROR_LOG - it will be reported elsewhere */

    /* release the temporary environment, if we created one */
    if (NULL != mpiexec_pathenv) {
        opal_argv_free(argvptr);
    }

CLEANUP:
    return rc;
}
/* define a timer release point so that we can wait for
* file descriptors to come available, if necessary
*/
@ -1311,7 +1398,6 @@ static void timer_cb(int fd, short event, void *cbdata)
int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_odls_base_fork_local_proc_fn_t fork_local)
{
char *job_str, *param;
opal_list_item_t *item;
orte_app_context_t *app, **apps;
orte_app_idx_t i, num_apps;
@ -1323,11 +1409,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_buffer_t alert;
orte_std_cntr_t proc_rank;
orte_odls_job_t *jobdat;
char *pathenv = NULL, *mpiexec_pathenv = NULL;
char basedir[MAXPATHLEN];
char dir[MAXPATHLEN];
char **argvptr;
char *full_search;
char **argvsav=NULL;
int inm;
opal_event_t *delay;
@ -1518,75 +1600,21 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
goto CLEANUP;
}
/* Try to change to the app's cwd and check that the app
exists and is executable The function will
take care of outputting a pretty error message, if required
/* setup the working directory for this app - will jump us
* to that directory
*/
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
/* do not ERROR_LOG - it will be reported elsewhere */
/* cycle through children to find those for this jobid */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) &&
i == child->app_idx) {
child->exit_code = rc;
}
}
goto CLEANUP;
}
/* The prior function will have done a chdir() to jump us to
* wherever the app is to be executed. This could be either where
* the user specified (via -wdir), or to the user's home directory
* on this node if nothing was provided. It seems that chdir doesn't
* adjust the $PWD enviro variable when it changes the directory. This
* can cause a user to get a different response when doing getcwd vs
* looking at the enviro variable. To keep this consistent, we explicitly
* ensure that the PWD enviro variable matches the CWD we moved to.
*
* NOTE: if a user's program does a chdir(), then $PWD will once
* again not match getcwd! This is beyond our control - we are only
* ensuring they start out matching.
if (ORTE_SUCCESS != (rc = setup_path(app))) {
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch:setup_path failed with error %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_ERROR_NAME(rc)));
/* do not ERROR_LOG this failure - it will be reported
* elsewhere. The launch is going to fail. Since we could have
* multiple app_contexts, we need to ensure that we flag only
* the correct one that caused this operation to fail. We then have
* to flag all the other procs from the app_context as having "not failed"
* so we can report things out correctly
*/
getcwd(dir, sizeof(dir));
opal_setenv("PWD", dir, true, &app->env);
/* Search for the OMPI_exec_path and PATH settings in the environment. */
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
mpiexec_pathenv = *argvptr + 15;
}
if (0 == strncmp("PATH=", *argvptr, 5)) {
pathenv = *argvptr + 5;
}
}
/* If OMPI_exec_path is set (meaning --path was used), then create a
temporary environment to be used in the search for the executable.
The PATH setting in this temporary environment is a combination of
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
then just use existing environment with PATH in it. */
if (NULL != mpiexec_pathenv) {
argvptr = NULL;
if (pathenv != NULL) {
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
} else {
asprintf(&full_search, "%s", mpiexec_pathenv);
}
opal_setenv("PATH", full_search, true, &argvptr);
free(full_search);
} else {
argvptr = app->env;
}
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(app, argvptr))) {
/* do not ERROR_LOG - it will be reported elsewhere */
if (NULL != mpiexec_pathenv) {
opal_argv_free(argvptr);
}
/* cycle through children to find those for this jobid */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
@ -1600,9 +1628,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
/* okay, now tell the HNP we couldn't do it */
goto CLEANUP;
}
if (NULL != mpiexec_pathenv) {
opal_argv_free(argvptr);
}
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
for (proc_rank = 0, item = opal_list_get_first(&orte_local_children);
@ -1765,19 +1790,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
/* setup the rest of the environment with the proc-specific items - these
* will be overwritten for each child
*/
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto CLEANUP;
}
opal_setenv(param, job_str, true, &app->env);
free(param);
free(job_str);
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
@ -1871,6 +1883,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
launch_failed = false;
CLEANUP:
/* ensure we reset our working directory back to our default location */
chdir(basedir);
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch reporting job %s launch status",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -2947,6 +2962,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
orte_app_context_t *app;
opal_list_item_t *item;
orte_odls_job_t *jobdat;
char basedir[MAXPATHLEN];
/* protect operations involving the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -2956,6 +2972,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
/* establish our baseline working directory - we will be potentially
* bouncing around as we execute this app, but we will always return
* to this place as our default directory
*/
getcwd(basedir, sizeof(basedir));
/* find this child's jobdat */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
@ -2985,19 +3007,36 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
child->rml_uri = NULL;
}
app = jobdat->apps[child->app_idx];
/* reset envars to match this child */
/* reset envars to match this child */
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
ORTE_ERROR_LOG(rc);
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
goto CLEANUP;
}
opal_output(0, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);
/* setup the path */
if (ORTE_SUCCESS != (rc = setup_path(app))) {
ORTE_ERROR_LOG(rc);
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
goto CLEANUP;
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s restarting app %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
/* must unlock prior to fork to keep things clean in the
* event library
*/
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
rc = fork_local(app, child, app->env, jobdat);
if (ORTE_SUCCESS == rc) {
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
}
CLEANUP:
@ -3007,7 +3046,14 @@ CLEANUP:
ORTE_NAME_PRINT(child->name),
(ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
/* reset our working directory back to our default location - if we
* don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* be requiring that the user keep track on the cmd line of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
return rc;
}

Просмотреть файл

@ -5,36 +5,24 @@
* A program that just spins - provides mechanism for testing user-driven
* abnormal program termination
*/
#include "opal_config.h"
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include "opal/runtime/opal_progress.h"
#include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h"
int main(int argc, char* argv[])
{
int i;
double pi;
orte_init(&argc, &argv, ORTE_PROC_NON_MPI);
i = 0;
while (1) {
i++;
pi = i / 3.14159256;
if (i > 100) {
/* need to progress so we can
* wake up if our daemon goes
* away!
*/
opal_progress();
/* reset the counter so we loop */
i = 0;
}
if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) {
fprintf(stderr, "ORTE_INIT FAILED\n");
exit(1);
}
opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_event_dispatch();
orte_finalize();