1
1

Fully restore fault recovery at both the individual-process and daemon levels.

NOTE: MPI fault recovery remains unavailable pending merge from Josh. This only covers ORTE-level processes.

This commit was SVN r23335.
Этот коммит содержится в:
Ralph Castain 2010-07-01 19:45:43 +00:00
родитель 7190415977
Коммит f3d90dfb8d
5 изменённых файлов: 445 добавлений и 266 удалений

Просмотреть файл

@ -38,6 +38,7 @@
#include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/sensor/sensor.h" #include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h" #include "orte/mca/routed/routed.h"
#include "orte/tools/orterun/debuggers.h"
#include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/base.h"
@ -55,8 +56,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
orte_exit_code_t exit_code); orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata); static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid); static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc); static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
orte_proc_state_t state, orte_exit_code_t exit_code);
/* /*
* Module functions: Global * Module functions: Global
@ -308,17 +312,22 @@ static int update_state(orte_jobid_t job,
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* reset the child's state as restart_proc would
* have cleared it
*/
child->state = state;
ORTE_ERROR_LOG(rc);
/* let it fall thru to abort */ /* let it fall thru to abort */
} else { } else {
/* see if we can relocate it somewhere else */ /* see if we can relocate it somewhere else */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) { if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* let it fall thru to abort */ /* let it fall thru to abort */
} }
} else { } else {
/* this is a remote process - see if we can relocate it */ /* this is a remote process - see if we can relocate it */
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) { if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/* guess not - let it fall thru to abort */ /* guess not - let it fall thru to abort */
@ -378,10 +387,26 @@ static int update_state(orte_jobid_t job,
break; break;
case ORTE_PROC_STATE_COMM_FAILED: case ORTE_PROC_STATE_COMM_FAILED:
/* delete the route */
orte_routed.delete_route(proc);
/* purge the oob */
orte_rml.purge(proc);
/* is this to a daemon? */ /* is this to a daemon? */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
/* if we have ordered orteds to terminate, ignore this */ /* if we have ordered orteds to terminate, see if this one failed to tell
* us it had terminated
*/
if (orte_orteds_term_ordered) { if (orte_orteds_term_ordered) {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
check_job_complete(jdata);
break;
}
/* if abort is in progress, see if this one failed to tell
* us it had terminated
*/
if (orte_abnormal_term_ordered) {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
check_job_complete(jdata);
break; break;
} }
/* if this is my own connection, ignore it */ /* if this is my own connection, ignore it */
@ -390,33 +415,37 @@ static int update_state(orte_jobid_t job,
} }
if (orte_enable_recovery) { if (orte_enable_recovery) {
/* relocate its processes */ /* relocate its processes */
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) { if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
/* unable to relocate for some reason */
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */ /* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
} }
} else { } else {
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), "Unknown"); ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else { } else {
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), ORTE_VPID_PRINT(proc->vpid),
(NULL == pdat->node) ? "Unknown" : (NULL == pdat->node) ? "Unknown" :
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name)); ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
} }
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); /* remove this proc from the daemon job */
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE); record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */ /* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
} }
} else {
/* delete the route */
orte_routed.delete_route(proc);
} }
break; break;
@ -425,6 +454,7 @@ static int update_state(orte_jobid_t job,
if (orte_enable_recovery) { if (orte_enable_recovery) {
/* relocate its processes */ /* relocate its processes */
} else { } else {
record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */ /* kill all jobs */
@ -482,18 +512,26 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(job), exit_code)); ORTE_JOBID_PRINT(job), exit_code));
orte_job_term_ordered = true; /* if debuggers are running, clean up */
orte_debugger_finalize();
/* tell the plm to terminate all jobs */ /* set control params to indicate we are terminating */
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { orte_job_term_ordered = true;
ORTE_ERROR_LOG(rc); orte_abnormal_term_ordered = true;
} orte_enable_recovery = false;
/* set the exit status, just in case whomever called us failed /* set the exit status, just in case whomever called us failed
* to do so - it can only be done once, so we are protected * to do so - it can only be done once, so we are protected
* from overwriting it * from overwriting it
*/ */
ORTE_UPDATE_EXIT_STATUS(exit_code); ORTE_UPDATE_EXIT_STATUS(exit_code);
/* tell the plm to terminate the orteds - they will automatically
* kill their local procs
*/
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
ORTE_ERROR_LOG(rc);
}
} }
static void failed_start(orte_job_t *jdata) static void failed_start(orte_job_t *jdata)
@ -1088,12 +1126,13 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
OBJ_DESTRUCT(&proc); OBJ_DESTRUCT(&proc);
} }
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code)
{ {
orte_proc_t *pdata, *pdt; orte_job_t *jdat;
orte_node_t *node; orte_proc_t *pdata, *pdt, *pdt2;
orte_node_t *node, *nd;
orte_app_context_t *app; orte_app_context_t *app;
orte_job_map_t *map;
char *app_name; char *app_name;
int rc, i, n; int rc, i, n;
@ -1103,64 +1142,114 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
return ORTE_ERR_NOT_FOUND; return ORTE_ERR_NOT_FOUND;
} }
/* track that we are attempting to relocate */
pdata->relocates++; /* set the state */
/* have we exceeded the number of relocates for this proc? */ pdata->state = state;
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
if (app->max_global_restarts < pdata->relocates) { /* retain the node id */
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; node = pdata->node;
}
/* if it is a daemon that died, we need to flag all of its procs /* if it is a daemon that died, we need to flag all of its procs
* to be relocated * to be relocated
*/ */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
map = jdata->map; /* remove this proc from the daemon job */
record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check to see if any other nodes are "alive" */
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
return ORTE_ERR_FATAL;
}
app_name = "orted"; app_name = "orted";
for (n=0; n < map->nodes->size; n++) { /* scan the procs looking for each unique jobid on the node */
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { for (i=0; i < node->procs->size; i++) {
if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue; continue;
} }
if (node->daemon->name.vpid != proc->vpid) { /* get the job data object for this process */
if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) {
/* major problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
continue; continue;
} }
/* found the node - now flag the procs */ /* since the node was used in this job's map, release
for (i=0; i < node->procs->size; i++) { * it so that accounting is maintained
if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
if (ORTE_PROC_STATE_TERMINATED < pdt->state) {
continue;
}
/* if the proc hasn't already terminated, then mark
* it as aborted so it will be restarted
*/
pdt->state = ORTE_PROC_STATE_ABORTED;
}
/* mark the node as "down" */
node->state = ORTE_NODE_STATE_DOWN;
/* remove it from the map */
opal_pointer_array_set_item(map->nodes, n, NULL);
/* do a release to maintain accounting - won't actually
* remove the node object from memory
*/ */
OBJ_RELEASE(node); OBJ_RELEASE(node);
break; /* mark this proc as dead so it will be restarted */
pdt->state = ORTE_PROC_STATE_ABORTED;
/* remove this proc from the node */
OBJ_RELEASE(pdt); /* maintains accounting */
opal_pointer_array_set_item(node->procs, i, NULL);
/* maintain accounting on num procs alive in case this can't restart */
jdat->num_terminated++;
/* look for all other procs on this node from the same job */
for (n=0; n < node->procs->size; n++) {
if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
continue;
}
if (pdt2->name.jobid == pdt->name.jobid) {
/* mark this proc as having aborted */
pdt2->state = ORTE_PROC_STATE_ABORTED;
/* remove it from the node */
OBJ_RELEASE(pdt2);
opal_pointer_array_set_item(node->procs, n, NULL);
/* maintain accounting on num procs alive */
jdat->num_terminated++;
}
}
/* and remove the node from the map */
for (n=0; n < jdat->map->nodes->size; n++) {
if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) {
continue;
}
if (nd->index == node->index) {
opal_pointer_array_set_item(jdat->map->nodes, n, NULL);
OBJ_RELEASE(node); /* maintain accounting */
break;
}
}
/* reset the job params for this job */
orte_plm_base_reset_job(jdat);
/* relaunch the job */
opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name);
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) {
opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc));
return rc;
}
} }
} else {
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); return ORTE_SUCCESS;
app_name = app->app;
} }
/* otherwise, we are an app - try to relocate us to another node */
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
if (NULL == app) {
/* no way to restart this job */
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true,
ORTE_NAME_PRINT(proc));
return ORTE_ERR_NOT_FOUND;
}
app_name = app->app;
/* track that we are attempting to relocate */
pdata->relocates++;
/* have we exceeded the number of relocates for this proc? */
if (app->max_global_restarts < pdata->relocates) {
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
}
/* reset the job params for restart */ /* reset the job params for restart */
orte_plm_base_reset_job(jdata); orte_plm_base_reset_job(jdata);
/* flag the current node as not-to-be-used */
pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE;
/* restart the job - the spawn function will remap and /* restart the job - the spawn function will remap and
* launch the replacement proc(s) * launch the replacement proc(s)
*/ */
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
"%s RESTARTING APP: %s", "%s RELOCATING APP %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc))); ORTE_NAME_PRINT(proc)));
@ -1189,3 +1278,45 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
} }
return NULL; return NULL;
} }
/*
 * Record that the daemon with the given vpid has died.
 *
 * jdat      - job object whose procs array is indexed by vpid
 *             (callers in this file pass the daemon job)
 * vpid      - vpid of the daemon that died
 * state     - state to store on the daemon's proc object
 * exit_code - exit code to store on the proc object and to feed to
 *             ORTE_UPDATE_EXIT_STATUS
 *
 * Does nothing if no proc object is stored at that vpid, or if the
 * proc is already in ORTE_PROC_STATE_TERMINATED. Otherwise the proc is
 * removed from the job's procs array, the proc counters are decremented,
 * its node (if any) is marked down, and every proc still listed on that
 * node is marked as aborted.
 */
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
                               orte_proc_state_t state, orte_exit_code_t exit_code)
{
    orte_job_t *jdt;
    orte_proc_t *pdat;
    orte_proc_t *nproc;
    orte_node_t *node;
    int i;

    pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid);
    if (NULL == pdat || ORTE_PROC_STATE_TERMINATED == pdat->state) {
        /* unknown daemon, or its termination was already recorded */
        return;
    }

    /* need to record that this one died */
    pdat->state = state;
    pdat->exit_code = exit_code;
    ORTE_UPDATE_EXIT_STATUS(exit_code);

    /* remove it from the job array */
    opal_pointer_array_set_item(jdat->procs, vpid, NULL);
    orte_process_info.num_procs--;
    jdat->num_procs--;

    /* mark the node as down so it won't be used in mapping
     * procs to be relaunched - the proc may not have a node
     * attached (other code in this file checks for that), so
     * guard against a NULL pointer here
     */
    node = pdat->node;
    if (NULL != node) {
        node->state = ORTE_NODE_STATE_DOWN;
        node->daemon = NULL;
    }
    OBJ_RELEASE(pdat);  /* maintain accounting */

    if (NULL == node) {
        /* no node known for this daemon, so there are no procs to flag */
        return;
    }

    /* mark all procs on this node as having terminated */
    for (i=0; i < node->procs->size; i++) {
        if (NULL == (nproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
            continue;
        }
        /* get the job data object for this process */
        if (NULL == (jdt = orte_get_job_data_object(nproc->name.jobid))) {
            /* major problem */
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            continue;
        }
        nproc->state = ORTE_PROC_STATE_ABORTED;
        jdt->num_terminated++;
    }
}

Просмотреть файл

@ -31,3 +31,12 @@ check with your system administrator to try and determine the
source of the problem. source of the problem.
Your job is being terminated as a result. Your job is being terminated as a result.
#
[errmgr-hnp:cannot-relocate]
The system is unable to relocate the specified process:
Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.

Просмотреть файл

@ -188,31 +188,31 @@ static int update_state(orte_jobid_t job,
} }
switch (jobstate) { switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START: case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code); failed_start(jobdat, exit_code);
break; break;
case ORTE_JOB_STATE_RUNNING: case ORTE_JOB_STATE_RUNNING:
/* update all local child states */ /* update all local child states */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING);
break; break;
case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED:
/* update all procs in job */ /* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */ /* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED: case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */ /* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */ /* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE; return ORTE_ERR_UNRECOVERABLE;
break; break;
case ORTE_JOB_STATE_HEARTBEAT_FAILED: case ORTE_JOB_STATE_HEARTBEAT_FAILED:
/* let the HNP handle this */ /* let the HNP handle this */
return ORTE_SUCCESS; return ORTE_SUCCESS;
break; break;
default: default:
break; break;
} }
OBJ_CONSTRUCT(&alert, opal_buffer_t); OBJ_CONSTRUCT(&alert, opal_buffer_t);
/* pack update state command */ /* pack update state command */
@ -246,7 +246,9 @@ static int update_state(orte_jobid_t job,
} }
/* delete the route */ /* delete the route */
orte_routed.delete_route(proc); orte_routed.delete_route(proc);
/* see is this was a lifeline */ /* purge the oob */
orte_rml.purge(proc);
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */ /* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
@ -313,7 +315,7 @@ static int update_state(orte_jobid_t job,
if (ORTE_PROC_STATE_TERMINATED < state) { if (ORTE_PROC_STATE_TERMINATED < state) {
if (jobdat->enable_recovery) { if (jobdat->enable_recovery) {
/* find this proc in the local children */ /* find this proc in the local children */
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) { item = opal_list_get_next(item)) {
@ -322,20 +324,23 @@ static int update_state(orte_jobid_t job,
child->name->vpid == proc->vpid) { child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */ /* see if this child has reached its local restart limit */
app = jobdat->apps[child->app_idx]; app = jobdat->apps[child->app_idx];
if (child->restarts == app->max_local_restarts ) { if (child->restarts < app->max_local_restarts ) {
goto REPORT_ABORT; /* attempt to restart it locally */
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), child->restarts));
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
/* reset the child's state as restart_proc would
* have cleared it
*/
child->state = state;
ORTE_ERROR_LOG(rc);
goto REPORT_ABORT;
}
return ORTE_SUCCESS;
} }
/* otherwise, attempt to restart it locally */
child->restarts++;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted restarting proc %s for the %d time",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), child->restarts));
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
ORTE_ERROR_LOG(rc);
goto REPORT_ABORT;
}
return ORTE_SUCCESS;
} }
} }
} }
@ -571,9 +576,9 @@ int ft_event(int state)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/***************** /*****************
* Local Functions * Local Functions
*****************/ *****************/
static bool any_live_children(orte_jobid_t job) static bool any_live_children(orte_jobid_t job)
{ {
opal_list_item_t *item; opal_list_item_t *item;
@ -640,53 +645,53 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{ {
int rc; int rc;
opal_list_item_t *item, *next; opal_list_item_t *item, *next;
orte_odls_child_t *child; orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID; orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */ /* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
tmp = jobdat->launch_msg_recvd.tv_usec; /* if we are timing things, pack the time the launch msg for this job was recvd */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { if (orte_timing) {
ORTE_ERROR_LOG(rc); int64_t tmp;
return rc; tmp = jobdat->launch_msg_recvd.tv_sec;
} if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
} ORTE_ERROR_LOG(rc);
for (item = opal_list_get_first(&orte_local_children); return rc;
item != opal_list_get_end(&orte_local_children); }
item = next) { tmp = jobdat->launch_msg_recvd.tv_usec;
child = (orte_odls_child_t*)item; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
} }
} for (item = opal_list_get_first(&orte_local_children);
/* flag that this job is complete so the receiver can know */ item != opal_list_get_end(&orte_local_children);
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { item = next) {
ORTE_ERROR_LOG(rc); child = (orte_odls_child_t*)item;
return rc; next = opal_list_get_next(item);
} /* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
static bool all_children_registered(orte_jobid_t job) static bool all_children_registered(orte_jobid_t job)
{ {

Просмотреть файл

@ -1188,12 +1188,27 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env) static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
{ {
char *vpid_str, *param, *value; char *param, *value;
orte_node_rank_t node_rank; orte_node_rank_t node_rank;
orte_local_rank_t local_rank; orte_local_rank_t local_rank;
int rc; int rc;
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) { /* setup the jobid */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) {
ORTE_ERROR_LOG(rc);
return rc;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
return rc;
}
opal_setenv(param, value, true, env);
free(param);
free(value);
/* setup the vpid */
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
return rc; return rc;
} }
@ -1202,7 +1217,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
rc = ORTE_ERR_OUT_OF_RESOURCE; rc = ORTE_ERR_OUT_OF_RESOURCE;
return rc; return rc;
} }
opal_setenv(param, vpid_str, true, env); opal_setenv(param, value, true, env);
free(param); free(param);
/* although the vpid IS the process' rank within the job, users /* although the vpid IS the process' rank within the job, users
@ -1213,8 +1228,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
* We know - just live with it * We know - just live with it
*/ */
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env); opal_setenv("OMPI_COMM_WORLD_RANK", value, true, env);
free(vpid_str); /* done with this now */ free(value); /* done with this now */
/* users would appreciate being given a public environmental variable /* users would appreciate being given a public environmental variable
* that also represents the local rank value - something MPI specific - so * that also represents the local rank value - something MPI specific - so
@ -1291,6 +1306,78 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/*
 * Move to the app's working directory and verify that its executable
 * can be found.
 *
 * app - the app context to prepare; its env array may be modified
 *       (PWD is set to the directory we end up in)
 *
 * Returns the status of orte_util_check_context_cwd() if that fails,
 * ORTE_ERR_OUT_OF_RESOURCE if the temporary search path cannot be
 * built, and otherwise the status of orte_util_check_context_app().
 *
 * NOTE: on return the process' current working directory is whatever
 * orte_util_check_context_cwd() left it as - this function does not
 * restore it.
 */
static int setup_path(orte_app_context_t *app)
{
    int rc;
    char dir[MAXPATHLEN];
    char **argvptr;
    char *pathenv = NULL, *mpiexec_pathenv = NULL;
    char *full_search = NULL;

    /* Try to change to the app's cwd and check that the app
       exists and is executable. The function will
       take care of outputting a pretty error message, if required
    */
    if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
        /* do not ERROR_LOG - it will be reported elsewhere */
        return rc;
    }

    /* The prior function will have done a chdir() to jump us to
     * wherever the app is to be executed. This could be either where
     * the user specified (via -wdir), or to the user's home directory
     * on this node if nothing was provided. It seems that chdir doesn't
     * adjust the $PWD enviro variable when it changes the directory. This
     * can cause a user to get a different response when doing getcwd vs
     * looking at the enviro variable. To keep this consistent, we explicitly
     * ensure that the PWD enviro variable matches the CWD we moved to.
     *
     * NOTE: if a user's program does a chdir(), then $PWD will once
     * again not match getcwd! This is beyond our control - we are only
     * ensuring they start out matching.
     *
     * If getcwd() fails the buffer contents are not usable, so leave
     * PWD untouched rather than exporting garbage.
     */
    if (NULL != getcwd(dir, sizeof(dir))) {
        opal_setenv("PWD", dir, true, &app->env);
    }

    /* Search for the OMPI_exec_path and PATH settings in the environment. */
    for (argvptr = app->env; NULL != argvptr && NULL != *argvptr; argvptr++) {
        if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
            mpiexec_pathenv = *argvptr + 15;
        }
        if (0 == strncmp("PATH=", *argvptr, 5)) {
            pathenv = *argvptr + 5;
        }
    }

    /* If OMPI_exec_path is set (meaning --path was used), then create a
       temporary environment to be used in the search for the executable.
       The PATH setting in this temporary environment is a combination of
       the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
       then just use existing environment with PATH in it. */
    if (NULL != mpiexec_pathenv) {
        int len;

        argvptr = NULL;
        if (pathenv != NULL) {
            len = asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
        } else {
            len = asprintf(&full_search, "%s", mpiexec_pathenv);
        }
        if (len < 0) {
            /* on failure the contents of full_search are undefined,
             * so it must be neither used nor freed
             */
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        opal_setenv("PATH", full_search, true, &argvptr);
        free(full_search);
    } else {
        argvptr = app->env;
    }

    rc = orte_util_check_context_app(app, argvptr);
    /* do not ERROR_LOG - it will be reported elsewhere */

    /* release the temporary environment, if we built one */
    if (NULL != mpiexec_pathenv) {
        opal_argv_free(argvptr);
    }

    return rc;
}
/* define a timer release point so that we can wait for /* define a timer release point so that we can wait for
* file descriptors to come available, if necessary * file descriptors to come available, if necessary
*/ */
@ -1311,7 +1398,6 @@ static void timer_cb(int fd, short event, void *cbdata)
int orte_odls_base_default_launch_local(orte_jobid_t job, int orte_odls_base_default_launch_local(orte_jobid_t job,
orte_odls_base_fork_local_proc_fn_t fork_local) orte_odls_base_fork_local_proc_fn_t fork_local)
{ {
char *job_str, *param;
opal_list_item_t *item; opal_list_item_t *item;
orte_app_context_t *app, **apps; orte_app_context_t *app, **apps;
orte_app_idx_t i, num_apps; orte_app_idx_t i, num_apps;
@ -1323,11 +1409,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
opal_buffer_t alert; opal_buffer_t alert;
orte_std_cntr_t proc_rank; orte_std_cntr_t proc_rank;
orte_odls_job_t *jobdat; orte_odls_job_t *jobdat;
char *pathenv = NULL, *mpiexec_pathenv = NULL;
char basedir[MAXPATHLEN]; char basedir[MAXPATHLEN];
char dir[MAXPATHLEN];
char **argvptr;
char *full_search;
char **argvsav=NULL; char **argvsav=NULL;
int inm; int inm;
opal_event_t *delay; opal_event_t *delay;
@ -1518,75 +1600,21 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
goto CLEANUP; goto CLEANUP;
} }
/* setup the working directory for this app - will jump us
/* Try to change to the app's cwd and check that the app * to that directory
exists and is executable The function will
take care of outputting a pretty error message, if required
*/ */
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) { if (ORTE_SUCCESS != (rc = setup_path(app))) {
/* do not ERROR_LOG - it will be reported elsewhere */ OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
/* cycle through children to find those for this jobid */ "%s odls:launch:setup_path failed with error %s",
for (item = opal_list_get_first(&orte_local_children); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
item != opal_list_get_end(&orte_local_children); ORTE_ERROR_NAME(rc)));
item = opal_list_get_next(item)) { /* do not ERROR_LOG this failure - it will be reported
child = (orte_odls_child_t*)item; * elsewhere. The launch is going to fail. Since we could have
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && * multiple app_contexts, we need to ensure that we flag only
i == child->app_idx) { * the correct one that caused this operation to fail. We then have
child->exit_code = rc; * to flag all the other procs from the app_context as having "not failed"
} * so we can report things out correctly
} */
goto CLEANUP;
}
/* The prior function will have done a chdir() to jump us to
* wherever the app is to be executed. This could be either where
* the user specified (via -wdir), or to the user's home directory
* on this node if nothing was provided. It seems that chdir doesn't
* adjust the $PWD enviro variable when it changes the directory. This
* can cause a user to get a different response when doing getcwd vs
* looking at the enviro variable. To keep this consistent, we explicitly
* ensure that the PWD enviro variable matches the CWD we moved to.
*
* NOTE: if a user's program does a chdir(), then $PWD will once
* again not match getcwd! This is beyond our control - we are only
* ensuring they start out matching.
*/
getcwd(dir, sizeof(dir));
opal_setenv("PWD", dir, true, &app->env);
/* Search for the OMPI_exec_path and PATH settings in the environment. */
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
mpiexec_pathenv = *argvptr + 15;
}
if (0 == strncmp("PATH=", *argvptr, 5)) {
pathenv = *argvptr + 5;
}
}
/* If OMPI_exec_path is set (meaning --path was used), then create a
temporary environment to be used in the search for the executable.
The PATH setting in this temporary environment is a combination of
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
then just use existing environment with PATH in it. */
if (NULL != mpiexec_pathenv) {
argvptr = NULL;
if (pathenv != NULL) {
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
} else {
asprintf(&full_search, "%s", mpiexec_pathenv);
}
opal_setenv("PATH", full_search, true, &argvptr);
free(full_search);
} else {
argvptr = app->env;
}
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(app, argvptr))) {
/* do not ERROR_LOG - it will be reported elsewhere */
if (NULL != mpiexec_pathenv) {
opal_argv_free(argvptr);
}
/* cycle through children to find those for this jobid */ /* cycle through children to find those for this jobid */
for (item = opal_list_get_first(&orte_local_children); for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children); item != opal_list_get_end(&orte_local_children);
@ -1600,9 +1628,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
/* okay, now tell the HNP we couldn't do it */ /* okay, now tell the HNP we couldn't do it */
goto CLEANUP; goto CLEANUP;
} }
if (NULL != mpiexec_pathenv) {
opal_argv_free(argvptr);
}
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */ /* okay, now let's launch all the local procs for this app using the provided fork_local fn */
for (proc_rank = 0, item = opal_list_get_first(&orte_local_children); for (proc_rank = 0, item = opal_list_get_first(&orte_local_children);
@ -1765,19 +1790,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
/* setup the rest of the environment with the proc-specific items - these /* setup the rest of the environment with the proc-specific items - these
* will be overwritten for each child * will be overwritten for each child
*/ */
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) {
ORTE_ERROR_LOG(rc);
goto CLEANUP;
}
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
rc = ORTE_ERR_OUT_OF_RESOURCE;
goto CLEANUP;
}
opal_setenv(param, job_str, true, &app->env);
free(param);
free(job_str);
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
goto CLEANUP; goto CLEANUP;
@ -1870,7 +1882,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
} }
launch_failed = false; launch_failed = false;
CLEANUP: CLEANUP:
/* ensure we reset our working directory back to our default location */
chdir(basedir);
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:launch reporting job %s launch status", "%s odls:launch reporting job %s launch status",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -2947,6 +2962,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
orte_app_context_t *app; orte_app_context_t *app;
opal_list_item_t *item; opal_list_item_t *item;
orte_odls_job_t *jobdat; orte_odls_job_t *jobdat;
char basedir[MAXPATHLEN];
/* protect operations involving the global list of children */ /* protect operations involving the global list of children */
OPAL_THREAD_LOCK(&orte_odls_globals.mutex); OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
@ -2956,6 +2972,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name))); ORTE_NAME_PRINT(child->name)));
/* establish our baseline working directory - we will be potentially
* bouncing around as we execute this app, but we will always return
* to this place as our default directory
*/
getcwd(basedir, sizeof(basedir));
/* find this child's jobdat */ /* find this child's jobdat */
jobdat = NULL; jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata); for (item = opal_list_get_first(&orte_local_jobdata);
@ -2985,29 +3007,53 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
child->rml_uri = NULL; child->rml_uri = NULL;
} }
app = jobdat->apps[child->app_idx]; app = jobdat->apps[child->app_idx];
/* reset envars to match this child */
/* reset envars to match this child */
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
ORTE_ERROR_LOG(rc); ORTE_ERROR_LOG(rc);
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
goto CLEANUP; goto CLEANUP;
} }
opal_output(0, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);
rc = fork_local(app, child, app->env, jobdat);
if (ORTE_SUCCESS == rc) {
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
/* setup the path */
if (ORTE_SUCCESS != (rc = setup_path(app))) {
ORTE_ERROR_LOG(rc);
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
goto CLEANUP;
} }
CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s restarting app %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
/* must unlock prior to fork to keep things clean in the
* event library
*/
opal_condition_signal(&orte_odls_globals.cond);
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
rc = fork_local(app, child, app->env, jobdat);
if (ORTE_SUCCESS == rc) {
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
}
CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:restart of proc %s %s", "%s odls:restart of proc %s %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name), ORTE_NAME_PRINT(child->name),
(ORTE_SUCCESS == rc) ? "succeeded" : "failed")); (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
opal_condition_signal(&orte_odls_globals.cond); /* reset our working directory back to our default location - if we
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); * don't do this, then we will be looking for relative paths starting
* from the last wdir option specified by the user. Thus, we would
* require the user to keep track, on the command line, of where
* each app was located relative to the prior app, instead of relative
* to their current location
*/
chdir(basedir);
return rc; return rc;
} }

Просмотреть файл

@ -5,36 +5,24 @@
* A program that just spins - provides mechanism for testing user-driven * A program that just spins - provides mechanism for testing user-driven
* abnormal program termination * abnormal program termination
*/ */
#include "opal_config.h" #include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h> #include <stdio.h>
#include "opal/runtime/opal_progress.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) {
int i; fprintf(stderr, "ORTE_INIT FAILED\n");
double pi; exit(1);
orte_init(&argc, &argv, ORTE_PROC_NON_MPI);
i = 0;
while (1) {
i++;
pi = i / 3.14159256;
if (i > 100) {
/* need to progress so we can
* wake up if our daemon goes
* away!
*/
opal_progress();
/* reset the counter so we loop */
i = 0;
}
} }
opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
opal_event_dispatch();
orte_finalize(); orte_finalize();