Fully restore fault recovery, both at the individual process and daemon level.
NOTE: MPI fault recovery remains unavailable pending merge from Josh. This only covers ORTE-level processes. This commit was SVN r23335.
Этот коммит содержится в:
родитель
7190415977
Коммит
f3d90dfb8d
@ -38,6 +38,7 @@
|
||||
#include "orte/mca/rmaps/rmaps_types.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/tools/orterun/debuggers.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
@ -55,8 +56,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_exit_code_t exit_code);
|
||||
static void check_job_complete(orte_job_t *jdata);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc);
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
|
||||
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
@ -308,17 +312,22 @@ static int update_state(orte_jobid_t job,
|
||||
if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* reset the child's state as restart_proc would
|
||||
* have cleared it
|
||||
*/
|
||||
child->state = state;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
/* let it fall thru to abort */
|
||||
} else {
|
||||
/* see if we can relocate it somewhere else */
|
||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* let it fall thru to abort */
|
||||
}
|
||||
} else {
|
||||
/* this is a remote process - see if we can relocate it */
|
||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) {
|
||||
if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* guess not - let it fall thru to abort */
|
||||
@ -378,10 +387,26 @@ static int update_state(orte_jobid_t job,
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
/* is this to a daemon? */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
/* if we have ordered orteds to terminate, ignore this */
|
||||
/* if we have ordered orteds to terminate, see if this one failed to tell
|
||||
* us it had terminated
|
||||
*/
|
||||
if (orte_orteds_term_ordered) {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
/* if abort is in progress, see if this one failed to tell
|
||||
* us it had terminated
|
||||
*/
|
||||
if (orte_abnormal_term_ordered) {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
/* if this is my own connection, ignore it */
|
||||
@ -390,33 +415,37 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
if (orte_enable_recovery) {
|
||||
/* relocate its processes */
|
||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) {
|
||||
if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) {
|
||||
/* unable to relocate for some reason */
|
||||
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
/* check if all is complete so we can terminate */
|
||||
check_job_complete(jdata);
|
||||
}
|
||||
} else {
|
||||
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
|
||||
ORTE_VPID_PRINT(proc->vpid), "Unknown");
|
||||
} else {
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died",
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
|
||||
ORTE_VPID_PRINT(proc->vpid),
|
||||
(NULL == pdat->node) ? "Unknown" :
|
||||
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
|
||||
}
|
||||
ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
|
||||
update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE);
|
||||
/* remove this proc from the daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
/* check if all is complete so we can terminate */
|
||||
check_job_complete(jdata);
|
||||
}
|
||||
} else {
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
}
|
||||
break;
|
||||
|
||||
@ -425,6 +454,7 @@ static int update_state(orte_jobid_t job,
|
||||
if (orte_enable_recovery) {
|
||||
/* relocate its processes */
|
||||
} else {
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
@ -482,18 +512,26 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job), exit_code));
|
||||
|
||||
orte_job_term_ordered = true;
|
||||
/* if debuggers are running, clean up */
|
||||
orte_debugger_finalize();
|
||||
|
||||
/* tell the plm to terminate all jobs */
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
/* set control params to indicate we are terminating */
|
||||
orte_job_term_ordered = true;
|
||||
orte_abnormal_term_ordered = true;
|
||||
orte_enable_recovery = false;
|
||||
|
||||
/* set the exit status, just in case whomever called us failed
|
||||
* to do so - it can only be done once, so we are protected
|
||||
* from overwriting it
|
||||
*/
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
|
||||
/* tell the plm to terminate the orteds - they will automatically
|
||||
* kill their local procs
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
|
||||
static void failed_start(orte_job_t *jdata)
|
||||
@ -1088,12 +1126,13 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
OBJ_DESTRUCT(&proc);
|
||||
}
|
||||
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_proc_t *pdata, *pdt;
|
||||
orte_node_t *node;
|
||||
orte_job_t *jdat;
|
||||
orte_proc_t *pdata, *pdt, *pdt2;
|
||||
orte_node_t *node, *nd;
|
||||
orte_app_context_t *app;
|
||||
orte_job_map_t *map;
|
||||
char *app_name;
|
||||
int rc, i, n;
|
||||
|
||||
@ -1103,64 +1142,114 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc)
|
||||
opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
/* track that we are attempting to relocate */
|
||||
pdata->relocates++;
|
||||
/* have we exceeded the number of relocates for this proc? */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
|
||||
if (app->max_global_restarts < pdata->relocates) {
|
||||
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
/* set the state */
|
||||
pdata->state = state;
|
||||
|
||||
/* retain the node id */
|
||||
node = pdata->node;
|
||||
|
||||
/* if it is a daemon that died, we need to flag all of its procs
|
||||
* to be relocated
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
map = jdata->map;
|
||||
/* remove this proc from the daemon job */
|
||||
record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* check to see if any other nodes are "alive" */
|
||||
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
app_name = "orted";
|
||||
for (n=0; n < map->nodes->size; n++) {
|
||||
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
|
||||
continue;
|
||||
}
|
||||
if (node->daemon->name.vpid != proc->vpid) {
|
||||
continue;
|
||||
}
|
||||
/* found the node - now flag the procs */
|
||||
/* scan the procs looking for each unique jobid on the node */
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
if (ORTE_PROC_STATE_TERMINATED < pdt->state) {
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) {
|
||||
/* major problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
continue;
|
||||
}
|
||||
/* if the proc hasn't already terminated, then mark
|
||||
* it as aborted so it will be restarted
|
||||
*/
|
||||
pdt->state = ORTE_PROC_STATE_ABORTED;
|
||||
}
|
||||
/* mark the node as "down" */
|
||||
node->state = ORTE_NODE_STATE_DOWN;
|
||||
/* remove it from the map */
|
||||
opal_pointer_array_set_item(map->nodes, n, NULL);
|
||||
/* do a release to maintain accounting - won't actually
|
||||
* remove the node object from memory
|
||||
/* since the node was used in this job's map, release
|
||||
* it so that accounting is maintained
|
||||
*/
|
||||
OBJ_RELEASE(node);
|
||||
/* mark this proc as dead so it will be restarted */
|
||||
pdt->state = ORTE_PROC_STATE_ABORTED;
|
||||
/* remove this proc from the node */
|
||||
OBJ_RELEASE(pdt); /* maintains accounting */
|
||||
opal_pointer_array_set_item(node->procs, i, NULL);
|
||||
/* maintain accounting on num procs alive in case this can't restart */
|
||||
jdat->num_terminated++;
|
||||
/* look for all other procs on this node from the same job */
|
||||
for (n=0; n < node->procs->size; n++) {
|
||||
if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
|
||||
continue;
|
||||
}
|
||||
if (pdt2->name.jobid == pdt->name.jobid) {
|
||||
/* mark this proc as having aborted */
|
||||
pdt2->state = ORTE_PROC_STATE_ABORTED;
|
||||
/* remove it from the node */
|
||||
OBJ_RELEASE(pdt2);
|
||||
opal_pointer_array_set_item(node->procs, n, NULL);
|
||||
/* maintain accounting on num procs alive */
|
||||
jdat->num_terminated++;
|
||||
}
|
||||
}
|
||||
/* and remove the node from the map */
|
||||
for (n=0; n < jdat->map->nodes->size; n++) {
|
||||
if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) {
|
||||
continue;
|
||||
}
|
||||
if (nd->index == node->index) {
|
||||
opal_pointer_array_set_item(jdat->map->nodes, n, NULL);
|
||||
OBJ_RELEASE(node); /* maintain accounting */
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
|
||||
app_name = app->app;
|
||||
}
|
||||
/* reset the job params for this job */
|
||||
orte_plm_base_reset_job(jdat);
|
||||
|
||||
/* relaunch the job */
|
||||
opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name);
|
||||
if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) {
|
||||
opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc));
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* otherwise, we are an app - try to relocate us to another node */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx);
|
||||
if (NULL == app) {
|
||||
/* no way to restart this job */
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true,
|
||||
ORTE_NAME_PRINT(proc));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
app_name = app->app;
|
||||
/* track that we are attempting to relocate */
|
||||
pdata->relocates++;
|
||||
/* have we exceeded the number of relocates for this proc? */
|
||||
if (app->max_global_restarts < pdata->relocates) {
|
||||
return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
/* reset the job params for restart */
|
||||
orte_plm_base_reset_job(jdata);
|
||||
|
||||
/* flag the current node as not-to-be-used */
|
||||
pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE;
|
||||
|
||||
/* restart the job - the spawn function will remap and
|
||||
* launch the replacement proc(s)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
|
||||
"%s RESTARTING APP: %s",
|
||||
"%s RELOCATING APP %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
@ -1189,3 +1278,45 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_job_t *jdt;
|
||||
orte_proc_t *pdat;
|
||||
orte_node_t *node;
|
||||
int i;
|
||||
|
||||
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) &&
|
||||
ORTE_PROC_STATE_TERMINATED != pdat->state) {
|
||||
/* need to record that this one died */
|
||||
pdat->state = state;
|
||||
pdat->exit_code = exit_code;
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
/* remove it from the job array */
|
||||
opal_pointer_array_set_item(jdat->procs, vpid, NULL);
|
||||
orte_process_info.num_procs--;
|
||||
jdat->num_procs--;
|
||||
/* mark the node as down so it won't be used in mapping
|
||||
* procs to be relaunched
|
||||
*/
|
||||
node = pdat->node;
|
||||
node->state = ORTE_NODE_STATE_DOWN;
|
||||
node->daemon = NULL;
|
||||
OBJ_RELEASE(pdat); /* maintain accounting */
|
||||
/* mark all procs on this node as having terminated */
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
|
||||
/* major problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
continue;
|
||||
}
|
||||
pdat->state = ORTE_PROC_STATE_ABORTED;
|
||||
jdt->num_terminated++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -31,3 +31,12 @@ check with your system administrator to try and determine the
|
||||
source of the problem.
|
||||
|
||||
Your job is being terminated as a result.
|
||||
#
|
||||
[errmgr-hnp:cannot-relocate]
|
||||
The system is unable to relocate the specified process:
|
||||
|
||||
Process: %s
|
||||
|
||||
because the application for that process could not be found. This
|
||||
appears to be a system error. Please report it to the ORTE
|
||||
developers.
|
||||
|
@ -246,7 +246,9 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* see is this was a lifeline */
|
||||
/* purge the oob */
|
||||
orte_rml.purge(proc);
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
/* kill our children */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
@ -322,16 +324,18 @@ static int update_state(orte_jobid_t job,
|
||||
child->name->vpid == proc->vpid) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
app = jobdat->apps[child->app_idx];
|
||||
if (child->restarts == app->max_local_restarts ) {
|
||||
goto REPORT_ABORT;
|
||||
}
|
||||
/* otherwise, attempt to restart it locally */
|
||||
if (child->restarts < app->max_local_restarts ) {
|
||||
/* attempt to restart it locally */
|
||||
child->restarts++;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:orted restarting proc %s for the %d time",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name), child->restarts));
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) {
|
||||
/* reset the child's state as restart_proc would
|
||||
* have cleared it
|
||||
*/
|
||||
child->state = state;
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto REPORT_ABORT;
|
||||
}
|
||||
@ -339,6 +343,7 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
REPORT_ABORT:
|
||||
/* if the job hasn't completed and the state is abnormally
|
||||
|
@ -1188,12 +1188,27 @@ static int odls_base_default_setup_fork(orte_app_context_t *context,
|
||||
|
||||
static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env)
|
||||
{
|
||||
char *vpid_str, *param, *value;
|
||||
char *param, *value;
|
||||
orte_node_rank_t node_rank;
|
||||
orte_local_rank_t local_rank;
|
||||
int rc;
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) {
|
||||
/* setup the jobid */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
return rc;
|
||||
}
|
||||
opal_setenv(param, value, true, env);
|
||||
free(param);
|
||||
free(value);
|
||||
|
||||
/* setup the vpid */
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
@ -1202,7 +1217,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
return rc;
|
||||
}
|
||||
opal_setenv(param, vpid_str, true, env);
|
||||
opal_setenv(param, value, true, env);
|
||||
free(param);
|
||||
|
||||
/* although the vpid IS the process' rank within the job, users
|
||||
@ -1213,8 +1228,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
|
||||
* AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
|
||||
* We know - just live with it
|
||||
*/
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env);
|
||||
free(vpid_str); /* done with this now */
|
||||
opal_setenv("OMPI_COMM_WORLD_RANK", value, true, env);
|
||||
free(value); /* done with this now */
|
||||
|
||||
/* users would appreciate being given a public environmental variable
|
||||
* that also represents the local rank value - something MPI specific - so
|
||||
@ -1291,6 +1306,78 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char *
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Move to the app's working directory and verify that its executable
 * can be found.
 *
 * app - the app context to prepare; its env array may be updated
 *       (PWD is set to the directory we end up in)
 *
 * Returns the status of orte_util_check_context_cwd() if that fails,
 * ORTE_ERR_OUT_OF_RESOURCE if the temporary search path cannot be
 * built, and otherwise the status of orte_util_check_context_app().
 *
 * NOTE: on return the process has been moved to the app's working
 * directory - callers that need to get back to where they started
 * must save and restore their own directory.
 */
static int setup_path(orte_app_context_t *app)
{
    static const char exec_path_key[] = "OMPI_exec_path=";
    static const char path_key[] = "PATH=";
    int rc;
    int len;
    char dir[MAXPATHLEN];
    char **argvptr;
    char *pathenv = NULL, *mpiexec_pathenv = NULL;
    char *full_search = NULL;

    /* Try to change to the app's cwd and check that the app
     * exists and is executable. The function will
     * take care of outputting a pretty error message, if required
     */
    if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
        /* do not ERROR_LOG - it will be reported elsewhere */
        goto CLEANUP;
    }

    /* The prior function will have done a chdir() to jump us to
     * wherever the app is to be executed. This could be either where
     * the user specified (via -wdir), or to the user's home directory
     * on this node if nothing was provided. It seems that chdir doesn't
     * adjust the $PWD enviro variable when it changes the directory. This
     * can cause a user to get a different response when doing getcwd vs
     * looking at the enviro variable. To keep this consistent, we explicitly
     * ensure that the PWD enviro variable matches the CWD we moved to.
     *
     * NOTE: if a user's program does a chdir(), then $PWD will once
     * again not match getcwd! This is beyond our control - we are only
     * ensuring they start out matching.
     *
     * If getcwd fails, the buffer contents are undefined, so leave
     * PWD untouched rather than export garbage.
     */
    if (NULL != getcwd(dir, sizeof(dir))) {
        opal_setenv("PWD", dir, true, &app->env);
    }

    /* Search for the OMPI_exec_path and PATH settings in the environment.
     * The env array may be NULL if nothing has been set in it yet.
     */
    for (argvptr = app->env; NULL != argvptr && NULL != *argvptr; argvptr++) {
        if (0 == strncmp(exec_path_key, *argvptr, sizeof(exec_path_key) - 1)) {
            mpiexec_pathenv = *argvptr + (sizeof(exec_path_key) - 1);
        }
        if (0 == strncmp(path_key, *argvptr, sizeof(path_key) - 1)) {
            pathenv = *argvptr + (sizeof(path_key) - 1);
        }
    }

    /* If OMPI_exec_path is set (meaning --path was used), then create a
     * temporary environment to be used in the search for the executable.
     * The PATH setting in this temporary environment is a combination of
     * the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
     * then just use existing environment with PATH in it.
     */
    if (NULL != mpiexec_pathenv) {
        argvptr = NULL;
        if (NULL != pathenv) {
            len = asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
        } else {
            len = asprintf(&full_search, "%s", mpiexec_pathenv);
        }
        if (len < 0) {
            /* the contents of full_search are undefined on failure,
             * so it must be neither used nor freed
             */
            rc = ORTE_ERR_OUT_OF_RESOURCE;
            goto CLEANUP;
        }
        opal_setenv("PATH", full_search, true, &argvptr);
        free(full_search);
    } else {
        argvptr = app->env;
    }

    rc = orte_util_check_context_app(app, argvptr);
    /* do not ERROR_LOG - it will be reported elsewhere */
    if (NULL != mpiexec_pathenv) {
        /* release the temporary environment we built above */
        opal_argv_free(argvptr);
    }

CLEANUP:
    return rc;
}
|
||||
|
||||
|
||||
/* define a timer release point so that we can wait for
|
||||
* file descriptors to come available, if necessary
|
||||
*/
|
||||
@ -1311,7 +1398,6 @@ static void timer_cb(int fd, short event, void *cbdata)
|
||||
int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
orte_odls_base_fork_local_proc_fn_t fork_local)
|
||||
{
|
||||
char *job_str, *param;
|
||||
opal_list_item_t *item;
|
||||
orte_app_context_t *app, **apps;
|
||||
orte_app_idx_t i, num_apps;
|
||||
@ -1323,11 +1409,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
opal_buffer_t alert;
|
||||
orte_std_cntr_t proc_rank;
|
||||
orte_odls_job_t *jobdat;
|
||||
char *pathenv = NULL, *mpiexec_pathenv = NULL;
|
||||
char basedir[MAXPATHLEN];
|
||||
char dir[MAXPATHLEN];
|
||||
char **argvptr;
|
||||
char *full_search;
|
||||
char **argvsav=NULL;
|
||||
int inm;
|
||||
opal_event_t *delay;
|
||||
@ -1518,75 +1600,21 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
|
||||
/* Try to change to the app's cwd and check that the app
|
||||
exists and is executable The function will
|
||||
take care of outputting a pretty error message, if required
|
||||
/* setup the working directory for this app - will jump us
|
||||
* to that directory
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
|
||||
/* do not ERROR_LOG - it will be reported elsewhere */
|
||||
/* cycle through children to find those for this jobid */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) &&
|
||||
i == child->app_idx) {
|
||||
child->exit_code = rc;
|
||||
}
|
||||
}
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* The prior function will have done a chdir() to jump us to
|
||||
* wherever the app is to be executed. This could be either where
|
||||
* the user specified (via -wdir), or to the user's home directory
|
||||
* on this node if nothing was provided. It seems that chdir doesn't
|
||||
* adjust the $PWD enviro variable when it changes the directory. This
|
||||
* can cause a user to get a different response when doing getcwd vs
|
||||
* looking at the enviro variable. To keep this consistent, we explicitly
|
||||
* ensure that the PWD enviro variable matches the CWD we moved to.
|
||||
*
|
||||
* NOTE: if a user's program does a chdir(), then $PWD will once
|
||||
* again not match getcwd! This is beyond our control - we are only
|
||||
* ensuring they start out matching.
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:launch:setup_path failed with error %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_ERROR_NAME(rc)));
|
||||
/* do not ERROR_LOG this failure - it will be reported
|
||||
* elsewhere. The launch is going to fail. Since we could have
|
||||
* multiple app_contexts, we need to ensure that we flag only
|
||||
* the correct one that caused this operation to fail. We then have
|
||||
* to flag all the other procs from the app_context as having "not failed"
|
||||
* so we can report things out correctly
|
||||
*/
|
||||
getcwd(dir, sizeof(dir));
|
||||
opal_setenv("PWD", dir, true, &app->env);
|
||||
|
||||
/* Search for the OMPI_exec_path and PATH settings in the environment. */
|
||||
for (argvptr = app->env; *argvptr != NULL; argvptr++) {
|
||||
if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
|
||||
mpiexec_pathenv = *argvptr + 15;
|
||||
}
|
||||
if (0 == strncmp("PATH=", *argvptr, 5)) {
|
||||
pathenv = *argvptr + 5;
|
||||
}
|
||||
}
|
||||
|
||||
/* If OMPI_exec_path is set (meaning --path was used), then create a
|
||||
temporary environment to be used in the search for the executable.
|
||||
The PATH setting in this temporary environment is a combination of
|
||||
the OMPI_exec_path and PATH values. If OMPI_exec_path is not set,
|
||||
then just use existing environment with PATH in it. */
|
||||
if (NULL != mpiexec_pathenv) {
|
||||
argvptr = NULL;
|
||||
if (pathenv != NULL) {
|
||||
asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
|
||||
} else {
|
||||
asprintf(&full_search, "%s", mpiexec_pathenv);
|
||||
}
|
||||
opal_setenv("PATH", full_search, true, &argvptr);
|
||||
free(full_search);
|
||||
} else {
|
||||
argvptr = app->env;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (rc = orte_util_check_context_app(app, argvptr))) {
|
||||
/* do not ERROR_LOG - it will be reported elsewhere */
|
||||
if (NULL != mpiexec_pathenv) {
|
||||
opal_argv_free(argvptr);
|
||||
}
|
||||
/* cycle through children to find those for this jobid */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
@ -1600,9 +1628,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
/* okay, now tell the HNP we couldn't do it */
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (NULL != mpiexec_pathenv) {
|
||||
opal_argv_free(argvptr);
|
||||
}
|
||||
|
||||
/* okay, now let's launch all the local procs for this app using the provided fork_local fn */
|
||||
for (proc_rank = 0, item = opal_list_get_first(&orte_local_children);
|
||||
@ -1765,19 +1790,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
/* setup the rest of the environment with the proc-specific items - these
|
||||
* will be overwritten for each child
|
||||
*/
|
||||
if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
}
|
||||
if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
|
||||
rc = ORTE_ERR_OUT_OF_RESOURCE;
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_setenv(param, job_str, true, &app->env);
|
||||
free(param);
|
||||
free(job_str);
|
||||
|
||||
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto CLEANUP;
|
||||
@ -1871,6 +1883,9 @@ int orte_odls_base_default_launch_local(orte_jobid_t job,
|
||||
launch_failed = false;
|
||||
|
||||
CLEANUP:
|
||||
/* ensure we reset our working directory back to our default location */
|
||||
chdir(basedir);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:launch reporting job %s launch status",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -2947,6 +2962,7 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
orte_app_context_t *app;
|
||||
opal_list_item_t *item;
|
||||
orte_odls_job_t *jobdat;
|
||||
char basedir[MAXPATHLEN];
|
||||
|
||||
/* protect operations involving the global list of children */
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
@ -2956,6 +2972,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
|
||||
/* establish our baseline working directory - we will be potentially
|
||||
* bouncing around as we execute this app, but we will always return
|
||||
* to this place as our default directory
|
||||
*/
|
||||
getcwd(basedir, sizeof(basedir));
|
||||
|
||||
/* find this child's jobdat */
|
||||
jobdat = NULL;
|
||||
for (item = opal_list_get_first(&orte_local_jobdata);
|
||||
@ -2985,19 +3007,36 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child,
|
||||
child->rml_uri = NULL;
|
||||
}
|
||||
app = jobdat->apps[child->app_idx];
|
||||
/* reset envars to match this child */
|
||||
|
||||
/* reset envars to match this child */
|
||||
if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
goto CLEANUP;
|
||||
}
|
||||
opal_output(0, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app);
|
||||
|
||||
/* setup the path */
|
||||
if (ORTE_SUCCESS != (rc = setup_path(app))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s restarting app %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
|
||||
|
||||
/* must unlock prior to fork to keep things clean in the
|
||||
* event library
|
||||
*/
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
|
||||
rc = fork_local(app, child, app->env, jobdat);
|
||||
if (ORTE_SUCCESS == rc) {
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL);
|
||||
OPAL_THREAD_LOCK(&orte_odls_globals.mutex);
|
||||
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
@ -3007,7 +3046,14 @@ CLEANUP:
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
(ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
|
||||
|
||||
opal_condition_signal(&orte_odls_globals.cond);
|
||||
OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex);
|
||||
/* reset our working directory back to our default location - if we
|
||||
* don't do this, then we will be looking for relative paths starting
|
||||
* from the last wdir option specified by the user. Thus, we would
|
||||
* be requiring that the user keep track on the cmd line of where
|
||||
* each app was located relative to the prior app, instead of relative
|
||||
* to their current location
|
||||
*/
|
||||
chdir(basedir);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
@ -5,36 +5,24 @@
|
||||
* A program that just spins - provides mechanism for testing user-driven
|
||||
* abnormal program termination
|
||||
*/
|
||||
#include "opal_config.h"
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
|
||||
int i;
|
||||
double pi;
|
||||
|
||||
orte_init(&argc, &argv, ORTE_PROC_NON_MPI);
|
||||
|
||||
i = 0;
|
||||
while (1) {
|
||||
i++;
|
||||
pi = i / 3.14159256;
|
||||
if (i > 100) {
|
||||
/* need to progress so we can
|
||||
* wake up if our daemon goes
|
||||
* away!
|
||||
*/
|
||||
opal_progress();
|
||||
/* reset the counter so we loop */
|
||||
i = 0;
|
||||
}
|
||||
if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) {
|
||||
fprintf(stderr, "ORTE_INIT FAILED\n");
|
||||
exit(1);
|
||||
}
|
||||
opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
|
||||
opal_event_dispatch();
|
||||
|
||||
orte_finalize();
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user