diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index 751bfd70b7..c1d56e6ed9 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -38,6 +38,7 @@ #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/sensor/sensor.h" #include "orte/mca/routed/routed.h" +#include "orte/tools/orterun/debuggers.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" @@ -55,8 +56,11 @@ static void update_proc(orte_job_t *jdata, orte_process_name_t *proc, orte_exit_code_t exit_code); static void check_job_complete(orte_job_t *jdata); static void killprocs(orte_jobid_t job, orte_vpid_t vpid); -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc); +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code); static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); +static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid, + orte_proc_state_t state, orte_exit_code_t exit_code); /* * Module functions: Global @@ -308,17 +312,22 @@ static int update_state(orte_jobid_t job, if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { return ORTE_SUCCESS; } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + ORTE_ERROR_LOG(rc); /* let it fall thru to abort */ } else { /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) { + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* let it fall thru to abort */ } } else { /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc)) { + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { return ORTE_SUCCESS; } /* guess not - let it fall thru to abort */ @@ -378,10 +387,26 @@ static int update_state(orte_jobid_t job, break; case ORTE_PROC_STATE_COMM_FAILED: + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); /* is this to a daemon? */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if we have ordered orteds to terminate, ignore this */ + /* if we have ordered orteds to terminate, see if this one failed to tell + * us it had terminated + */ if (orte_orteds_term_ordered) { + record_dead_daemon(jdata, proc->vpid, state, exit_code); + check_job_complete(jdata); + break; + } + /* if abort is in progress, see if this one failed to tell + * us it had terminated + */ + if (orte_abnormal_term_ordered) { + record_dead_daemon(jdata, proc->vpid, state, exit_code); + check_job_complete(jdata); break; } /* if this is my own connection, ignore it */ @@ -390,33 +415,37 @@ static int update_state(orte_jobid_t job, } if (orte_enable_recovery) { /* relocate its processes */ - if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) { + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { + /* unable to relocate for some reason */ + opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); } } else { if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, ORTE_VPID_PRINT(proc->vpid), "Unknown"); } else { - orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, ORTE_VPID_PRINT(proc->vpid), (NULL == pdat->node) ? "Unknown" : ((NULL == pdat->node->name) ? "Unknown" : pdat->node->name)); } - ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE); - update_proc(jdata, proc, state, pid, ORTE_ERR_COMM_FAILURE); + /* remove this proc from the daemon job */ + record_dead_daemon(jdata, proc->vpid, state, exit_code); /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); } - } else { - /* delete the route */ - orte_routed.delete_route(proc); } break; @@ -425,6 +454,7 @@ static int update_state(orte_jobid_t job, if (orte_enable_recovery) { /* relocate its processes */ } else { + record_dead_daemon(jdata, proc->vpid, state, exit_code); /* kill all local procs */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ @@ -482,18 +512,26 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job), exit_code)); + /* if debuggers are running, clean up */ + orte_debugger_finalize(); + + /* set control params to indicate we are terminating */ orte_job_term_ordered = true; - - /* tell the plm to terminate all jobs */ - if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { - ORTE_ERROR_LOG(rc); - } - + orte_abnormal_term_ordered = true; + orte_enable_recovery = false; + /* set the exit status, just in case whomever called us failed * to do so - it can only be done once, so we are protected * from overwriting it */ ORTE_UPDATE_EXIT_STATUS(exit_code); + + /* tell the plm to terminate the orteds - they will automatically + * kill their local procs + */ + if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { + ORTE_ERROR_LOG(rc); + } } static void failed_start(orte_job_t *jdata) @@ -1088,79 +1126,130 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) OBJ_DESTRUCT(&proc); } -static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code) { - orte_proc_t *pdata, *pdt; - orte_node_t *node; + orte_job_t *jdat; + orte_proc_t *pdata, *pdt, *pdt2; + orte_node_t *node, *nd; orte_app_context_t *app; - orte_job_map_t *map; char *app_name; int rc, i, n; - + /* get the proc_t object for this process */ pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); return ORTE_ERR_NOT_FOUND; } - /* track that we are attempting to relocate */ - pdata->relocates++; - /* have we exceeded the number of relocates for this proc? */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); - if (app->max_global_restarts < pdata->relocates) { - return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; - } + + /* set the state */ + pdata->state = state; + + /* retain the node id */ + node = pdata->node; /* if it is a daemon that died, we need to flag all of its procs * to be relocated */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - map = jdata->map; + /* remove this proc from the daemon job */ + record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* check to see if any other nodes are "alive" */ + if (!orte_hnp_is_allocated && jdata->num_procs == 1) { + return ORTE_ERR_FATAL; + } app_name = "orted"; - for (n=0; n < map->nodes->size; n++) { - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { + /* scan the procs looking for each unique jobid on the node */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } - if (node->daemon->name.vpid != proc->vpid) { + /* get the job data object for this process */ + if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { + /* major problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); continue; } - /* found the node - now flag the procs */ - for (i=0; i < node->procs->size; i++) { - if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { - continue; - } - if (ORTE_PROC_STATE_TERMINATED < pdt->state) { - continue; - } - /* if the proc hasn't already terminated, then mark - * it as aborted so it will be restarted - */ - pdt->state = ORTE_PROC_STATE_ABORTED; - } - /* mark the node as "down" */ - node->state = ORTE_NODE_STATE_DOWN; - /* remove it from the map */ - opal_pointer_array_set_item(map->nodes, n, NULL); - /* do a release to maintain accounting - won't actually - * remove the node object from memory + /* since the node was used in this job's map, release + * it so that accounting is maintained */ OBJ_RELEASE(node); - break; + /* mark this proc as dead so it will be restarted */ + pdt->state = ORTE_PROC_STATE_ABORTED; + /* remove this proc from the node */ + OBJ_RELEASE(pdt); /* maintains accounting */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* maintain accounting on num procs alive in case this can't restart */ + jdat->num_terminated++; + /* look for all other procs on this node from the same job */ + for (n=0; n < node->procs->size; n++) { + if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { + continue; + } + if (pdt2->name.jobid == pdt->name.jobid) { + /* mark this proc as having aborted */ + pdt2->state = ORTE_PROC_STATE_ABORTED; + /* remove it from the node */ + OBJ_RELEASE(pdt2); + opal_pointer_array_set_item(node->procs, n, NULL); + /* maintain accounting on num procs alive */ + jdat->num_terminated++; + } + } + /* and remove the node from the map */ + for (n=0; n < jdat->map->nodes->size; n++) { + if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { + continue; + } + if (nd->index == node->index) { + opal_pointer_array_set_item(jdat->map->nodes, n, NULL); + OBJ_RELEASE(node); /* maintain accounting */ + break; + } + } + /* reset the job params for this job */ + orte_plm_base_reset_job(jdat); + + /* relaunch the job */ + opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); + return rc; + } } - } else { - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); - app_name = app->app; + + return ORTE_SUCCESS; } + /* otherwise, we are an app - try to relocate us to another node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); + if (NULL == app) { + /* no way to restart this job */ + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, + ORTE_NAME_PRINT(proc)); + return ORTE_ERR_NOT_FOUND; + } + app_name = app->app; + /* track that we are attempting to relocate */ + pdata->relocates++; + /* have we exceeded the number of relocates for this proc? */ + if (app->max_global_restarts < pdata->relocates) { + return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; + } /* reset the job params for restart */ orte_plm_base_reset_job(jdata); + /* flag the current node as not-to-be-used */ + pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; + /* restart the job - the spawn function will remap and * launch the replacement proc(s) */ OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s RESTARTING APP: %s", + "%s RELOCATING APP %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); @@ -1189,3 +1278,45 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) } return NULL; } + +static void record_dead_daemon(orte_job_t *jdat, orte_vpid_t vpid, + orte_proc_state_t state, orte_exit_code_t exit_code) +{ + orte_job_t *jdt; + orte_proc_t *pdat; + orte_node_t *node; + int i; + + if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) && + ORTE_PROC_STATE_TERMINATED != pdat->state) { + /* need to record that this one died */ + pdat->state = state; + pdat->exit_code = exit_code; + ORTE_UPDATE_EXIT_STATUS(exit_code); + /* remove it from the job array */ + opal_pointer_array_set_item(jdat->procs, vpid, NULL); + orte_process_info.num_procs--; + jdat->num_procs--; + /* mark the node as down so it won't be used in mapping + * procs to be relaunched + */ + node = pdat->node; + node->state = ORTE_NODE_STATE_DOWN; + node->daemon = NULL; + OBJ_RELEASE(pdat); /* maintain accounting */ + /* mark all procs on this node as having terminated */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* get the job data object for this process */ + if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) { + /* major problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + pdat->state = ORTE_PROC_STATE_ABORTED; + jdt->num_terminated++; + } + } +} diff --git a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt b/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt index d002f7cd9a..d06821544d 100644 --- a/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt +++ b/orte/mca/errmgr/hnp/help-orte-errmgr-hnp.txt @@ -31,3 +31,12 @@ check with your system administrator to try and determine the source of the problem. Your job is being terminated as a result. +# +[errmgr-hnp:cannot-relocate] +The system is unable to relocate the specified process: + +Process: %s + +because the application for that process could not be found. This +appears to be a system error. Please report it to the ORTE +developers. diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index d941b8e763..45e2946efc 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -188,31 +188,31 @@ static int update_state(orte_jobid_t job, } switch (jobstate) { - case ORTE_JOB_STATE_FAILED_TO_START: - failed_start(jobdat, exit_code); - break; - case ORTE_JOB_STATE_RUNNING: - /* update all local child states */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); - break; - case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: - /* update all procs in job */ - update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); - /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); - case ORTE_JOB_STATE_COMM_FAILED: - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); - /* tell the caller we can't recover */ - return ORTE_ERR_UNRECOVERABLE; - break; - case ORTE_JOB_STATE_HEARTBEAT_FAILED: - /* let the HNP handle this */ - return ORTE_SUCCESS; - break; + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jobdat, exit_code); + break; + case ORTE_JOB_STATE_RUNNING: + /* update all local child states */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + /* order all local procs for this job to be killed */ + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* tell the caller we can't recover */ + return ORTE_ERR_UNRECOVERABLE; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + return ORTE_SUCCESS; + break; - default: - break; + default: + break; } OBJ_CONSTRUCT(&alert, opal_buffer_t); /* pack update state command */ @@ -246,7 +246,9 @@ static int update_state(orte_jobid_t job, } /* delete the route */ orte_routed.delete_route(proc); - /* see is this was a lifeline */ + /* purge the oob */ + orte_rml.purge(proc); + /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { /* kill our children */ killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); @@ -313,7 +315,7 @@ static int update_state(orte_jobid_t job, if (ORTE_PROC_STATE_TERMINATED < state) { if (jobdat->enable_recovery) { - /* find this proc in the local children */ + /* find this proc in the local children */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { @@ -322,20 +324,23 @@ static int update_state(orte_jobid_t job, child->name->vpid == proc->vpid) { /* see if this child has reached its local restart limit */ app = jobdat->apps[child->app_idx]; - if (child->restarts == app->max_local_restarts ) { - goto REPORT_ABORT; + if (child->restarts < app->max_local_restarts ) { + /* attempt to restart it locally */ + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:orted restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), child->restarts)); + if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + ORTE_ERROR_LOG(rc); + goto REPORT_ABORT; + } + return ORTE_SUCCESS; } - /* otherwise, attempt to restart it locally */ - child->restarts++; - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:orted restarting proc %s for the %d time", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(child->name), child->restarts)); - if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { - ORTE_ERROR_LOG(rc); - goto REPORT_ABORT; - } - return ORTE_SUCCESS; } } } @@ -571,9 +576,9 @@ int ft_event(int state) return ORTE_SUCCESS; } -/***************** - * Local Functions - *****************/ + /***************** + * Local Functions + *****************/ static bool any_live_children(orte_jobid_t job) { opal_list_item_t *item; @@ -640,53 +645,53 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) return ORTE_SUCCESS; } -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) -{ - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_vpid_t null=ORTE_VPID_INVALID; + static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) + { + int rc; + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_vpid_t null=ORTE_VPID_INVALID; - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if (orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return rc; } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - /* if this child is part of the job... */ - if (child->name->jobid == jobdat->jobid) { - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + /* if we are timing things, pack the time the launch msg for this job was recvd */ + if (orte_timing) { + int64_t tmp; + tmp = jobdat->launch_msg_recvd.tv_sec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + tmp = jobdat->launch_msg_recvd.tv_usec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { ORTE_ERROR_LOG(rc); return rc; } } - } - /* flag that this job is complete so the receiver can know */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = (orte_odls_child_t*)item; + next = opal_list_get_next(item); + /* if this child is part of the job... */ + if (child->name->jobid == jobdat->jobid) { + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* flag that this job is complete so the receiver can know */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } - return ORTE_SUCCESS; -} + return ORTE_SUCCESS; + } static bool all_children_registered(orte_jobid_t job) { diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index c53fb5eeab..f58dcc72aa 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -1188,12 +1188,27 @@ static int odls_base_default_setup_fork(orte_app_context_t *context, static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char ***env) { - char *vpid_str, *param, *value; + char *param, *value; orte_node_rank_t node_rank; orte_local_rank_t local_rank; int rc; - if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&vpid_str, child->name->vpid))) { + /* setup the jobid */ + if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name->jobid))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + rc = ORTE_ERR_OUT_OF_RESOURCE; + return rc; + } + opal_setenv(param, value, true, env); + free(param); + free(value); + + /* setup the vpid */ + if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name->vpid))) { ORTE_ERROR_LOG(rc); return rc; } @@ -1202,7 +1217,7 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * rc = ORTE_ERR_OUT_OF_RESOURCE; return rc; } - opal_setenv(param, vpid_str, true, env); + opal_setenv(param, value, true, env); free(param); /* although the vpid IS the process' rank within the job, users @@ -1213,8 +1228,8 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT. * We know - just live with it */ - opal_setenv("OMPI_COMM_WORLD_RANK", vpid_str, true, env); - free(vpid_str); /* done with this now */ + opal_setenv("OMPI_COMM_WORLD_RANK", value, true, env); + free(value); /* done with this now */ /* users would appreciate being given a public environmental variable * that also represents the local rank value - something MPI specific - so @@ -1291,6 +1306,78 @@ static int setup_child(orte_odls_child_t *child, orte_odls_job_t *jobdat, char * return ORTE_SUCCESS; } +static int setup_path(orte_app_context_t *app) +{ + int rc; + char dir[MAXPATHLEN]; + char **argvptr; + char *pathenv = NULL, *mpiexec_pathenv = NULL; + char *full_search; + + /* Try to change to the app's cwd and check that the app + exists and is executable The function will + take care of outputting a pretty error message, if required + */ + if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) { + /* do not ERROR_LOG - it will be reported elsewhere */ + goto CLEANUP; + } + + /* The prior function will have done a chdir() to jump us to + * wherever the app is to be executed. This could be either where + * the user specified (via -wdir), or to the user's home directory + * on this node if nothing was provided. It seems that chdir doesn't + * adjust the $PWD enviro variable when it changes the directory. This + * can cause a user to get a different response when doing getcwd vs + * looking at the enviro variable. To keep this consistent, we explicitly + * ensure that the PWD enviro variable matches the CWD we moved to. + * + * NOTE: if a user's program does a chdir(), then $PWD will once + * again not match getcwd! This is beyond our control - we are only + * ensuring they start out matching. + */ + getcwd(dir, sizeof(dir)); + opal_setenv("PWD", dir, true, &app->env); + + /* Search for the OMPI_exec_path and PATH settings in the environment. */ + for (argvptr = app->env; *argvptr != NULL; argvptr++) { + if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { + mpiexec_pathenv = *argvptr + 15; + } + if (0 == strncmp("PATH=", *argvptr, 5)) { + pathenv = *argvptr + 5; + } + } + + /* If OMPI_exec_path is set (meaning --path was used), then create a + temporary environment to be used in the search for the executable. + The PATH setting in this temporary environment is a combination of + the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, + then just use existing environment with PATH in it. */ + if (NULL != mpiexec_pathenv) { + argvptr = NULL; + if (pathenv != NULL) { + asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); + } else { + asprintf(&full_search, "%s", mpiexec_pathenv); + } + opal_setenv("PATH", full_search, true, &argvptr); + free(full_search); + } else { + argvptr = app->env; + } + + rc = orte_util_check_context_app(app, argvptr); + /* do not ERROR_LOG - it will be reported elsewhere */ + if (NULL != mpiexec_pathenv) { + opal_argv_free(argvptr); + } + + CLEANUP: + return rc; +} + + /* define a timer release point so that we can wait for * file descriptors to come available, if necessary */ @@ -1311,7 +1398,6 @@ static void timer_cb(int fd, short event, void *cbdata) int orte_odls_base_default_launch_local(orte_jobid_t job, orte_odls_base_fork_local_proc_fn_t fork_local) { - char *job_str, *param; opal_list_item_t *item; orte_app_context_t *app, **apps; orte_app_idx_t i, num_apps; @@ -1323,11 +1409,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, opal_buffer_t alert; orte_std_cntr_t proc_rank; orte_odls_job_t *jobdat; - char *pathenv = NULL, *mpiexec_pathenv = NULL; char basedir[MAXPATHLEN]; - char dir[MAXPATHLEN]; - char **argvptr; - char *full_search; char **argvsav=NULL; int inm; opal_event_t *delay; @@ -1518,75 +1600,21 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, goto CLEANUP; } - - /* Try to change to the app's cwd and check that the app - exists and is executable The function will - take care of outputting a pretty error message, if required + /* setup the working directory for this app - will jump us + * to that directory */ - if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) { - /* do not ERROR_LOG - it will be reported elsewhere */ - /* cycle through children to find those for this jobid */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t*)item; - if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name->jobid), ORTE_JOBID) && - i == child->app_idx) { - child->exit_code = rc; - } - } - goto CLEANUP; - } - - /* The prior function will have done a chdir() to jump us to - * wherever the app is to be executed. This could be either where - * the user specified (via -wdir), or to the user's home directory - * on this node if nothing was provided. It seems that chdir doesn't - * adjust the $PWD enviro variable when it changes the directory. This - * can cause a user to get a different response when doing getcwd vs - * looking at the enviro variable. To keep this consistent, we explicitly - * ensure that the PWD enviro variable matches the CWD we moved to. - * - * NOTE: if a user's program does a chdir(), then $PWD will once - * again not match getcwd! This is beyond our control - we are only - * ensuring they start out matching. - */ - getcwd(dir, sizeof(dir)); - opal_setenv("PWD", dir, true, &app->env); - - /* Search for the OMPI_exec_path and PATH settings in the environment. */ - for (argvptr = app->env; *argvptr != NULL; argvptr++) { - if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) { - mpiexec_pathenv = *argvptr + 15; - } - if (0 == strncmp("PATH=", *argvptr, 5)) { - pathenv = *argvptr + 5; - } - } - - /* If OMPI_exec_path is set (meaning --path was used), then create a - temporary environment to be used in the search for the executable. - The PATH setting in this temporary environment is a combination of - the OMPI_exec_path and PATH values. If OMPI_exec_path is not set, - then just use existing environment with PATH in it. */ - if (NULL != mpiexec_pathenv) { - argvptr = NULL; - if (pathenv != NULL) { - asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv); - } else { - asprintf(&full_search, "%s", mpiexec_pathenv); - } - opal_setenv("PATH", full_search, true, &argvptr); - free(full_search); - } else { - argvptr = app->env; - } - - if (ORTE_SUCCESS != (rc = orte_util_check_context_app(app, argvptr))) { - /* do not ERROR_LOG - it will be reported elsewhere */ - if (NULL != mpiexec_pathenv) { - opal_argv_free(argvptr); - } + if (ORTE_SUCCESS != (rc = setup_path(app))) { + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls:launch:setup_path failed with error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_ERROR_NAME(rc))); + /* do not ERROR_LOG this failure - it will be reported + * elsewhere. The launch is going to fail. Since we could have + * multiple app_contexts, we need to ensure that we flag only + * the correct one that caused this operation to fail. We then have + * to flag all the other procs from the app_context as having "not failed" + * so we can report things out correctly + */ /* cycle through children to find those for this jobid */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); @@ -1600,10 +1628,7 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* okay, now tell the HNP we couldn't do it */ goto CLEANUP; } - if (NULL != mpiexec_pathenv) { - opal_argv_free(argvptr); - } - + /* okay, now let's launch all the local procs for this app using the provided fork_local fn */ for (proc_rank = 0, item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); @@ -1765,19 +1790,6 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, /* setup the rest of the environment with the proc-specific items - these * will be overwritten for each child */ - if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&job_str, child->name->jobid))) { - ORTE_ERROR_LOG(rc); - goto CLEANUP; - } - if (NULL == (param = mca_base_param_environ_variable("orte","ess","jobid"))) { - ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); - rc = ORTE_ERR_OUT_OF_RESOURCE; - goto CLEANUP; - } - opal_setenv(param, job_str, true, &app->env); - free(param); - free(job_str); - if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); goto CLEANUP; @@ -1870,7 +1882,10 @@ int orte_odls_base_default_launch_local(orte_jobid_t job, } launch_failed = false; -CLEANUP: + CLEANUP: + /* ensure we reset our working directory back to our default location */ + chdir(basedir); + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:launch reporting job %s launch status", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -2947,7 +2962,8 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, orte_app_context_t *app; opal_list_item_t *item; orte_odls_job_t *jobdat; - + char basedir[MAXPATHLEN]; + /* protect operations involving the global list of children */ OPAL_THREAD_LOCK(&orte_odls_globals.mutex); @@ -2956,6 +2972,12 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name))); + /* establish our baseline working directory - we will be potentially + * bouncing around as we execute this app, but we will always return + * to this place as our default directory + */ + getcwd(basedir, sizeof(basedir)); + /* find this child's jobdat */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); @@ -2985,29 +3007,53 @@ int orte_odls_base_default_restart_proc(orte_odls_child_t *child, child->rml_uri = NULL; } app = jobdat->apps[child->app_idx]; - /* reset envars to match this child */ - + + /* reset envars to match this child */ if (ORTE_SUCCESS != (rc = setup_child(child, jobdat, &app->env))) { ORTE_ERROR_LOG(rc); + opal_condition_signal(&orte_odls_globals.cond); + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); goto CLEANUP; } - opal_output(0, "%s restarting app %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app); + + /* setup the path */ + if (ORTE_SUCCESS != (rc = setup_path(app))) { + ORTE_ERROR_LOG(rc); + opal_condition_signal(&orte_odls_globals.cond); + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + goto CLEANUP; + } + + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s restarting app %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app)); + + /* must unlock prior to fork to keep things clean in the + * event library + */ + opal_condition_signal(&orte_odls_globals.cond); + OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + rc = fork_local(app, child, app->env, jobdat); if (ORTE_SUCCESS == rc) { - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); orte_wait_cb(child->pid, odls_base_default_wait_local_proc, NULL); - OPAL_THREAD_LOCK(&orte_odls_globals.mutex); - } -CLEANUP: + CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, "%s odls:restart of proc %s %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(child->name), (ORTE_SUCCESS == rc) ? "succeeded" : "failed")); - opal_condition_signal(&orte_odls_globals.cond); - OPAL_THREAD_UNLOCK(&orte_odls_globals.mutex); + /* reset our working directory back to our default location - if we + * don't do this, then we will be looking for relative paths starting + * from the last wdir option specified by the user. Thus, we would + * be requiring that the user keep track on the cmd line of where + * each app was located relative to the prior app, instead of relative + * to their current location + */ + chdir(basedir); + return rc; } diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c index e4ce3ed495..43b320ad55 100644 --- a/orte/test/system/orte_spin.c +++ b/orte/test/system/orte_spin.c @@ -5,36 +5,24 @@ * A program that just spins - provides mechanism for testing user-driven * abnormal program termination */ -#include "opal_config.h" +#include "orte_config.h" +#include "orte/constants.h" #include -#include "opal/runtime/opal_progress.h" - +#include "orte/runtime/orte_globals.h" +#include "orte/util/name_fns.h" #include "orte/runtime/runtime.h" int main(int argc, char* argv[]) { - - int i; - double pi; - - orte_init(&argc, &argv, ORTE_PROC_NON_MPI); - - i = 0; - while (1) { - i++; - pi = i / 3.14159256; - if (i > 100) { - /* need to progress so we can - * wake up if our daemon goes - * away! - */ - opal_progress(); - /* reset the counter so we loop */ - i = 0; - } + if (ORTE_SUCCESS != orte_init(&argc, &argv, ORTE_PROC_NON_MPI)) { + fprintf(stderr, "ORTE_INIT FAILED\n"); + exit(1); } + opal_output(0, "%s RUNNING", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + + opal_event_dispatch(); orte_finalize();