From 7ce34223f1f62d2b55e3a1f6d359b6dac20192ec Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Wed, 12 May 2010 18:11:58 +0000 Subject: [PATCH] Per off-list discussion, implement the new OMPI exit status policy (soon to be on wiki) and further cleanup error reporting to cover new cases. Implement process migration when failed nodes are detected. Some testing still required This commit was SVN r23121. --- orte/include/orte/constants.h | 5 +- orte/mca/errmgr/hnp/errmgr_hnp.c | 406 ++++++++++++------- orte/mca/errmgr/orted/errmgr_orted.c | 42 ++ orte/mca/ess/hnp/ess_hnp_module.c | 20 +- orte/mca/sensor/file/sensor_file.c | 155 ++++--- orte/mca/sensor/heartbeat/sensor_heartbeat.c | 6 +- orte/mca/sensor/memusage/sensor_memusage.c | 76 ++-- orte/runtime/orte_globals.c | 4 + orte/runtime/orte_globals.h | 3 + orte/runtime/orte_mca_params.c | 11 + orte/tools/orterun/help-orterun.txt | 31 ++ orte/tools/orterun/orterun.c | 36 +- 12 files changed, 502 insertions(+), 293 deletions(-) diff --git a/orte/include/orte/constants.h b/orte/include/orte/constants.h index 1f0fbd5b07..e151adbb38 100644 --- a/orte/include/orte/constants.h +++ b/orte/include/orte/constants.h @@ -115,7 +115,10 @@ enum { ORTE_ERR_RELOCATE_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 32), ORTE_ERR_INVALID_NODE_RANK = (ORTE_ERR_BASE - 33), ORTE_ERR_INVALID_LOCAL_RANK = (ORTE_ERR_BASE - 34), - ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35) + ORTE_ERR_UNRECOVERABLE = (ORTE_ERR_BASE - 35), + ORTE_ERR_MEM_LIMIT_EXCEEDED = (ORTE_ERR_BASE - 36), + ORTE_ERR_HEARTBEAT_LOST = (ORTE_ERR_BASE - 37), + ORTE_ERR_PROC_STALLED = (ORTE_ERR_BASE - 38) }; #define ORTE_ERR_MAX (ORTE_ERR_BASE - 100) diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index e1af67e866..15f3c95243 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -36,6 +36,8 @@ #include "orte/mca/plm/plm.h" #include "orte/mca/rmaps/rmaps_types.h" #include "orte/mca/sensor/sensor.h" +#include "orte/mca/routed/routed.h" + #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" @@ -44,7 +46,8 @@ /* Local functions */ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); static void failed_start(orte_job_t *jdata); -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state); +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code); static void update_proc(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code); @@ -207,7 +210,7 @@ static int update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_RUNNING: /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING); + update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); break; case ORTE_JOB_STATE_NEVER_LAUNCHED: orte_never_launched = true; @@ -223,7 +226,9 @@ static int update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: /* update all procs in job */ - update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + update_local_procs_in_job(jdata, jobstate, + ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, + exit_code); /* order all local procs for this job to be killed */ killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ @@ -247,6 +252,18 @@ static int update_state(orte_jobid_t job, hnp_abort(jdata->jobid, exit_code); } break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; default: break; @@ -340,8 +357,39 @@ static int update_state(orte_jobid_t job, case ORTE_PROC_STATE_COMM_FAILED: /* is this to a daemon? */ - /* relocate its processes */ - /* attempt to restart? */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + if (orte_enable_recovery) { + /* relocate its processes */ + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc))) { + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + } + } else { + update_proc(jdata, proc, state, ORTE_ERR_COMM_FAILURE); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + } + } else { + /* delete the route */ + orte_routed.delete_route(proc); + } + break; + + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + /* heartbeats are only from daemons */ + if (orte_enable_recovery) { + /* relocate its processes */ + } else { + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + return ORTE_ERR_UNRECOVERABLE; + } break; default: @@ -459,7 +507,8 @@ static void failed_start(orte_job_t *jdata) ORTE_JOBID_PRINT(jdata->jobid))); } -static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state) +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code) { opal_list_item_t *item, *next; orte_odls_job_t *jobdat; @@ -493,6 +542,9 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta child->state = state; proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); proc->state = state; + if (proc->exit_code < exit_code) { + proc->exit_code = exit_code; + } if (ORTE_PROC_STATE_UNTERMINATED < state) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); @@ -599,14 +651,14 @@ static void check_job_complete(orte_job_t *jdata) orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; - orte_vpid_t non_zero=0; + orte_vpid_t non_zero=0, lowest=0; #if 0 /* Check if FileM is active. If so then keep processing. */ OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); #endif - for (i=0; i < jdata->procs->size; i++) { + for (i=0; i < jdata->procs->size && !jdata->abort; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { /* the proc array may no longer be left justified, so * we need to check everything @@ -616,123 +668,161 @@ static void check_job_complete(orte_job_t *jdata) if (0 != proc->exit_code) { non_zero++; + if (0 == lowest) { + lowest = proc->exit_code; + } } - /* - * Determine how the process state affects the job state - */ - if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr_hnp:check_job_completed proc %s failed to start", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_FAILED_TO_START; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - break; - } - } else if (ORTE_PROC_STATE_ABORTED == proc->state) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - break; - } - } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s aborted by signal", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - break; - } - } else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated without sync", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); - /* now treat a special case - if the proc exit'd without a required - * sync, it may have done so with a zero exit code. We want to ensure - * that the user realizes there was an error, so in this -one- case, - * we overwrite the process' exit code with the default error code + switch (proc->state) { + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such - just check the remaining jobs to + * see if anyone is still alive */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - break; - } - } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s killed by cmd", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - /* we ordered this proc to die, so it isn't an abnormal termination - * and we don't flag it as such - just check the remaining jobs to - * see if anyone is still alive - */ - if (jdata->num_terminated >= jdata->num_procs) { - /* this job has terminated - now we need to check to see if ALL - * the other jobs have also completed and wakeup if that is true - */ - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated - now we need to check to see if ALL + * the other jobs have also completed and wakeup if that is true + */ + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; + } } - } - goto CHECK_ALIVE; - } else if (ORTE_PROC_STATE_CALLED_ABORT == proc->state) { - if (!jdata->abort) { - jdata->state = ORTE_JOB_STATE_CALLED_ABORT; - /* point to the first proc to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + goto CHECK_ALIVE; break; - } - } else if (ORTE_PROC_STATE_UNTERMINATED < proc->state && - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc->name))); - if (!jdata->abort) { - proc->state = ORTE_PROC_STATE_ABORTED; - jdata->state = ORTE_JOB_STATE_ABORTED; - /* point to the lowest rank to cause the problem */ - jdata->aborted_proc = proc; - /* retain the object so it doesn't get free'd */ - OBJ_RETAIN(proc); - jdata->abort = true; - ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_FAILED_TO_START: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnp:check_job_completed proc %s failed to start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + /* now treat a special case - if the proc exit'd without a required + * sync, it may have done so with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + break; + case ORTE_PROC_STATE_COMM_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_CALLED_ABORT: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + + default: + if (ORTE_PROC_STATE_UNTERMINATED < proc->state && + jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + proc->state = ORTE_PROC_STATE_ABORTED; + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + } break; - } } } @@ -750,13 +840,20 @@ static void check_job_complete(orte_job_t *jdata) orte_sensor.stop(jdata->jobid); if (0 < non_zero) { + if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { + /* update the exit code */ + ORTE_UPDATE_EXIT_STATUS(lowest); + } + /* warn user */ opal_output(orte_clean_output, - "-----------------------------------------------------\n\n" - "While job %s terminated normally, %s processes returned\n" - "non-zero exit codes. Further examination may be required.\n\n" - "-----------------------------------------------------", - ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(non_zero)); + "-------------------------------------------------------\n" + "While %s job %s terminated normally, %s processes returned\n" + "non-zero exit codes. Further examination may be required.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + ORTE_VPID_PRINT(non_zero)); } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", @@ -921,6 +1018,13 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) orte_sensor.stop(job); } + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { + ORTE_ERROR_LOG(rc); + } + return; + } + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; @@ -935,10 +1039,11 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) { - orte_proc_t *pdata; - orte_node_t *node, *newnode; + orte_proc_t *pdata, *pdt; + orte_node_t *node; orte_app_context_t *app; - int rc; + orte_job_map_t *map; + int rc, i, n; /* get the proc_t object for this process */ pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); @@ -952,8 +1057,44 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) if (jdata->max_global_restarts < pdata->relocates) { return ORTE_ERR_RELOCATE_LIMIT_EXCEEDED; } - /* proc just died - save the node where this proc was located */ - node = pdata->node; + + /* if it is a daemon that died, we need to flag all of its procs + * to be relocated + */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + map = jdata->map; + for (n=0; n < map->nodes->size; n++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) { + continue; + } + if (node->daemon->name.vpid != proc->vpid) { + continue; + } + /* found the node - now flag the procs */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (ORTE_PROC_STATE_TERMINATED < pdt->state) { + continue; + } + /* if the proc hasn't already terminated, then mark + * it as aborted so it will be restarted + */ + pdt->state = ORTE_PROC_STATE_ABORTED; + } + /* mark the node as "down" */ + node->state = ORTE_NODE_STATE_DOWN; + /* remove it from the map */ + opal_pointer_array_set_item(map->nodes, n, NULL); + /* do a release to maintain accounting - won't actually + * remove the node object from memory + */ + OBJ_RELEASE(node); + break; + } + } + /* reset the job params for restart */ orte_plm_base_reset_job(jdata); @@ -969,13 +1110,6 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc) opal_output(0, "FAILED TO RESTART APP %s on error %s", app->app, ORTE_ERROR_NAME(rc)); return rc; } - /* get the new node */ - newnode = pdata->node; - /* report what we did */ - OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, - "%s Proc %s:%s aborted on node %s and was restarted on node %s\n\n", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - app->app, ORTE_NAME_PRINT(proc), node->name, newnode->name)); return ORTE_SUCCESS; } diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index e972cfc18d..8f098d1cc3 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -32,6 +32,8 @@ #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" #include "orte/mca/plm/plm_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/sensor/sensor.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" @@ -192,6 +194,17 @@ static int update_state(orte_jobid_t job, update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); /* order all local procs for this job to be killed */ killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* tell the caller we can't recover */ + return ORTE_ERR_UNRECOVERABLE; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + return ORTE_SUCCESS; + break; + default: break; } @@ -216,6 +229,23 @@ static int update_state(orte_jobid_t job, return rc; } + /* if this was a failed comm, then see if it was to our + * lifeline + */ + if (ORTE_PROC_STATE_COMM_FAILED == state) { + /* delete the route */ + orte_routed.delete_route(proc); + /* see is this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + /* kill our children */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* tell the caller we can't recover */ + return ORTE_ERR_UNRECOVERABLE; + } + /* if not, then indicate we can continue */ + return ORTE_SUCCESS; + } + /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); @@ -777,6 +807,18 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid) orte_proc_t proc; int rc; + /* stop local sensors for this job */ + if (ORTE_VPID_WILDCARD == vpid) { + orte_sensor.stop(job); + } + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { + ORTE_ERROR_LOG(rc); + } + return; + } + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; diff --git a/orte/mca/ess/hnp/ess_hnp_module.c b/orte/mca/ess/hnp/ess_hnp_module.c index 0fee37def1..39c0f461d9 100644 --- a/orte/mca/ess/hnp/ess_hnp_module.c +++ b/orte/mca/ess/hnp/ess_hnp_module.c @@ -694,19 +694,6 @@ static int rte_finalize(void) return ORTE_SUCCESS; } -/* - * For application procs, we do NOT call the regular - * C-library "abort" function, even though that would have - * alerted us to the fact that this is an abnormal termination, - * because it would automatically cause a core file to be - * generated. On large systems, that can be overwhelming - * (imagine a few thousand Gbyte-sized files hitting - * a shared file system simultaneously...ouch!). - * - * However, the HNP is only ONE process, so we can do it - * here as the core file might prove useful. Do so -only- - * if indicated by the report flag! - */ static void rte_abort(int status, bool report) { /* do NOT do a normal finalize as this will very likely @@ -726,12 +713,7 @@ static void rte_abort(int status, bool report) */ orte_proc_info_finalize(); - /* Now exit/abort */ - if (report) { - abort(); - } - - /* otherwise, just exit */ + /* just exit */ exit(status); } diff --git a/orte/mca/sensor/file/sensor_file.c b/orte/mca/sensor/file/sensor_file.c index 0fccb5fd64..018392bf89 100644 --- a/orte/mca/sensor/file/sensor_file.c +++ b/orte/mca/sensor/file/sensor_file.c @@ -140,7 +140,7 @@ static void start(orte_jobid_t jobid) file_tracker_t *ft; /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { return; } @@ -150,95 +150,83 @@ static void start(orte_jobid_t jobid) ORTE_JOBID_PRINT(jobid))); /* get the local jobdat for this job */ - jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_end(&orte_local_jobdata)) { jobdat = (orte_odls_job_t*)item; - if (jobid == jobdat->jobid) { - break; - } - } - if (NULL == jobdat) { - /* no local procs for this job */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s sensor:file no local procs for job %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - return; - } - - /* must be at least one app_context, so use the first */ - if (NULL == (app = jobdat->apps[0])) { - /* got a problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return; - } - - /* search the environ to get the filename */ - if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) { - /* was a default file given */ - if (NULL == mca_sensor_file_component.file) { - /* can't do anything without a file */ + if (jobid == jobdat->jobid || ORTE_JOBID_WILDCARD == jobid) { + /* must be at least one app_context, so use the first */ + if (NULL == (app = jobdat->apps[0])) { + /* got a problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + + /* search the environ to get the filename */ + if (ORTE_SUCCESS != (rc = mca_base_param_find_string(c, "filename", app->env, &filename))) { + /* was a default file given */ + if (NULL == mca_sensor_file_component.file) { + /* can't do anything without a file */ + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s sensor:file no file for job %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid))); + continue; + } + filename = mca_sensor_file_component.file; + } + + /* create the tracking object */ + ft = OBJ_NEW(file_tracker_t); + ft->jobid = jobid; + ft->file = strdup(filename); + + /* search the environ to see what we are checking */ + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_size) { + ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); + } + } else { + ft->check_size = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_access) { + ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); + } + } else { + ft->check_access = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_file_component.check_mod) { + ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); + } + } else { + ft->check_mod = OPAL_INT_TO_BOOL(tmp); + } + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) { + ft->limit = mca_sensor_file_component.limit; + } else { + ft->limit = tmp; + } + opal_list_append(&jobs, &ft->super); OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s sensor:file no file for job %s", + "%s file %s monitored for %s%s%s with limit %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - return; + ft->file, ft->check_size ? "SIZE:" : " ", + ft->check_access ? "ACCESS TIME:" : " ", + ft->check_mod ? "MOD TIME" : " ", ft->limit)); } - filename = mca_sensor_file_component.file; } - /* create the tracking object */ - ft = OBJ_NEW(file_tracker_t); - ft->jobid = jobid; - ft->file = strdup(filename); - - /* search the environ to see what we are checking */ - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_size", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_size) { - ft->check_size = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_size); - } - } else { - ft->check_size = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_access", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_access) { - ft->check_access = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_access); - } - } else { - ft->check_access = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "check_mod", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_file_component.check_mod) { - ft->check_mod = OPAL_INT_TO_BOOL(mca_sensor_file_component.check_mod); - } - } else { - ft->check_mod = OPAL_INT_TO_BOOL(tmp); - } - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "limit", app->env, &tmp))) { - ft->limit = mca_sensor_file_component.limit; - } else { - ft->limit = tmp; - } - opal_list_append(&jobs, &ft->super); - - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s file %s monitored for %s%s%s with limit %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ft->file, ft->check_size ? "SIZE:" : " ", - ft->check_access ? "ACCESS TIME:" : " ", - ft->check_mod ? "MOD TIME" : " ", ft->limit)); - /* start sampling */ - if (NULL == sample_ev) { + if (NULL == sample_ev && !opal_list_is_empty(&jobs)) { /* startup a timer to wake us up periodically * for a data sample */ @@ -258,7 +246,7 @@ static void stop(orte_jobid_t jobid) file_tracker_t *ft; /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { return; } @@ -266,10 +254,9 @@ static void stop(orte_jobid_t jobid) item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { ft = (file_tracker_t*)item; - if (jobid == ft->jobid) { + if (jobid == ft->jobid || ORTE_JOBID_WILDCARD == jobid) { opal_list_remove_item(&jobs, item); OBJ_RELEASE(item); - break; } } /* if no jobs remain, stop the sampling */ @@ -355,7 +342,7 @@ static void sample(int fd, short event, void *arg) ft->file, ft->file_size, ctime(&ft->last_access), ctime(&ft->last_mod)); orte_errmgr.update_state(ft->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED, NULL, ORTE_PROC_STATE_UNDEF, - ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ERR_PROC_STALLED); } } diff --git a/orte/mca/sensor/heartbeat/sensor_heartbeat.c b/orte/mca/sensor/heartbeat/sensor_heartbeat.c index fbc4cebd1c..a9851de8e3 100644 --- a/orte/mca/sensor/heartbeat/sensor_heartbeat.c +++ b/orte/mca/sensor/heartbeat/sensor_heartbeat.c @@ -157,7 +157,7 @@ static void start(orte_jobid_t jobid) { uint64_t time; - if (jobid != ORTE_PROC_MY_NAME->jobid) { + if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { /* heartbeats are only for daemons and HNPs */ return; } @@ -185,7 +185,7 @@ static void start(orte_jobid_t jobid) static void stop(orte_jobid_t jobid) { - if (jobid != ORTE_PROC_MY_NAME->jobid) { + if (jobid != ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { /* heartbeats are only for daemons and HNPs */ return; } @@ -285,7 +285,7 @@ static void check_heartbeat(int fd, short dummy, void *arg) name.vpid = v; orte_errmgr.update_state(ORTE_PROC_MY_NAME->jobid, ORTE_JOB_STATE_HEARTBEAT_FAILED, &name, ORTE_PROC_STATE_HEARTBEAT_FAILED, - ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ERR_HEARTBEAT_LOST); } } } diff --git a/orte/mca/sensor/memusage/sensor_memusage.c b/orte/mca/sensor/memusage/sensor_memusage.c index aca504b8fc..cb551e38b2 100644 --- a/orte/mca/sensor/memusage/sensor_memusage.c +++ b/orte/mca/sensor/memusage/sensor_memusage.c @@ -110,7 +110,7 @@ static void start(orte_jobid_t jobid) int rc, tmp; /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { return; } @@ -120,50 +120,43 @@ static void start(orte_jobid_t jobid) ORTE_JOBID_PRINT(jobid))); /* get the local jobdat for this job */ - jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_end(&orte_local_jobdata)) { jobdat = (orte_odls_job_t*)item; - if (jobid == jobdat->jobid) { - break; + if (jobid == jobdat->jobid || ORTE_JOBID_WILDCARD == jobid) { + /* must be at least one app_context, so use the first */ + if (NULL == (app = jobdat->apps[0])) { + /* got a problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + + /* search the environ to get memory limit */ + tmp = 0; + if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "memory_limit", app->env, &tmp))) { + /* was a default value given */ + if (0 < mca_sensor_memusage_component.memory_limit) { + tmp = mca_sensor_memusage_component.memory_limit; + } + } + if (tmp <= 0) { + /* we don't want to monitor this job */ + OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, + "%s memory monitoring for job %s is not requested", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobid))); + continue; + } + + job = OBJ_NEW(memusage_tracker_t); + job->jobid = jobid; + job->memory_limit = tmp; + opal_list_append(&jobs, &job->super); } } - if (NULL == jobdat) { - /* no local procs for this job */ - return; - } - /* must be at least one app_context, so use the first */ - if (NULL == (app = jobdat->apps[0])) { - /* got a problem */ - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return; - } - - /* search the environ to get memory limit */ - tmp = 0; - if (ORTE_SUCCESS != (rc = mca_base_param_find_int(c, "memory_limit", app->env, &tmp))) { - /* was a default value given */ - if (0 < mca_sensor_memusage_component.memory_limit) { - tmp = mca_sensor_memusage_component.memory_limit; - } - } - if (tmp <= 0) { - /* we don't want to monitor this job */ - OPAL_OUTPUT_VERBOSE((1, orte_sensor_base.output, - "%s memory monitoring for job %s is not requested", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jobid))); - return; - } - - job = OBJ_NEW(memusage_tracker_t); - job->jobid = jobid; - job->memory_limit = tmp; - opal_list_append(&jobs, &job->super); - - if (NULL == sample_ev) { + if (NULL == sample_ev && !opal_list_is_empty(&jobs)) { /* startup a timer to wake us up periodically * for a data sample */ @@ -183,7 +176,7 @@ static void stop(orte_jobid_t jobid) memusage_tracker_t *job; /* cannot monitor my own job */ - if (jobid == ORTE_PROC_MY_NAME->jobid) { + if (jobid == ORTE_PROC_MY_NAME->jobid && ORTE_JOBID_WILDCARD != jobid) { return; } @@ -191,10 +184,9 @@ static void stop(orte_jobid_t jobid) item != opal_list_get_end(&jobs); item = opal_list_get_next(item)) { job = (memusage_tracker_t*)item; - if (jobid == job->jobid) { + if (jobid == job->jobid || ORTE_JOBID_WILDCARD == jobid) { opal_list_remove_item(&jobs, item); OBJ_RELEASE(item); - break; } } /* if no jobs remain, stop the sampling */ @@ -264,7 +256,7 @@ static void sample(int fd, short event, void *arg) (unsigned long)stats.vsize/1000000, (unsigned long)job->memory_limit); orte_errmgr.update_state(child->name->jobid, ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED, child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, - ORTE_ERROR_DEFAULT_EXIT_CODE); + ORTE_ERR_MEM_LIMIT_EXCEEDED); } OBJ_DESTRUCT(&stats); } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 3f19bd45b5..bc7fa7f7df 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -181,6 +181,10 @@ int32_t orte_max_local_restarts; /* comm fn for updating state */ orte_default_comm_fn_t orte_comm; +/* exit status reporting */ +bool orte_report_child_jobs_separately; +struct timeval orte_child_time_to_exit; + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 5c8947160b..df86e94095 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -688,6 +688,9 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient, opal_buffer_t *buf, orte_rml_tag_t tag, orte_default_cbfunc_t cbfunc); +/* exit status reporting */ +ORTE_DECLSPEC extern bool orte_report_child_jobs_separately; +ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit; #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 7c39820c8a..f4f9d6ef69 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -508,7 +508,18 @@ int orte_register_params(void) orte_enable_recovery = true; } + mca_base_param_reg_int_name("orte", "report_child_jobs_separately", + "Return the exit status of the primary job only", + false, false, + (int)false, &value); + orte_report_child_jobs_separately = OPAL_INT_TO_BOOL(value); + mca_base_param_reg_int_name("orte", "child_time_to_exit", + "Max time a spawned child job is allowed to run after the primary job has terminated (seconds)", + false, false, + INT_MAX, &value); + orte_child_time_to_exit.tv_sec = value; + orte_child_time_to_exit.tv_usec = 0; #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index 086756571b..34384497b8 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -562,3 +562,34 @@ A request was made to bind processes, but process affinity is not supported on this node: Local host: %s +# +[orterun:proc-comm-failed] +A critical communication path was lost to: + + Process name: %s + Node: %s +# +[orterun:proc-mem-exceeded] +A process exceeded memory limits: + + Process name: %s + Node: %s +# +[orterun:proc-stalled] +One or more processes appear to have stalled - a monitored file +failed to show the required activity. +# +[orterun:proc-sensor-exceeded] +One or more processes have exceeded a specified sensor limit, but +no further info is available. +# +[orterun:proc-called-abort] +%s detected that one or more processes called %s_abort, thus causing +the job to be terminated. +# +[orterun:proc-heartbeat-failed] +%s failed to receive scheduled heartbeat communications from a remote process: + + Process name: %s + Node: %s + diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index f04f6495f6..2f3f3d6364 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -152,6 +152,10 @@ static opal_cmd_line_init_t cmd_line_init[] = { &orterun_globals.report_uri, OPAL_CMD_LINE_TYPE_STRING, "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + /* exit status reporting */ + { "orte", "report", "child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, + NULL, OPAL_CMD_LINE_TYPE_BOOL, + "Return the exit status of the primary job only" }, /* hetero apps */ { "orte", "hetero", "apps", '\0', NULL, "hetero", 0, @@ -979,7 +983,6 @@ static void dump_aborted_procs(void) orte_app_context_t *app, *approc; orte_job_t *job; orte_node_t *node; - bool found=false; /* find the job that caused the problem - be sure to start the loop * at 1 as the daemons are in 0 and will clearly be "running", so no @@ -1000,8 +1003,6 @@ static void dump_aborted_procs(void) proc = job->aborted_proc; /* always must be at least one app */ app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, 0); - /* flag that we found at least one job that failed */ - found = true; /* cycle through and count the number that were killed or aborted */ for (i=0; i < job->procs->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) { @@ -1184,15 +1185,34 @@ static void dump_aborted_procs(void) orterun_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid, node->name, orterun_basename, orterun_basename); } + } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true, + ORTE_NAME_PRINT(&proc->name), node->name); + } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) { + switch (proc->exit_code) { + case ORTE_ERR_MEM_LIMIT_EXCEEDED: + orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true, + ORTE_NAME_PRINT(&proc->name), node->name); + break; + case ORTE_ERR_PROC_STALLED: + orte_show_help("help-orterun.txt", "orterun:proc-stalled", true); + break; + + default: + orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true); + break; + } + } else if (ORTE_JOB_STATE_CALLED_ABORT == job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-called-abort", true, + orterun_basename, + (0 == strncmp("orte", orterun_basename, 4)) ? "orte" : "MPI"); + } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) { + orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true, + orterun_basename, ORTE_NAME_PRINT(&proc->name), node->name); } return; } } - - /* if we got here, then we couldn't find the job that aborted - - * report that fact and give up - */ - orte_show_help("help-orterun.txt", "orterun:proc-aborted-unknown", true, orterun_basename); } static void abort_exit_callback(int fd, short ign, void *arg)