diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index 0a4e283166..764d3338d9 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -75,7 +75,7 @@ void orte_errmgr_base_abort() orte_wait_kill(9); /* abnormal exit */ - exit(-1); + orte_abort(-1, NULL); } int orte_errmgr_base_register_job(orte_jobid_t job) diff --git a/orte/mca/pls/poe/pls_poe_module.c b/orte/mca/pls/poe/pls_poe_module.c index 0df605ed93..b7b021f4bb 100644 --- a/orte/mca/pls/poe/pls_poe_module.c +++ b/orte/mca/pls/poe/pls_poe_module.c @@ -224,7 +224,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid) rc = orte_ns.get_proc_name_string(&name_string, name); if(ORTE_SUCCESS != rc) { opal_output(0, "orte_pls_poe: unable to create process name"); - exit(-1); + return rc; } argv[proc_name_index] = name_string; for(i=0;i orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n"); OBJ_DESTRUCT(&buffer); - exit(1); + orte_abort(1, NULL); } OBJ_DESTRUCT(&buffer); @@ -355,7 +355,7 @@ int main(int argc, char *argv[]) if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) { fprintf(stderr, "orteprobe: failed to create unique universe name"); - exit(1); + orte_abort(1, NULL); } } @@ -363,7 +363,7 @@ int main(int argc, char *argv[]) /* setup the pipe to get the contact info back */ if (pipe(orted_pipe)) { fprintf (stderr, "orteprobe: Pipe failed\n"); - exit(1); + orte_abort(1, NULL); } /* get name of orted application - just in case user specified something different */ @@ -378,7 +378,7 @@ int main(int argc, char *argv[]) ortedargc = opal_argv_count(ortedargv); if (ortedargc <= 0) { fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n"); - exit(1); + orte_abort(1, NULL); } /* setup the path */ @@ -415,11 +415,11 @@ int main(int argc, char *argv[]) Close read end first. */ execv(path, ortedargv); fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno); - exit(1); + orte_abort(1, NULL); } else if (pid < (pid_t) 0) { /* The fork failed. */ fprintf (stderr, "orteprobe: Fork failed\n"); - exit(1); + orte_abort(1, NULL); } else { /* This is the parent process. Close write end first. */ @@ -437,13 +437,13 @@ int main(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) { fprintf(stderr, "orteprobe: failed to pack daemon uri\n"); - exit(1); + orte_abort(1, NULL); } if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n"); OBJ_DESTRUCT(&buffer); - exit(1); + orte_abort(1, NULL); } OBJ_DESTRUCT(&buffer); diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index 8458c60ac7..39ce0a3f5b 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -81,4 +81,8 @@ in the environment. Returned value %d instead of ORTE_SUCCESS. [orterun:proc-aborted] %s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d. [orterun:abnormal-exit] -%s encountered an abnormal exit. +WARNING: %s encountered an abnormal exit. + +This means that %s exited before it received notification that all +started processes had terminated. You should double check and ensure +that there are no runaway processes still executing. diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index cbf85d459f..87fd0b06a7 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -565,7 +565,12 @@ static void exit_callback(int fd, short event, void *arg) { opal_show_help("help-orterun.txt", "orterun:abnormal-exit", true, orterun_basename); - exit(1); + + /* Trigger the normal exit conditions */ + + orterun_globals.exit = true; + orterun_globals.exit_status = 1; + opal_condition_signal(&orterun_globals.cond); }