1
1

- Change a few calls from exit() to orte_abort() so that we get
  session directory cleanup (among other things)
- When we get an abnormal exit in orterun (i.e., timeout expires and
  we haven't gotten termination notices from all processes), print a
  better message and exit in a better way (which includes session
  directory cleanup)
- Fix tm and poe pls's to not exit() but rather propagate the error up
  the stack (where relevant)

This commit was SVN r7058.
Этот коммит содержится в:
Jeff Squyres 2005-08-26 20:36:11 +00:00
родитель 2a9ab3eb10
Коммит b3bd549331
6 изменённых файлов: 23 добавлений и 14 удалений

Просмотреть файл

@ -75,7 +75,7 @@ void orte_errmgr_base_abort()
orte_wait_kill(9); orte_wait_kill(9);
/* abnormal exit */ /* abnormal exit */
exit(-1); orte_abort(-1, NULL);
} }
int orte_errmgr_base_register_job(orte_jobid_t job) int orte_errmgr_base_register_job(orte_jobid_t job)

Просмотреть файл

@ -224,7 +224,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name); rc = orte_ns.get_proc_name_string(&name_string, name);
if(ORTE_SUCCESS != rc) { if(ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_poe: unable to create process name"); opal_output(0, "orte_pls_poe: unable to create process name");
exit(-1); return rc;
} }
argv[proc_name_index] = name_string; argv[proc_name_index] = name_string;
for(i=0;i<argc;i++) { for(i=0;i<argc;i++) {

Просмотреть файл

@ -239,7 +239,7 @@ pls_tm_launch(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name); rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) { if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name"); opal_output(0, "pls:tm: unable to create process name");
exit(-1); return rc;
} }
argv[proc_name_index] = name_string; argv[proc_name_index] = name_string;

Просмотреть файл

@ -322,13 +322,13 @@ int main(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) { if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n"); fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
exit(1); orte_abort(1, NULL);
} }
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n"); fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
OBJ_DESTRUCT(&buffer); OBJ_DESTRUCT(&buffer);
exit(1); orte_abort(1, NULL);
} }
OBJ_DESTRUCT(&buffer); OBJ_DESTRUCT(&buffer);
@ -355,7 +355,7 @@ int main(int argc, char *argv[])
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) { if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
fprintf(stderr, "orteprobe: failed to create unique universe name"); fprintf(stderr, "orteprobe: failed to create unique universe name");
exit(1); orte_abort(1, NULL);
} }
} }
@ -363,7 +363,7 @@ int main(int argc, char *argv[])
/* setup the pipe to get the contact info back */ /* setup the pipe to get the contact info back */
if (pipe(orted_pipe)) { if (pipe(orted_pipe)) {
fprintf (stderr, "orteprobe: Pipe failed\n"); fprintf (stderr, "orteprobe: Pipe failed\n");
exit(1); orte_abort(1, NULL);
} }
/* get name of orted application - just in case user specified something different */ /* get name of orted application - just in case user specified something different */
@ -378,7 +378,7 @@ int main(int argc, char *argv[])
ortedargc = opal_argv_count(ortedargv); ortedargc = opal_argv_count(ortedargv);
if (ortedargc <= 0) { if (ortedargc <= 0) {
fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n"); fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
exit(1); orte_abort(1, NULL);
} }
/* setup the path */ /* setup the path */
@ -415,11 +415,11 @@ int main(int argc, char *argv[])
Close read end first. */ Close read end first. */
execv(path, ortedargv); execv(path, ortedargv);
fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno); fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
exit(1); orte_abort(1, NULL);
} else if (pid < (pid_t) 0) { } else if (pid < (pid_t) 0) {
/* The fork failed. */ /* The fork failed. */
fprintf (stderr, "orteprobe: Fork failed\n"); fprintf (stderr, "orteprobe: Fork failed\n");
exit(1); orte_abort(1, NULL);
} else { } else {
/* This is the parent process. /* This is the parent process.
Close write end first. */ Close write end first. */
@ -437,13 +437,13 @@ int main(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) { if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack daemon uri\n"); fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
exit(1); orte_abort(1, NULL);
} }
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) { if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n"); fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
OBJ_DESTRUCT(&buffer); OBJ_DESTRUCT(&buffer);
exit(1); orte_abort(1, NULL);
} }
OBJ_DESTRUCT(&buffer); OBJ_DESTRUCT(&buffer);

Просмотреть файл

@ -81,4 +81,8 @@ in the environment. Returned value %d instead of ORTE_SUCCESS.
[orterun:proc-aborted] [orterun:proc-aborted]
%s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d. %s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d.
[orterun:abnormal-exit] [orterun:abnormal-exit]
%s encountered an abnormal exit. WARNING: %s encountered an abnormal exit.
This means that %s exited before it received notification that all
started processes had terminated. You should double check and ensure
that there are no runaway processes still executing.

Просмотреть файл

@ -565,7 +565,12 @@ static void exit_callback(int fd, short event, void *arg)
{ {
opal_show_help("help-orterun.txt", "orterun:abnormal-exit", opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename); true, orterun_basename);
exit(1);
/* Trigger the normal exit conditions */
orterun_globals.exit = true;
orterun_globals.exit_status = 1;
opal_condition_signal(&orterun_globals.cond);
} }