1
1

- Change a few calls from exit() to orte_abort() so that we get
  session directory cleanup (among other things)
- When we get an abnormal exit in orterun (i.e., timeout expires and
  we haven't gotten termination notices from all processes), print a
  better message and exit in a better way (which includes session
  directory cleanup)
- Fix tm and poe pls's to not exit() but rather propagate the error up
  the stack (where relevant)

This commit was SVN r7058.
Этот коммит содержится в:
Jeff Squyres 2005-08-26 20:36:11 +00:00
родитель 2a9ab3eb10
Коммит b3bd549331
6 изменённых файлов: 23 добавлений и 14 удалений

Просмотреть файл

@ -75,7 +75,7 @@ void orte_errmgr_base_abort()
orte_wait_kill(9);
/* abnormal exit */
exit(-1);
orte_abort(-1, NULL);
}
int orte_errmgr_base_register_job(orte_jobid_t job)

Просмотреть файл

@ -224,7 +224,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name);
if(ORTE_SUCCESS != rc) {
opal_output(0, "orte_pls_poe: unable to create process name");
exit(-1);
return rc;
}
argv[proc_name_index] = name_string;
for(i=0;i<argc;i++) {

Просмотреть файл

@ -239,7 +239,7 @@ pls_tm_launch(orte_jobid_t jobid)
rc = orte_ns.get_proc_name_string(&name_string, name);
if (ORTE_SUCCESS != rc) {
opal_output(0, "pls:tm: unable to create process name");
exit(-1);
return rc;
}
argv[proc_name_index] = name_string;

Просмотреть файл

@ -322,13 +322,13 @@ int main(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
exit(1);
orte_abort(1, NULL);
}
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
OBJ_DESTRUCT(&buffer);
exit(1);
orte_abort(1, NULL);
}
OBJ_DESTRUCT(&buffer);
@ -355,7 +355,7 @@ int main(int argc, char *argv[])
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
fprintf(stderr, "orteprobe: failed to create unique universe name");
exit(1);
orte_abort(1, NULL);
}
}
@ -363,7 +363,7 @@ int main(int argc, char *argv[])
/* setup the pipe to get the contact info back */
if (pipe(orted_pipe)) {
fprintf (stderr, "orteprobe: Pipe failed\n");
exit(1);
orte_abort(1, NULL);
}
/* get name of orted application - just in case user specified something different */
@ -378,7 +378,7 @@ int main(int argc, char *argv[])
ortedargc = opal_argv_count(ortedargv);
if (ortedargc <= 0) {
fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
exit(1);
orte_abort(1, NULL);
}
/* setup the path */
@ -415,11 +415,11 @@ int main(int argc, char *argv[])
Close read end first. */
execv(path, ortedargv);
fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
exit(1);
orte_abort(1, NULL);
} else if (pid < (pid_t) 0) {
/* The fork failed. */
fprintf (stderr, "orteprobe: Fork failed\n");
exit(1);
orte_abort(1, NULL);
} else {
/* This is the parent process.
Close write end first. */
@ -437,13 +437,13 @@ int main(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
exit(1);
orte_abort(1, NULL);
}
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
OBJ_DESTRUCT(&buffer);
exit(1);
orte_abort(1, NULL);
}
OBJ_DESTRUCT(&buffer);

Просмотреть файл

@ -81,4 +81,8 @@ in the environment. Returned value %d instead of ORTE_SUCCESS.
[orterun:proc-aborted]
%s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d.
[orterun:abnormal-exit]
%s encountered an abnormal exit.
WARNING: %s encountered an abnormal exit.
This means that %s exited before it received notification that all
started processes had terminated. You should double check and ensure
that there are no runaway processes still executing.

Просмотреть файл

@ -565,7 +565,12 @@ static void exit_callback(int fd, short event, void *arg)
{
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
true, orterun_basename);
exit(1);
/* Trigger the normal exit conditions */
orterun_globals.exit = true;
orterun_globals.exit_status = 1;
opal_condition_signal(&orterun_globals.cond);
}