- Change a few calls from exit() to orte_abort() so that we get
  session directory cleanup (among other things).
- When we get an abnormal exit in orterun (i.e., the timeout expires and we
  haven't gotten termination notices from all processes), print a better
  message and exit in a better way (which includes session directory cleanup).
- Fix the tm and poe pls components so that they do not call exit() but
  rather propagate the error up the stack (where relevant).

This commit was SVN r7058.
Этот коммит содержится в:
родитель
2a9ab3eb10
Коммит
b3bd549331
@ -75,7 +75,7 @@ void orte_errmgr_base_abort()
|
||||
orte_wait_kill(9);
|
||||
|
||||
/* abnormal exit */
|
||||
exit(-1);
|
||||
orte_abort(-1, NULL);
|
||||
}
|
||||
|
||||
int orte_errmgr_base_register_job(orte_jobid_t job)
|
||||
|
@ -224,7 +224,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
||||
rc = orte_ns.get_proc_name_string(&name_string, name);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "orte_pls_poe: unable to create process name");
|
||||
exit(-1);
|
||||
return rc;
|
||||
}
|
||||
argv[proc_name_index] = name_string;
|
||||
for(i=0;i<argc;i++) {
|
||||
|
@ -239,7 +239,7 @@ pls_tm_launch(orte_jobid_t jobid)
|
||||
rc = orte_ns.get_proc_name_string(&name_string, name);
|
||||
if (ORTE_SUCCESS != rc) {
|
||||
opal_output(0, "pls:tm: unable to create process name");
|
||||
exit(-1);
|
||||
return rc;
|
||||
}
|
||||
argv[proc_name_index] = name_string;
|
||||
|
||||
|
@ -322,13 +322,13 @@ int main(int argc, char *argv[])
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
|
||||
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
||||
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
@ -355,7 +355,7 @@ int main(int argc, char *argv[])
|
||||
|
||||
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
|
||||
fprintf(stderr, "orteprobe: failed to create unique universe name");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
@ -363,7 +363,7 @@ int main(int argc, char *argv[])
|
||||
/* setup the pipe to get the contact info back */
|
||||
if (pipe(orted_pipe)) {
|
||||
fprintf (stderr, "orteprobe: Pipe failed\n");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
/* get name of orted application - just in case user specified something different */
|
||||
@ -378,7 +378,7 @@ int main(int argc, char *argv[])
|
||||
ortedargc = opal_argv_count(ortedargv);
|
||||
if (ortedargc <= 0) {
|
||||
fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
/* setup the path */
|
||||
@ -415,11 +415,11 @@ int main(int argc, char *argv[])
|
||||
Close read end first. */
|
||||
execv(path, ortedargv);
|
||||
fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
} else if (pid < (pid_t) 0) {
|
||||
/* The fork failed. */
|
||||
fprintf (stderr, "orteprobe: Fork failed\n");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
} else {
|
||||
/* This is the parent process.
|
||||
Close write end first. */
|
||||
@ -437,13 +437,13 @@ int main(int argc, char *argv[])
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
|
||||
fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
||||
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
exit(1);
|
||||
orte_abort(1, NULL);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&buffer);
|
||||
|
@ -81,4 +81,8 @@ in the environment. Returned value %d instead of ORTE_SUCCESS.
|
||||
[orterun:proc-aborted]
|
||||
%s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d.
|
||||
[orterun:abnormal-exit]
|
||||
%s encountered an abnormal exit.
|
||||
WARNING: %s encountered an abnormal exit.
|
||||
|
||||
This means that %s exited before it received notification that all
|
||||
started processes had terminated. You should double check and ensure
|
||||
that there are no runaway processes still executing.
|
||||
|
@ -565,7 +565,12 @@ static void exit_callback(int fd, short event, void *arg)
|
||||
{
|
||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||
true, orterun_basename);
|
||||
exit(1);
|
||||
|
||||
/* Trigger the normal exit conditions */
|
||||
|
||||
orterun_globals.exit = true;
|
||||
orterun_globals.exit_status = 1;
|
||||
opal_condition_signal(&orterun_globals.cond);
|
||||
}
|
||||
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user