- Change a few calls from exit() to orte_abort() so that we get
  session directory cleanup (among other things)
- When we get an abnormal exit in orterun (i.e., timeout expires and we haven't gotten termination notices from all processes), print a better message and exit in a better way (which includes session directory cleanup)
- Fix tm and poe pls's to not exit() but rather propagate the error up the stack (where relevant)

This commit was SVN r7058.
Этот коммит содержится в:
родитель
2a9ab3eb10
Коммит
b3bd549331
@ -75,7 +75,7 @@ void orte_errmgr_base_abort()
|
|||||||
orte_wait_kill(9);
|
orte_wait_kill(9);
|
||||||
|
|
||||||
/* abnormal exit */
|
/* abnormal exit */
|
||||||
exit(-1);
|
orte_abort(-1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
int orte_errmgr_base_register_job(orte_jobid_t job)
|
int orte_errmgr_base_register_job(orte_jobid_t job)
|
||||||
|
@ -224,7 +224,7 @@ int pls_poe_launch_interactive_orted(orte_jobid_t jobid)
|
|||||||
rc = orte_ns.get_proc_name_string(&name_string, name);
|
rc = orte_ns.get_proc_name_string(&name_string, name);
|
||||||
if(ORTE_SUCCESS != rc) {
|
if(ORTE_SUCCESS != rc) {
|
||||||
opal_output(0, "orte_pls_poe: unable to create process name");
|
opal_output(0, "orte_pls_poe: unable to create process name");
|
||||||
exit(-1);
|
return rc;
|
||||||
}
|
}
|
||||||
argv[proc_name_index] = name_string;
|
argv[proc_name_index] = name_string;
|
||||||
for(i=0;i<argc;i++) {
|
for(i=0;i<argc;i++) {
|
||||||
|
@ -239,7 +239,7 @@ pls_tm_launch(orte_jobid_t jobid)
|
|||||||
rc = orte_ns.get_proc_name_string(&name_string, name);
|
rc = orte_ns.get_proc_name_string(&name_string, name);
|
||||||
if (ORTE_SUCCESS != rc) {
|
if (ORTE_SUCCESS != rc) {
|
||||||
opal_output(0, "pls:tm: unable to create process name");
|
opal_output(0, "pls:tm: unable to create process name");
|
||||||
exit(-1);
|
return rc;
|
||||||
}
|
}
|
||||||
argv[proc_name_index] = name_string;
|
argv[proc_name_index] = name_string;
|
||||||
|
|
||||||
|
@ -322,13 +322,13 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
|
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr, 1, ORTE_STRING))) {
|
||||||
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
|
fprintf(stderr, "orteprobe: failed to pack contact info for existing universe\n");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
||||||
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
|
fprintf(stderr, "orteprobe: comm failure when sending contact info for existing univ back to requestor\n");
|
||||||
OBJ_DESTRUCT(&buffer);
|
OBJ_DESTRUCT(&buffer);
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_DESTRUCT(&buffer);
|
OBJ_DESTRUCT(&buffer);
|
||||||
@ -355,7 +355,7 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
|
if (0 > asprintf(&orte_universe_info.name, "%s-%d", universe, pid)) {
|
||||||
fprintf(stderr, "orteprobe: failed to create unique universe name");
|
fprintf(stderr, "orteprobe: failed to create unique universe name");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -363,7 +363,7 @@ int main(int argc, char *argv[])
|
|||||||
/* setup the pipe to get the contact info back */
|
/* setup the pipe to get the contact info back */
|
||||||
if (pipe(orted_pipe)) {
|
if (pipe(orted_pipe)) {
|
||||||
fprintf (stderr, "orteprobe: Pipe failed\n");
|
fprintf (stderr, "orteprobe: Pipe failed\n");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get name of orted application - just in case user specified something different */
|
/* get name of orted application - just in case user specified something different */
|
||||||
@ -378,7 +378,7 @@ int main(int argc, char *argv[])
|
|||||||
ortedargc = opal_argv_count(ortedargv);
|
ortedargc = opal_argv_count(ortedargv);
|
||||||
if (ortedargc <= 0) {
|
if (ortedargc <= 0) {
|
||||||
fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
|
fprintf(stderr, "orteprobe: could not initialize argv array for daemon\n");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the path */
|
/* setup the path */
|
||||||
@ -415,11 +415,11 @@ int main(int argc, char *argv[])
|
|||||||
Close read end first. */
|
Close read end first. */
|
||||||
execv(path, ortedargv);
|
execv(path, ortedargv);
|
||||||
fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
|
fprintf(stderr, "orteprobe: execv failed with errno=%d\n", errno);
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
} else if (pid < (pid_t) 0) {
|
} else if (pid < (pid_t) 0) {
|
||||||
/* The fork failed. */
|
/* The fork failed. */
|
||||||
fprintf (stderr, "orteprobe: Fork failed\n");
|
fprintf (stderr, "orteprobe: Fork failed\n");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
} else {
|
} else {
|
||||||
/* This is the parent process.
|
/* This is the parent process.
|
||||||
Close write end first. */
|
Close write end first. */
|
||||||
@ -437,13 +437,13 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
|
if (ORTE_SUCCESS != (ret = orte_dps.pack(&buffer, &orted_uri_ptr[0], 1, ORTE_STRING))) {
|
||||||
fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
|
fprintf(stderr, "orteprobe: failed to pack daemon uri\n");
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
if (0 > orte_rml.send_buffer(&requestor, &buffer, ORTE_RML_TAG_PROBE, 0)) {
|
||||||
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
|
fprintf(stderr, "orteprobe: could not send daemon uri info back to probe\n");
|
||||||
OBJ_DESTRUCT(&buffer);
|
OBJ_DESTRUCT(&buffer);
|
||||||
exit(1);
|
orte_abort(1, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_DESTRUCT(&buffer);
|
OBJ_DESTRUCT(&buffer);
|
||||||
|
@ -81,4 +81,8 @@ in the environment. Returned value %d instead of ORTE_SUCCESS.
|
|||||||
[orterun:proc-aborted]
|
[orterun:proc-aborted]
|
||||||
%s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d.
|
%s noticed that job rank %lu with PID %lu on node "%s" exited on signal %d.
|
||||||
[orterun:abnormal-exit]
|
[orterun:abnormal-exit]
|
||||||
%s encountered an abnormal exit.
|
WARNING: %s encountered an abnormal exit.
|
||||||
|
|
||||||
|
This means that %s exited before it received notification that all
|
||||||
|
started processes had terminated. You should double check and ensure
|
||||||
|
that there are no runaway processes still executing.
|
||||||
|
@ -565,7 +565,12 @@ static void exit_callback(int fd, short event, void *arg)
|
|||||||
{
|
{
|
||||||
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
opal_show_help("help-orterun.txt", "orterun:abnormal-exit",
|
||||||
true, orterun_basename);
|
true, orterun_basename);
|
||||||
exit(1);
|
|
||||||
|
/* Trigger the normal exit conditions */
|
||||||
|
|
||||||
|
orterun_globals.exit = true;
|
||||||
|
orterun_globals.exit_status = 1;
|
||||||
|
opal_condition_signal(&orterun_globals.cond);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user