1
1

Expand our handling of non-zero exit status. If a process exits with a non-zero status, pass that information along to the user in case it is meaningful to them, even if the process also exited without calling MPI_Finalize. If the process calls MPI_Abort, the abort takes precedence over any exit-status handling.

Provide a new MCA param (orte_abort_on_non_zero_status) that allows the user to direct that the job be aborted as soon as any process exits with a non-zero status. No recovery is allowed in such cases, to avoid trying to restart a process that has already exited MPI.

This commit was SVN r24614.
This commit is contained in:
Ralph Castain 2011-04-14 15:04:21 +00:00
parent e4c36a3611
commit 3a28556472
9 changed files with 115 additions and 28 deletions

View File

@ -621,6 +621,20 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
}
break;
case ORTE_PROC_STATE_TERM_NON_ZERO:
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); /* need to set the job state */
if (orte_abort_non_zero_exit) {
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
* NULL, then we need to tell everyone else to die
*/
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
hnp_abort(jdata->jobid, exit_code);
}
}
break;
case ORTE_PROC_STATE_FAILED_TO_START:
case ORTE_PROC_STATE_CALLED_ABORT:
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
@ -1201,6 +1215,19 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
case ORTE_PROC_STATE_TERM_NON_ZERO:
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
if (orte_abort_non_zero_exit) {
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
/* point to the lowest rank to cause the problem */
jdata->aborted_proc = proc;
/* retain the object so it doesn't get free'd */
OBJ_RETAIN(proc);
jdata->abort = true;
}
}
break;
default:
if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
@ -1246,12 +1273,12 @@ static void check_job_complete(orte_job_t *jdata)
/* warn user */
opal_output(orte_clean_output,
"-------------------------------------------------------\n"
"While %s job %s terminated normally, %s processes returned\n"
"non-zero exit codes. Further examination may be required.\n"
"While %s job %s terminated normally, %s %s. Further examination may be required.\n"
"-------------------------------------------------------",
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
ORTE_VPID_PRINT(non_zero));
ORTE_VPID_PRINT(non_zero),
(1 == non_zero) ? "process returned\na non-zero exit code." : "processes returned\nnon-zero exit codes.");
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",

View File

@ -2515,9 +2515,9 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
int rc;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired on child %s",
"%s odls:waitpid_fired on child %s with status %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(proc)));
ORTE_NAME_PRINT(proc), WEXITSTATUS(status)));
/* since we are going to be working with the global list of
* children, we need to protect that list from modification
@ -2623,19 +2623,41 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
/* we required a finalizing sync and didn't get it, so this
* is considered an abnormal termination and treated accordingly
*/
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but did not provide a required finalize sync - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
if (0 != child->exit_code) {
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but with a non-zero exit status - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
} else {
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but did not provide a required finalize sync - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
}
goto MOVEON;
}
/* if we did recv a finalize sync, then it terminated normally */
child->state = ORTE_PROC_STATE_TERMINATED;
/* if we did recv a finalize sync, then declare it normally terminated
* unless it returned with a non-zero status indicating the code
* felt it was non-normal
*/
if (0 != child->exit_code) {
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but with a non-zero exit status - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
} else {
child->state = ORTE_PROC_STATE_TERMINATED;
}
} else {
/* has any child in this job already registered? */
for (item = opal_list_get_first(&orte_local_children);
@ -2647,27 +2669,43 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
/* someone has registered, and we didn't before
* terminating - this is an abnormal termination
*/
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but did not provide a required init sync - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
if (0 != child->exit_code) {
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but with a non-zero exit status - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
} else {
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally "
"but did not provide a required init sync - it "
"will be treated as an abnormal termination",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
}
goto MOVEON;
}
}
/* if no child has registered, then it is possible that
* none of them will. This is considered acceptable
* none of them will. This is considered acceptable. Still
* flag it as abnormal if the exit code was non-zero
*/
child->state = ORTE_PROC_STATE_TERMINATED;
if (0 != child->exit_code) {
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
} else {
child->state = ORTE_PROC_STATE_TERMINATED;
}
}
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
"%s odls:waitpid_fired child process %s terminated normally",
"%s odls:waitpid_fired child process %s terminated %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(child->name)));
ORTE_NAME_PRINT(child->name),
(0 == child->exit_code) ? "normally" : "with non-zero status"));
} else {
/* the process was terminated with a signal! That's definitely
* abnormal, so indicate that condition

View File

@ -66,7 +66,7 @@ typedef uint32_t orte_proc_state_t;
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
#define ORTE_PROC_STATE_TERM_NON_ZERO 0x00080000 /* process exited with a non-zero status, indicating abnormal */
/*
* Job state codes
*/
@ -99,6 +99,7 @@ typedef uint32_t orte_job_state_t;
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */
/* the job never even attempted to launch due to an error earlier in the
* launch procedure

View File

@ -181,6 +181,7 @@ orte_default_comm_fn_t orte_comm;
/* exit status reporting */
bool orte_report_child_jobs_separately;
struct timeval orte_child_time_to_exit;
bool orte_abort_non_zero_exit;
/* VM control */
bool orte_vm_launch = false;

View File

@ -721,7 +721,7 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
/* exit status reporting */
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
/* VM control */
ORTE_DECLSPEC extern bool orte_vm_launch;

View File

@ -479,6 +479,11 @@ int orte_register_params(void)
orte_enable_recovery = true;
}
mca_base_param_reg_int_name("orte", "abort_on_non_zero_status",
"Abort the job if any process returns a non-zero exit status - no restart in such cases",
false, false, (int)false, &value);
orte_abort_non_zero_exit = OPAL_INT_TO_BOOL(value);
mca_base_param_reg_int_name("orte", "report_child_jobs_separately",
"Return the exit status of the primary job only",
false, false,

View File

@ -404,6 +404,10 @@ static void dump_aborted_procs(void)
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
} else if (orte_abort_non_zero_exit &&
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
}
return;
}

View File

@ -592,4 +592,11 @@ process:
Process name: %s
Node: %s
#
[orterun:non-zero-exit]
%s detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: %s
Exit code: %d
#

View File

@ -220,6 +220,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
return "HEARTBEAT FAILED";
case ORTE_JOB_STATE_PROCS_MIGRATING:
return "PROCS MIGRATING";
case ORTE_JOB_STATE_NON_ZERO_TERM:
return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS";
default:
return "UNKNOWN STATE!";
}
@ -266,6 +268,8 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
return "MIGRATING";
case ORTE_PROC_STATE_CANNOT_RESTART:
return "CANNOT BE RESTARTED";
case ORTE_PROC_STATE_TERM_NON_ZERO:
return "EXITED WITH NON-ZERO STATUS";
default:
return "UNKNOWN STATE!";
}