Expand our handling of non-zero exit status. If a process exits with a non-zero status, pass that information along to the user in case it is meaningful to them, even if the process also exited without calling MPI_Finalize. If the process calls MPI_Abort, the abort takes precedence over any exit-status reporting.
Provide a new MCA param (orte_abort_on_non_zero_status) that lets the user direct that the job be aborted as soon as any process exits with a non-zero status. No recovery is allowed in such cases, to avoid trying to restart a process that has already exited MPI. This commit was SVN r24614.
This commit is contained in:
parent
e4c36a3611
commit
3a28556472
@ -621,6 +621,20 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
if (orte_abort_non_zero_exit) {
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
* NULL, then we need to tell everyone else to die
|
||||
*/
|
||||
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||
hnp_abort(jdata->jobid, exit_code);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
@ -1201,6 +1215,19 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
if (orte_abort_non_zero_exit) {
|
||||
if (!jdata->abort) {
|
||||
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
jdata->aborted_proc = proc;
|
||||
/* retain the object so it doesn't get free'd */
|
||||
OBJ_RETAIN(proc);
|
||||
jdata->abort = true;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
||||
@ -1246,12 +1273,12 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
/* warn user */
|
||||
opal_output(orte_clean_output,
|
||||
"-------------------------------------------------------\n"
|
||||
"While %s job %s terminated normally, %s processes returned\n"
|
||||
"non-zero exit codes. Further examination may be required.\n"
|
||||
"While %s job %s terminated normally, %s %s. Further examination may be required.\n"
|
||||
"-------------------------------------------------------",
|
||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
|
||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
|
||||
ORTE_VPID_PRINT(non_zero));
|
||||
ORTE_VPID_PRINT(non_zero),
|
||||
(1 == non_zero) ? "process returned\na non-zero exit code." : "processes returned\nnon-zero exit codes.");
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
||||
|
@ -2515,9 +2515,9 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
int rc;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired on child %s",
|
||||
"%s odls:waitpid_fired on child %s with status %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
ORTE_NAME_PRINT(proc), WEXITSTATUS(status)));
|
||||
|
||||
/* since we are going to be working with the global list of
|
||||
* children, we need to protect that list from modification
|
||||
@ -2623,19 +2623,41 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
/* we required a finalizing sync and didn't get it, so this
|
||||
* is considered an abnormal termination and treated accordingly
|
||||
*/
|
||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but did not provide a required finalize sync - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
if (0 != child->exit_code) {
|
||||
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but with a non-zero exit status - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
} else {
|
||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but did not provide a required finalize sync - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
}
|
||||
|
||||
goto MOVEON;
|
||||
}
|
||||
/* if we did recv a finalize sync, then it terminated normally */
|
||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||
/* if we did recv a finalize sync, then declare it normally terminated
|
||||
* unless it returned with a non-zero status indicating the code
|
||||
* felt it was non-normal
|
||||
*/
|
||||
if (0 != child->exit_code) {
|
||||
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but with a non-zero exit status - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
} else {
|
||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||
}
|
||||
} else {
|
||||
/* has any child in this job already registered? */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
@ -2647,27 +2669,43 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
||||
/* someone has registered, and we didn't before
|
||||
* terminating - this is an abnormal termination
|
||||
*/
|
||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but did not provide a required init sync - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
if (0 != child->exit_code) {
|
||||
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but with a non-zero exit status - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
} else {
|
||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally "
|
||||
"but did not provide a required init sync - it "
|
||||
"will be treated as an abnormal termination",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
}
|
||||
|
||||
goto MOVEON;
|
||||
}
|
||||
}
|
||||
/* if no child has registered, then it is possible that
|
||||
* none of them will. This is considered acceptable
|
||||
* none of them will. This is considered acceptable. Still
|
||||
* flag it as abnormal if the exit code was non-zero
|
||||
*/
|
||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||
if (0 != child->exit_code) {
|
||||
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||
} else {
|
||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||
"%s odls:waitpid_fired child process %s terminated normally",
|
||||
"%s odls:waitpid_fired child process %s terminated %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(child->name)));
|
||||
ORTE_NAME_PRINT(child->name),
|
||||
(0 == child->exit_code) ? "normally" : "with non-zero status"));
|
||||
} else {
|
||||
/* the process was terminated with a signal! That's definitely
|
||||
* abnormal, so indicate that condition
|
||||
|
@ -66,7 +66,7 @@ typedef uint32_t orte_proc_state_t;
|
||||
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
||||
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
|
||||
|
||||
#define ORTE_PROC_STATE_TERM_NON_ZERO 0x00080000 /* process exited with a non-zero status, indicating abnormal */
|
||||
/*
|
||||
* Job state codes
|
||||
*/
|
||||
@ -99,6 +99,7 @@ typedef uint32_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
||||
#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */
|
||||
|
||||
/* the job never even attempted to launch due to an error earlier in the
|
||||
* launch procedure
|
||||
|
@ -181,6 +181,7 @@ orte_default_comm_fn_t orte_comm;
|
||||
/* exit status reporting */
|
||||
bool orte_report_child_jobs_separately;
|
||||
struct timeval orte_child_time_to_exit;
|
||||
bool orte_abort_non_zero_exit;
|
||||
|
||||
/* VM control */
|
||||
bool orte_vm_launch = false;
|
||||
|
@ -721,7 +721,7 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
|
||||
/* exit status reporting */
|
||||
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
|
||||
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
|
||||
|
||||
ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
|
||||
|
||||
/* VM control */
|
||||
ORTE_DECLSPEC extern bool orte_vm_launch;
|
||||
|
@ -479,6 +479,11 @@ int orte_register_params(void)
|
||||
orte_enable_recovery = true;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("orte", "abort_on_non_zero_status",
|
||||
"Abort the job if any process returns a non-zero exit status - no restart in such cases",
|
||||
false, false, (int)false, &value);
|
||||
orte_abort_non_zero_exit = OPAL_INT_TO_BOOL(value);
|
||||
|
||||
mca_base_param_reg_int_name("orte", "report_child_jobs_separately",
|
||||
"Return the exit status of the primary job only",
|
||||
false, false,
|
||||
|
@ -404,6 +404,10 @@ static void dump_aborted_procs(void)
|
||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||
} else if (orte_abort_non_zero_exit &&
|
||||
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
|
||||
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
|
||||
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
@ -592,4 +592,11 @@ process:
|
||||
|
||||
Process name: %s
|
||||
Node: %s
|
||||
#
|
||||
[orterun:non-zero-exit]
|
||||
%s detected that one or more processes exited with non-zero status, thus causing
|
||||
the job to be terminated. The first process to do so was:
|
||||
|
||||
Process name: %s
|
||||
Exit code: %d
|
||||
#
|
||||
|
@ -220,6 +220,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
||||
return "HEARTBEAT FAILED";
|
||||
case ORTE_JOB_STATE_PROCS_MIGRATING:
|
||||
return "PROCS MIGRATING";
|
||||
case ORTE_JOB_STATE_NON_ZERO_TERM:
|
||||
return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
@ -266,6 +268,8 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
||||
return "MIGRATING";
|
||||
case ORTE_PROC_STATE_CANNOT_RESTART:
|
||||
return "CANNOT BE RESTARTED";
|
||||
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||
return "EXITED WITH NON-ZERO STATUS";
|
||||
default:
|
||||
return "UNKNOWN STATE!";
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user