Expand our handling of non-zero exit status. If a process exits with a non-zero status, pass that information along to the user in case it is meaningful to them, even if the process also exited without calling MPI_Finalize. If the process calls MPI_Abort, the abort takes precedence over any exit-status reporting.
Provide a new MCA parameter (registered under "orte" as "abort_on_non_zero_status") that lets the user direct us to abort the job as soon as a process exits with a non-zero status. No recovery is attempted in such cases, to avoid trying to restart a process that has already exited MPI. This commit was SVN r24614.
Этот коммит содержится в:
родитель
e4c36a3611
Коммит
3a28556472
@ -621,6 +621,20 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||||
|
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||||
|
check_job_complete(jdata); /* need to set the job state */
|
||||||
|
if (orte_abort_non_zero_exit) {
|
||||||
|
/* the job object for this job will have been NULL'd
|
||||||
|
* in the array if the job was solely local. If it isn't
|
||||||
|
* NULL, then we need to tell everyone else to die
|
||||||
|
*/
|
||||||
|
if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) {
|
||||||
|
hnp_abort(jdata->jobid, exit_code);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case ORTE_PROC_STATE_FAILED_TO_START:
|
case ORTE_PROC_STATE_FAILED_TO_START:
|
||||||
case ORTE_PROC_STATE_CALLED_ABORT:
|
case ORTE_PROC_STATE_CALLED_ABORT:
|
||||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||||
@ -1201,6 +1215,19 @@ static void check_job_complete(orte_job_t *jdata)
|
|||||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||||
|
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||||
|
if (orte_abort_non_zero_exit) {
|
||||||
|
if (!jdata->abort) {
|
||||||
|
jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
|
||||||
|
/* point to the lowest rank to cause the problem */
|
||||||
|
jdata->aborted_proc = proc;
|
||||||
|
/* retain the object so it doesn't get free'd */
|
||||||
|
OBJ_RETAIN(proc);
|
||||||
|
jdata->abort = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
if (ORTE_PROC_STATE_UNTERMINATED < proc->state &&
|
||||||
@ -1246,12 +1273,12 @@ static void check_job_complete(orte_job_t *jdata)
|
|||||||
/* warn user */
|
/* warn user */
|
||||||
opal_output(orte_clean_output,
|
opal_output(orte_clean_output,
|
||||||
"-------------------------------------------------------\n"
|
"-------------------------------------------------------\n"
|
||||||
"While %s job %s terminated normally, %s processes returned\n"
|
"While %s job %s terminated normally, %s %s. Further examination may be required.\n"
|
||||||
"non-zero exit codes. Further examination may be required.\n"
|
|
||||||
"-------------------------------------------------------",
|
"-------------------------------------------------------",
|
||||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
|
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
|
||||||
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
|
(1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
|
||||||
ORTE_VPID_PRINT(non_zero));
|
ORTE_VPID_PRINT(non_zero),
|
||||||
|
(1 == non_zero) ? "process returned\na non-zero exit code." : "processes returned\nnon-zero exit codes.");
|
||||||
}
|
}
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||||
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
||||||
|
@ -2515,9 +2515,9 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
|||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired on child %s",
|
"%s odls:waitpid_fired on child %s with status %d",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(proc)));
|
ORTE_NAME_PRINT(proc), WEXITSTATUS(status)));
|
||||||
|
|
||||||
/* since we are going to be working with the global list of
|
/* since we are going to be working with the global list of
|
||||||
* children, we need to protect that list from modification
|
* children, we need to protect that list from modification
|
||||||
@ -2623,19 +2623,41 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
|||||||
/* we required a finalizing sync and didn't get it, so this
|
/* we required a finalizing sync and didn't get it, so this
|
||||||
* is considered an abnormal termination and treated accordingly
|
* is considered an abnormal termination and treated accordingly
|
||||||
*/
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
if (0 != child->exit_code) {
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired child process %s terminated normally "
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
"but did not provide a required finalize sync - it "
|
"but with a non-zero exit status - it "
|
||||||
"will be treated as an abnormal termination",
|
"will be treated as an abnormal termination",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
} else {
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
|
"but did not provide a required finalize sync - it "
|
||||||
|
"will be treated as an abnormal termination",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
}
|
||||||
|
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
/* if we did recv a finalize sync, then it terminated normally */
|
/* if we did recv a finalize sync, then declare it normally terminated
|
||||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
* unless it returned with a non-zero status indicating the code
|
||||||
|
* felt it was non-normal
|
||||||
|
*/
|
||||||
|
if (0 != child->exit_code) {
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
|
"but with a non-zero exit status - it "
|
||||||
|
"will be treated as an abnormal termination",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
} else {
|
||||||
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
/* has any child in this job already registered? */
|
/* has any child in this job already registered? */
|
||||||
for (item = opal_list_get_first(&orte_local_children);
|
for (item = opal_list_get_first(&orte_local_children);
|
||||||
@ -2647,27 +2669,43 @@ void orte_base_default_waitpid_fired(orte_process_name_t *proc, int32_t status)
|
|||||||
/* someone has registered, and we didn't before
|
/* someone has registered, and we didn't before
|
||||||
* terminating - this is an abnormal termination
|
* terminating - this is an abnormal termination
|
||||||
*/
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
if (0 != child->exit_code) {
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||||
"%s odls:waitpid_fired child process %s terminated normally "
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"but did not provide a required init sync - it "
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
"will be treated as an abnormal termination",
|
"but with a non-zero exit status - it "
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
"will be treated as an abnormal termination",
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
} else {
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_WO_SYNC;
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
|
"%s odls:waitpid_fired child process %s terminated normally "
|
||||||
|
"but did not provide a required init sync - it "
|
||||||
|
"will be treated as an abnormal termination",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
ORTE_NAME_PRINT(child->name)));
|
||||||
|
}
|
||||||
|
|
||||||
goto MOVEON;
|
goto MOVEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* if no child has registered, then it is possible that
|
/* if no child has registered, then it is possible that
|
||||||
* none of them will. This is considered acceptable
|
* none of them will. This is considered acceptable. Still
|
||||||
|
* flag it as abnormal if the exit code was non-zero
|
||||||
*/
|
*/
|
||||||
child->state = ORTE_PROC_STATE_TERMINATED;
|
if (0 != child->exit_code) {
|
||||||
|
child->state = ORTE_PROC_STATE_TERM_NON_ZERO;
|
||||||
|
} else {
|
||||||
|
child->state = ORTE_PROC_STATE_TERMINATED;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output,
|
||||||
"%s odls:waitpid_fired child process %s terminated normally",
|
"%s odls:waitpid_fired child process %s terminated %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(child->name)));
|
ORTE_NAME_PRINT(child->name),
|
||||||
|
(0 == child->exit_code) ? "normally" : "with non-zero status"));
|
||||||
} else {
|
} else {
|
||||||
/* the process was terminated with a signal! That's definitely
|
/* the process was terminated with a signal! That's definitely
|
||||||
* abnormal, so indicate that condition
|
* abnormal, so indicate that condition
|
||||||
|
@ -66,7 +66,7 @@ typedef uint32_t orte_proc_state_t;
|
|||||||
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
#define ORTE_PROC_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||||
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
#define ORTE_PROC_STATE_MIGRATING 0x00020000 /* process is migrating */
|
||||||
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
|
#define ORTE_PROC_STATE_CANNOT_RESTART 0x00040000 /* process failed and cannot be restarted */
|
||||||
|
#define ORTE_PROC_STATE_TERM_NON_ZERO 0x00080000 /* process exited with a non-zero status, indicating abnormal */
|
||||||
/*
|
/*
|
||||||
* Job state codes
|
* Job state codes
|
||||||
*/
|
*/
|
||||||
@ -99,6 +99,7 @@ typedef uint32_t orte_job_state_t;
|
|||||||
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
#define ORTE_JOB_STATE_CALLED_ABORT 0x00008000 /* at least one process called "errmgr.abort" */
|
||||||
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
#define ORTE_JOB_STATE_HEARTBEAT_FAILED 0x00010000 /* heartbeat failed to arrive */
|
||||||
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
#define ORTE_JOB_STATE_PROCS_MIGRATING 0x00020000 /* procs waiting to migrate */
|
||||||
|
#define ORTE_JOB_STATE_NON_ZERO_TERM 0x00040000 /* at least one process exited with non-zero status */
|
||||||
|
|
||||||
/* the job never even attempted to launch due to an error earlier in the
|
/* the job never even attempted to launch due to an error earlier in the
|
||||||
* launch procedure
|
* launch procedure
|
||||||
|
@ -181,6 +181,7 @@ orte_default_comm_fn_t orte_comm;
|
|||||||
/* exit status reporting */
|
/* exit status reporting */
|
||||||
bool orte_report_child_jobs_separately;
|
bool orte_report_child_jobs_separately;
|
||||||
struct timeval orte_child_time_to_exit;
|
struct timeval orte_child_time_to_exit;
|
||||||
|
bool orte_abort_non_zero_exit;
|
||||||
|
|
||||||
/* VM control */
|
/* VM control */
|
||||||
bool orte_vm_launch = false;
|
bool orte_vm_launch = false;
|
||||||
|
@ -721,7 +721,7 @@ ORTE_DECLSPEC int orte_global_comm(orte_process_name_t *recipient,
|
|||||||
/* exit status reporting */
|
/* exit status reporting */
|
||||||
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
|
ORTE_DECLSPEC extern bool orte_report_child_jobs_separately;
|
||||||
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
|
ORTE_DECLSPEC extern struct timeval orte_child_time_to_exit;
|
||||||
|
ORTE_DECLSPEC extern bool orte_abort_non_zero_exit;
|
||||||
|
|
||||||
/* VM control */
|
/* VM control */
|
||||||
ORTE_DECLSPEC extern bool orte_vm_launch;
|
ORTE_DECLSPEC extern bool orte_vm_launch;
|
||||||
|
@ -479,6 +479,11 @@ int orte_register_params(void)
|
|||||||
orte_enable_recovery = true;
|
orte_enable_recovery = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mca_base_param_reg_int_name("orte", "abort_on_non_zero_status",
|
||||||
|
"Abort the job if any process returns a non-zero exit status - no restart in such cases",
|
||||||
|
false, false, (int)false, &value);
|
||||||
|
orte_abort_non_zero_exit = OPAL_INT_TO_BOOL(value);
|
||||||
|
|
||||||
mca_base_param_reg_int_name("orte", "report_child_jobs_separately",
|
mca_base_param_reg_int_name("orte", "report_child_jobs_separately",
|
||||||
"Return the exit status of the primary job only",
|
"Return the exit status of the primary job only",
|
||||||
false, false,
|
false, false,
|
||||||
|
@ -404,6 +404,10 @@ static void dump_aborted_procs(void)
|
|||||||
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
} else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
|
||||||
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
|
||||||
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
orte_basename, ORTE_NAME_PRINT(&proc->name), node->name);
|
||||||
|
} else if (orte_abort_non_zero_exit &&
|
||||||
|
ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
|
||||||
|
orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
|
||||||
|
orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -592,4 +592,11 @@ process:
|
|||||||
|
|
||||||
Process name: %s
|
Process name: %s
|
||||||
Node: %s
|
Node: %s
|
||||||
|
#
|
||||||
|
[orterun:non-zero-exit]
|
||||||
|
%s detected that one or more processes exited with non-zero status, thus causing
|
||||||
|
the job to be terminated. The first process to do so was:
|
||||||
|
|
||||||
|
Process name: %s
|
||||||
|
Exit code: %d
|
||||||
|
#
|
||||||
|
@ -220,6 +220,8 @@ const char *orte_job_state_to_str(orte_job_state_t state)
|
|||||||
return "HEARTBEAT FAILED";
|
return "HEARTBEAT FAILED";
|
||||||
case ORTE_JOB_STATE_PROCS_MIGRATING:
|
case ORTE_JOB_STATE_PROCS_MIGRATING:
|
||||||
return "PROCS MIGRATING";
|
return "PROCS MIGRATING";
|
||||||
|
case ORTE_JOB_STATE_NON_ZERO_TERM:
|
||||||
|
return "AT LEAST ONE PROCESS EXITED WITH NON-ZERO STATUS";
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN STATE!";
|
return "UNKNOWN STATE!";
|
||||||
}
|
}
|
||||||
@ -266,6 +268,8 @@ const char *orte_proc_state_to_str(orte_proc_state_t state)
|
|||||||
return "MIGRATING";
|
return "MIGRATING";
|
||||||
case ORTE_PROC_STATE_CANNOT_RESTART:
|
case ORTE_PROC_STATE_CANNOT_RESTART:
|
||||||
return "CANNOT BE RESTARTED";
|
return "CANNOT BE RESTARTED";
|
||||||
|
case ORTE_PROC_STATE_TERM_NON_ZERO:
|
||||||
|
return "EXITED WITH NON-ZERO STATUS";
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN STATE!";
|
return "UNKNOWN STATE!";
|
||||||
}
|
}
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user