Since there is no defined behavior for the case where all application procs exit normally, but some or all have non-zero return codes, just output a warning telling the user how many procs meet that criterion. Let the return code of mpirun in that scenario reflect any errors in OMPI/ORTE itself.
This is clearly a temporary solution until a defined behavior can be established. This commit was SVN r23075.
Этот коммит содержится в:
родитель
29f02d88c6
Коммит
c93af95351
@ -130,7 +130,7 @@ static int update_state(orte_jobid_t job,
|
|||||||
orte_proc_state_to_str(state), exit_code));
|
orte_proc_state_to_str(state), exit_code));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if orterun is trying to shutdown, just let it
|
* if orte is trying to shutdown, just let it
|
||||||
*/
|
*/
|
||||||
if (orte_errmgr_base.shutting_down) {
|
if (orte_errmgr_base.shutting_down) {
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -580,7 +580,7 @@ static void check_job_complete(orte_job_t *jdata)
|
|||||||
orte_job_map_t *map;
|
orte_job_map_t *map;
|
||||||
orte_std_cntr_t index;
|
orte_std_cntr_t index;
|
||||||
bool one_still_alive;
|
bool one_still_alive;
|
||||||
orte_exit_code_t first_non_zero=0;
|
orte_vpid_t non_zero=0;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
/* Check if FileM is active. If so then keep processing. */
|
/* Check if FileM is active. If so then keep processing. */
|
||||||
@ -595,8 +595,8 @@ static void check_job_complete(orte_job_t *jdata)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (0 == first_non_zero && 0 != proc->exit_code) {
|
if (0 != proc->exit_code) {
|
||||||
first_non_zero = proc->exit_code;
|
non_zero++;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -732,8 +732,15 @@ static void check_job_complete(orte_job_t *jdata)
|
|||||||
/* turn off any sensor monitors on this job */
|
/* turn off any sensor monitors on this job */
|
||||||
orte_sensor.stop(jdata->jobid);
|
orte_sensor.stop(jdata->jobid);
|
||||||
#endif
|
#endif
|
||||||
/* update our exit code */
|
if (0 < non_zero) {
|
||||||
ORTE_UPDATE_EXIT_STATUS(first_non_zero);
|
/* warn user */
|
||||||
|
opal_output(orte_clean_output,
|
||||||
|
"-----------------------------------------------------\n\n"
|
||||||
|
"While job %s terminated normally, %s processes returned\n"
|
||||||
|
"non-zero exit codes. Further examination may be required.\n\n"
|
||||||
|
"-----------------------------------------------------",
|
||||||
|
ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(non_zero));
|
||||||
|
}
|
||||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||||
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user