1
1

Since there is no defined behavior for the case where all application procs exit normally but some or all of them return non-zero exit codes, just output a warning telling the user how many procs meet that criterion. Let the return code of mpirun in that scenario reflect any errors in OMPI/ORTE itself.

This is clearly a temporary solution until a defined behavior can be established.

This commit was SVN r23075.
Этот коммит содержится в:
Ralph Castain 2010-04-30 19:01:10 +00:00
родитель 29f02d88c6
Коммит c93af95351

Просмотреть файл

@ -130,7 +130,7 @@ static int update_state(orte_jobid_t job,
orte_proc_state_to_str(state), exit_code));
/*
* if orterun is trying to shutdown, just let it
* if orte is trying to shutdown, just let it
*/
if (orte_errmgr_base.shutting_down) {
return ORTE_SUCCESS;
@ -580,7 +580,7 @@ static void check_job_complete(orte_job_t *jdata)
orte_job_map_t *map;
orte_std_cntr_t index;
bool one_still_alive;
orte_exit_code_t first_non_zero=0;
orte_vpid_t non_zero=0;
#if 0
/* Check if FileM is active. If so then keep processing. */
@ -595,8 +595,8 @@ static void check_job_complete(orte_job_t *jdata)
continue;
}
if (0 == first_non_zero && 0 != proc->exit_code) {
first_non_zero = proc->exit_code;
if (0 != proc->exit_code) {
non_zero++;
}
/*
@ -732,8 +732,15 @@ static void check_job_complete(orte_job_t *jdata)
/* turn off any sensor monitors on this job */
orte_sensor.stop(jdata->jobid);
#endif
/* update our exit code */
ORTE_UPDATE_EXIT_STATUS(first_non_zero);
if (0 < non_zero) {
/* warn user */
opal_output(orte_clean_output,
"-----------------------------------------------------\n\n"
"While job %s terminated normally, %s processes returned\n"
"non-zero exit codes. Further examination may be required.\n\n"
"-----------------------------------------------------",
ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(non_zero));
}
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),