diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index bfefdfd730..dab9857a06 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -130,7 +130,7 @@ static int update_state(orte_jobid_t job, orte_proc_state_to_str(state), exit_code)); /* - * if orterun is trying to shutdown, just let it + * if orte is trying to shutdown, just let it */ if (orte_errmgr_base.shutting_down) { return ORTE_SUCCESS; @@ -580,7 +580,7 @@ static void check_job_complete(orte_job_t *jdata) orte_job_map_t *map; orte_std_cntr_t index; bool one_still_alive; - orte_exit_code_t first_non_zero=0; + orte_vpid_t non_zero=0; #if 0 /* Check if FileM is active. If so then keep processing. */ @@ -595,8 +595,8 @@ static void check_job_complete(orte_job_t *jdata) continue; } - if (0 == first_non_zero && 0 != proc->exit_code) { - first_non_zero = proc->exit_code; + if (0 != proc->exit_code) { + non_zero++; } /* @@ -732,8 +732,15 @@ static void check_job_complete(orte_job_t *jdata) /* turn off any sensor monitors on this job */ orte_sensor.stop(jdata->jobid); #endif - /* update our exit code */ - ORTE_UPDATE_EXIT_STATUS(first_non_zero); + if (0 < non_zero) { + /* warn user */ + opal_output(orte_clean_output, + "-----------------------------------------------------\n\n" + "While job %s terminated normally, %s processes returned\n" + "non-zero exit codes. Further examination may be required.\n\n" + "-----------------------------------------------------", + ORTE_JOBID_PRINT(jdata->jobid), ORTE_VPID_PRINT(non_zero)); + } OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),