diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c index e2c0cd5b2c..01c9f59a99 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -36,6 +36,7 @@ #include "orte/mca/sensor/sensor.h" #include "orte/mca/routed/routed.h" #include "orte/mca/debugger/base/base.h" +#include "orte/mca/notifier/notifier.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" @@ -990,7 +991,8 @@ static void check_job_complete(orte_job_t *jdata) orte_std_cntr_t index; bool one_still_alive; orte_vpid_t non_zero=0, lowest=0; - + char *msg; + #if 0 /* Check if FileM is active. If so then keep processing. */ OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); @@ -1225,7 +1227,7 @@ static void check_job_complete(orte_job_t *jdata) * This can happen if a ctrl-c hits in the "wrong" place * while launching */ -CHECK_DAEMONS: + CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { if (0 == orte_routed.num_routes()) { /* orteds are done! */ @@ -1301,6 +1303,9 @@ CHECK_DAEMONS: * then go ahead and release it. We cannot release it if it * abnormally terminated as mpirun needs the info so it can * report appropriately to the user + * + * NOTE: do not release the primary job (j=1) so we + * can pretty-print completion message */ if (NULL != jdata && job->jobid == jdata->jobid && (jdata->state == ORTE_JOB_STATE_TERMINATED || @@ -1309,8 +1314,10 @@ CHECK_DAEMONS: * pointer array internal accounting * is maintained! 
*/ - opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ - OBJ_RELEASE(jdata); + if (1 < j) { + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + } continue; } /* if the job is flagged to not be monitored, skip it */ @@ -1356,6 +1363,25 @@ CHECK_DAEMONS: * wasn't already set by an error condition */ ORTE_UPDATE_EXIT_STATUS(0); + /* provide a notifier message if that framework is active - ignored otherwise */ + if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { + if (NULL == job->name) { + job->name = strdup(orte_process_info.nodename); + } + if (NULL == job->instance) { + asprintf(&job->instance, "%d", orte_process_info.pid); + } + if (0 == orte_exit_status) { + asprintf(&msg, "Job %s:%s complete", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); + } else { + asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); + } + free(msg); + /* this job object will be released during finalize */ + } + orte_jobs_complete(); /* if I am the only daemon alive, then I can exit now */ if (0 == orte_routed.num_routes()) { diff --git a/orte/mca/notifier/hnp/notifier_hnp_module.c b/orte/mca/notifier/hnp/notifier_hnp_module.c index 8692686e66..801318d53b 100644 --- a/orte/mca/notifier/hnp/notifier_hnp_module.c +++ b/orte/mca/notifier/hnp/notifier_hnp_module.c @@ -37,6 +37,7 @@ #include "opal/dss/dss.h" #include "opal/dss/dss_types.h" +#include "orte/util/show_help.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/notifier/base/base.h" @@ -263,13 +264,12 @@ static int init(void) and/or aggregation, each process maintains a separate SOS table and individually sends each entry in the table to the HNP.
*/ - /* + OBJ_CONSTRUCT(&orte_notifier_hnp_tables, opal_pointer_array_t); opal_pointer_array_init(&orte_notifier_hnp_tables, orte_process_info.num_procs, INT32_MAX, 8); OBJ_CONSTRUCT(&orte_notifier_hnp_tables_lock, opal_mutex_t); - */ #if OPAL_ENABLE_DEBUG /* If we're debugging, also add an exception handler -- just to @@ -305,8 +305,13 @@ static void mylog(orte_notifier_base_severity_t severity, int errcode, /* If there was a message, output it */ vasprintf(&output, msg, ap); - if (NULL != output && !ORTE_PROC_IS_HNP) { - send_command(severity, errcode, output); + if (NULL != output) { + if (ORTE_PROC_IS_HNP) { + /* output it locally */ + orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + } else { + send_command(severity, errcode, output); + } free(output); } } @@ -318,8 +323,13 @@ static void myhelplog(orte_notifier_base_severity_t severity, int errcode, output = opal_show_help_vstring(filename, topic, false, ap); - if (NULL != output && !ORTE_PROC_IS_HNP) { - send_command(severity, errcode, output); + if (NULL != output) { + if (ORTE_PROC_IS_HNP) { + /* output it locally */ + orte_show_help("opal_sos_reporter.txt", "notifier message", false, output); + } else { + send_command(severity, errcode, output); + } free(output); } } @@ -330,13 +340,23 @@ static void mypeerlog(orte_notifier_base_severity_t severity, int errcode, { char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap); - if (NULL != buf && !ORTE_PROC_IS_HNP) { - send_command(severity, errcode, buf); + if (NULL != buf) { + if (ORTE_PROC_IS_HNP) { + /* output it locally */ + orte_show_help("opal_sos_reporter.txt", "notifier message", false, buf); + } else { + send_command(severity, errcode, buf); + } free(buf); } } static void myeventlog(const char *msg) { - send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg); + if (ORTE_PROC_IS_HNP) { + /* output it locally */ + orte_show_help("opal_sos_reporter.txt", "notifier message", false, (char*)msg); + } else { + 
send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg); + } }