1
1

Add the ability to receive notifier output when a job completes. Set the notification level to INFO for normal job completion, and to ALERT for abnormal termination.

This commit was SVN r23710.
Этот коммит содержится в:
Ralph Castain 2010-09-02 14:42:41 +00:00
родитель 14e7bcc383
Коммит f75437f5a3
2 изменённых файлов: 59 добавлений и 13 удалений

Просмотреть файл

@ -36,6 +36,7 @@
#include "orte/mca/sensor/sensor.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/notifier/notifier.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
@ -990,7 +991,8 @@ static void check_job_complete(orte_job_t *jdata)
orte_std_cntr_t index;
bool one_still_alive;
orte_vpid_t non_zero=0, lowest=0;
char *msg;
#if 0
/* Check if FileM is active. If so then keep processing. */
OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active);
@ -1225,7 +1227,7 @@ static void check_job_complete(orte_job_t *jdata)
* This can happen if a ctrl-c hits in the "wrong" place
* while launching
*/
CHECK_DAEMONS:
CHECK_DAEMONS:
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
if (0 == orte_routed.num_routes()) {
/* orteds are done! */
@ -1301,6 +1303,9 @@ CHECK_DAEMONS:
* then go ahead and release it. We cannot release it if it
* abnormally terminated as mpirun needs the info so it can
* report appropriately to the user
*
* NOTE: do not release the primary job (j=1) so we
* can pretty-print completion message
*/
if (NULL != jdata && job->jobid == jdata->jobid &&
(jdata->state == ORTE_JOB_STATE_TERMINATED ||
@ -1309,8 +1314,10 @@ CHECK_DAEMONS:
* pointer array internal accounting
* is maintained!
*/
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
OBJ_RELEASE(jdata);
if (1 < j) {
opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */
OBJ_RELEASE(jdata);
}
continue;
}
/* if the job is flagged to not be monitored, skip it */
@ -1356,6 +1363,25 @@ CHECK_DAEMONS:
* wasn't already set by an error condition
*/
ORTE_UPDATE_EXIT_STATUS(0);
/* provide a notifier message if that framework is active - ignored otherwise */
if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) {
if (NULL == job->name) {
job->name = strdup(orte_process_info.nodename);
}
if (NULL == job->instance) {
asprintf(&job->instance, "%d", orte_process_info.pid);
}
if (0 == orte_exit_status) {
asprintf(&msg, "Job %s:%s complete", job->name, job->instance);
orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg);
} else {
asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance);
orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg);
}
free(msg);
/* this job object will be released during finalize */
}
orte_jobs_complete();
/* if I am the only daemon alive, then I can exit now */
if (0 == orte_routed.num_routes()) {

Просмотреть файл

@ -37,6 +37,7 @@
#include "opal/dss/dss.h"
#include "opal/dss/dss_types.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/notifier/base/base.h"
@ -263,13 +264,12 @@ static int init(void)
and/or aggregation, each process maintains a separate SOS
table and individually sends each entry in the table to
the HNP. */
/*
OBJ_CONSTRUCT(&orte_notifier_hnp_tables, opal_pointer_array_t);
opal_pointer_array_init(&orte_notifier_hnp_tables,
orte_process_info.num_procs,
INT32_MAX, 8);
OBJ_CONSTRUCT(&orte_notifier_hnp_tables_lock, opal_mutex_t);
*/
#if OPAL_ENABLE_DEBUG
/* If we're debugging, also add an exception handler -- just to
@ -305,8 +305,13 @@ static void mylog(orte_notifier_base_severity_t severity, int errcode,
/* If there was a message, output it */
vasprintf(&output, msg, ap);
if (NULL != output && !ORTE_PROC_IS_HNP) {
send_command(severity, errcode, output);
if (NULL != output) {
if (ORTE_PROC_IS_HNP) {
/* output it locally */
orte_show_help("opal_sos_reporter.txt", "notifier message", false, output);
} else {
send_command(severity, errcode, output);
}
free(output);
}
}
@ -318,8 +323,13 @@ static void myhelplog(orte_notifier_base_severity_t severity, int errcode,
output = opal_show_help_vstring(filename, topic, false, ap);
if (NULL != output && !ORTE_PROC_IS_HNP) {
send_command(severity, errcode, output);
if (NULL != output) {
if (ORTE_PROC_IS_HNP) {
/* output it locally */
orte_show_help("opal_sos_reporter.txt", "notifier message", false, output);
} else {
send_command(severity, errcode, output);
}
free(output);
}
}
@ -330,13 +340,23 @@ static void mypeerlog(orte_notifier_base_severity_t severity, int errcode,
{
char *buf = orte_notifier_base_peer_log(errcode, peer_proc, msg, ap);
if (NULL != buf && !ORTE_PROC_IS_HNP) {
send_command(severity, errcode, buf);
if (NULL != buf) {
if (ORTE_PROC_IS_HNP) {
/* output it locally */
orte_show_help("opal_sos_reporter.txt", "notifier message", false, buf);
} else {
send_command(severity, errcode, buf);
}
free(buf);
}
}
/*
 * Log a plain event message.
 *
 * msg: the message text; const is cast away before it is passed on.
 *
 * When ORTE_PROC_IS_HNP is true the text is shown locally through
 * orte_show_help() using the "notifier message" topic of
 * opal_sos_reporter.txt; otherwise it is handed to send_command() with
 * ORTE_NOTIFIER_NOTICE severity and ORTE_SUCCESS as the error code.
 */
static void myeventlog(const char *msg)
{
/* NOTE(review): this unconditional send_command() is followed by a branch
 * that calls send_command() again for non-HNP processes. It looks like the
 * pre-change line of a diff whose +/- markers were lost -- confirm against
 * the actual file before relying on it. */
send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg);
if (ORTE_PROC_IS_HNP) {
/* output it locally */
orte_show_help("opal_sos_reporter.txt", "notifier message", false, (char*)msg);
} else {
/* not the HNP: pass the message to send_command() */
send_command(ORTE_NOTIFIER_NOTICE, ORTE_SUCCESS, (char *)msg);
}
}