diff --git a/orte/mca/notifier/base/base.h b/orte/mca/notifier/base/base.h
index a0168da7ca..2f944dfd9d 100644
--- a/orte/mca/notifier/base/base.h
+++ b/orte/mca/notifier/base/base.h
@@ -78,6 +78,7 @@ ORTE_DECLSPEC int orte_notifier_base_select(void);
 
 /* base functions */
 ORTE_DECLSPEC void orte_notifier_base_log(int sd, short args, void *cbdata);
+ORTE_DECLSPEC void orte_notifier_base_event(int sd, short args, void *cbdata);
 ORTE_DECLSPEC void orte_notifier_base_report(int sd, short args, void *cbdata);
 
 /* severity to string */
diff --git a/orte/mca/notifier/base/notifier_base_fns.c b/orte/mca/notifier/base/notifier_base_fns.c
index 0fe93106bf..fe7e74b183 100644
--- a/orte/mca/notifier/base/notifier_base_fns.c
+++ b/orte/mca/notifier/base/notifier_base_fns.c
@@ -27,6 +27,9 @@
 
 #include "orte/mca/notifier/base/base.h"
 
+static void orte_notifier_base_identify_modules(char ***modules,
+                                                orte_notifier_request_t *req);
+
 void orte_notifier_base_log(int sd, short args, void *cbdata)
 {
     orte_notifier_request_t *req = (orte_notifier_request_t*)cbdata;
@@ -46,34 +49,10 @@ void orte_notifier_base_log(int sd, short args, void *cbdata)
         return;
     }
 
-    if (ORTE_NOTIFIER_EMERG == req->severity &&
-        (NULL != orte_notifier_base.emerg_actions)) {
-        modules = opal_argv_split(orte_notifier_base.emerg_actions, ',');
-    } else if (ORTE_NOTIFIER_ALERT == req->severity &&
-               (NULL != orte_notifier_base.alert_actions)) {
-        modules = opal_argv_split(orte_notifier_base.alert_actions, ',');
-    } else if (ORTE_NOTIFIER_CRIT == req->severity &&
-               (NULL != orte_notifier_base.crit_actions)) {
-        modules = opal_argv_split(orte_notifier_base.crit_actions, ',');
-    } else if (ORTE_NOTIFIER_WARN == req->severity &&
-               (NULL != orte_notifier_base.warn_actions)) {
-        modules = opal_argv_split(orte_notifier_base.warn_actions, ',');
-    } else if (ORTE_NOTIFIER_NOTICE == req->severity &&
-               (NULL != orte_notifier_base.notice_actions)) {
-        modules = opal_argv_split(orte_notifier_base.notice_actions, ',');
-    } else if (ORTE_NOTIFIER_INFO == req->severity &&
-               (NULL != orte_notifier_base.info_actions)) {
-        modules = opal_argv_split(orte_notifier_base.info_actions, ',');
-    } else if (ORTE_NOTIFIER_DEBUG == req->severity &&
-               (NULL != orte_notifier_base.debug_actions)) {
-        modules = opal_argv_split(orte_notifier_base.debug_actions, ',');
-    } else if (ORTE_NOTIFIER_ERROR == req->severity &&
-               (NULL != orte_notifier_base.error_actions)) {
-        modules = opal_argv_split(orte_notifier_base.error_actions, ',');
-    } else if (NULL != orte_notifier_base.default_actions) {
-        modules = opal_argv_split(orte_notifier_base.default_actions, ',');
-    } else {
-        /* no modules selected */
+    orte_notifier_base_identify_modules(&modules, req);
+
+    /* no modules selected then nothing to do */
+    if (NULL == modules) {
         return;
     }
 
@@ -87,10 +66,65 @@ void orte_notifier_base_log(int sd, short args, void *cbdata)
     opal_argv_free(modules);
 }
 
+/*
+ * Event callback for system events (see ORTE_NOTIFIER_SYSTEM_EVENT).
+ * cbdata is the orte_notifier_request_t describing the event; sd and
+ * args are unused. The request is passed to the event entry point of
+ * every active module whose component name was selected for it.
+ */
+void orte_notifier_base_event(int sd, short args, void *cbdata)
+{
+    orte_notifier_request_t *req = (orte_notifier_request_t*)cbdata;
+    char **modules = NULL;
+    orte_notifier_active_module_t *imod;
+    int i;
+
+    /* if no modules are active, then there is nothing to do */
+    if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
+        return;
+    }
+
+    /* check if the severity is >= severity level set for
+     * reporting - note that the severity enum value goes up
+     * as severity goes down */
+    if (orte_notifier_base.severity_level < req->severity ) {
+        return;
+    }
+
+    orte_notifier_base_identify_modules(&modules, req);
+
+    /* no modules selected then nothing to do */
+    if (NULL == modules) {
+        return;
+    }
+
+    for (i=0; NULL != modules[i]; i++) {
+        OPAL_LIST_FOREACH(imod, &orte_notifier_base.modules, orte_notifier_active_module_t) {
+            /* test the entry point we are about to call - a module
+             * may implement log without implementing event */
+            if (NULL != imod->module->event &&
+                0 == strcmp(imod->component->base_version.mca_component_name, modules[i])) {
+                imod->module->event(req);
+            }
+        }
+    }
+    opal_argv_free(modules);
+}
+
+/*
+ * Event callback for job state reports (see ORTE_NOTIFIER_JOB_STATE).
+ * cbdata is the orte_notifier_request_t describing the job state; sd
+ * and args are unused. Modules named by the job's
+ * ORTE_JOB_NOTIFICATIONS attribute are used when it carries a value,
+ * otherwise the usual action/severity selection applies.
+ */
 void orte_notifier_base_report(int sd, short args, void *cbdata)
 {
     orte_notifier_request_t *req = (orte_notifier_request_t*)cbdata;
     char *notifies = NULL;
+    char **modules = NULL;
+    orte_notifier_active_module_t *imod;
+    int i;
 
     /* if no modules are active, then there is nothing to do */
     if (0 == opal_list_get_size(&orte_notifier_base.modules)) {
@@ -98,12 +132,39 @@ void orte_notifier_base_report(int sd, short args, void *cbdata)
     }
 
     /* see if the job requested any notifications */
-    if (!orte_get_attribute(&req->jdata->attributes, ORTE_JOB_NOTIFICATIONS, (void**)notifies, OPAL_STRING)) {
+    if (!orte_get_attribute(&req->jdata->attributes, ORTE_JOB_NOTIFICATIONS, (void**)&notifies, OPAL_STRING)) {
         return;
     }
 
     /* need to process the notification string to get the names of the modules */
-    return;
+    if (NULL != notifies) {
+        /* NOTE(review): assumes the attribute value is a comma-delimited
+         * list of component names and that orte_get_attribute hands back
+         * a copy owned by the caller - confirm before relying on the free */
+        modules = opal_argv_split(notifies, ',');
+        free(notifies);
+    }
+
+    /* nothing usable in the attribute - fall back to the usual selection */
+    if (NULL == modules) {
+        orte_notifier_base_identify_modules(&modules, req);
+
+        /* no modules selected then nothing to do */
+        if (NULL == modules) {
+            return;
+        }
+    }
+
+    for (i=0; NULL != modules[i]; i++) {
+        OPAL_LIST_FOREACH(imod, &orte_notifier_base.modules, orte_notifier_active_module_t) {
+            /* test the entry point we are about to call */
+            if (NULL != imod->module->report &&
+                0 == strcmp(imod->component->base_version.mca_component_name, modules[i])) {
+                imod->module->report(req);
+            }
+        }
+    }
+    opal_argv_free(modules);
 }
 
 const char* orte_notifier_base_sev2str(orte_notifier_severity_t severity)
@@ -121,3 +182,51 @@ const char* orte_notifier_base_sev2str(orte_notifier_severity_t severity)
     }
 }
 
+/*
+ * Work out which notifier components should handle a request.
+ *
+ * An action string carried by the request takes precedence; otherwise
+ * the action list configured for the request's severity is used, and
+ * failing that the default action list. On return *modules is either
+ * NULL (nothing selected) or the result of opal_argv_split, which the
+ * caller releases with opal_argv_free.
+ */
+static void orte_notifier_base_identify_modules(char ***modules,
+                                                orte_notifier_request_t *req)
+{
+    /* do not rely on the caller having initialized the result */
+    *modules = NULL;
+
+    if (NULL != req->action) {
+        *modules = opal_argv_split(req->action, ',');
+    } else {
+        if (ORTE_NOTIFIER_EMERG == req->severity &&
+            (NULL != orte_notifier_base.emerg_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.emerg_actions, ',');
+        } else if (ORTE_NOTIFIER_ALERT == req->severity &&
+                   (NULL != orte_notifier_base.alert_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.alert_actions, ',');
+        } else if (ORTE_NOTIFIER_CRIT == req->severity &&
+                   (NULL != orte_notifier_base.crit_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.crit_actions, ',');
+        } else if (ORTE_NOTIFIER_WARN == req->severity &&
+                   (NULL != orte_notifier_base.warn_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.warn_actions, ',');
+        } else if (ORTE_NOTIFIER_NOTICE == req->severity &&
+                   (NULL != orte_notifier_base.notice_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.notice_actions, ',');
+        } else if (ORTE_NOTIFIER_INFO == req->severity &&
+                   (NULL != orte_notifier_base.info_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.info_actions, ',');
+        } else if (ORTE_NOTIFIER_DEBUG == req->severity &&
+                   (NULL != orte_notifier_base.debug_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.debug_actions, ',');
+        } else if (ORTE_NOTIFIER_ERROR == req->severity &&
+                   (NULL != orte_notifier_base.error_actions)) {
+            *modules = opal_argv_split(orte_notifier_base.error_actions, ',');
+        } else if (NULL != orte_notifier_base.default_actions) {
+            *modules = opal_argv_split(orte_notifier_base.default_actions, ',');
+        }
+    }
+    return;
+}
diff --git a/orte/mca/notifier/notifier.h b/orte/mca/notifier/notifier.h
index feccf7e92c..2070554b58 100644
--- a/orte/mca/notifier/notifier.h
+++ b/orte/mca/notifier/notifier.h
@@ -57,6 +57,10 @@
 
 BEGIN_C_DECLS
 
+/* make the verbose channel visible here so everyone
+ * doesn't have to include notifier/base/base.h */
+extern int orte_notifier_debug_output;
+
 /* The maximum size of any on-stack buffers used in the notifier
  * so we can try to avoid calling malloc in OUT_OF_RESOURCES conditions.
  * The code has NOT been auditied for use of malloc, so this still
@@ -84,6 +88,7 @@ typedef struct {
     orte_notifier_severity_t severity;
     int errcode;
     const char *msg;
+    const char *action;
     time_t t;
 } orte_notifier_request_t;
 OBJ_CLASS_DECLARATION(orte_notifier_request_t);
@@ -98,21 +103,27 @@ typedef int (*orte_notifier_base_module_init_fn_t)(void);
 /* finalize the selected module */
 typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
 
-/* Log an error */
+/* Log an internal error - this will include the job that caused the
+ * error to occur */
 typedef void (*orte_notifier_base_module_log_fn_t)(orte_notifier_request_t *req);
 
+/* Report a system event - e.g., a temperature out-of-bound */
+typedef void (*orte_notifier_base_module_event_fn_t)(orte_notifier_request_t *req);
 
-/* Report a state */
+/* Report a job state */
 typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *req);
 
 
-#define ORTE_NOTIFIER_LOG_ERROR(j, st, s, e, m) \
+#define ORTE_NOTIFIER_INTERNAL_ERROR(j, st, s, e, m) \
     do { \
         orte_notifier_request_t *_n; \
-        opal_output_verbose(2, orte_notifier_base_framework.framework_output, \
-                            "%s notifier:log:error[%s:%d] for job %s error %s severity %s", \
-                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, \
-                            ORTE_JOBID_PRINT((j)->jobid), ORTE_ERROR_NAME((e)), \
+        opal_output_verbose(2, orte_notifier_debug_output, \
+                            "%s notifier:internal:error[%s:%d] " \
+                            "job %s error %s severity %s", \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
+                            __FILE__, __LINE__, \
+                            ORTE_JOBID_PRINT((j)->jobid), \
+                            ORTE_ERROR_NAME((e)), \
                             orte_notifier_base_sev2str(s)); \
         _n = OBJ_NEW(orte_notifier_request_t); \
         _n->jdata = (j); \
@@ -121,6 +132,7 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
         _n->errcode = (e); \
         _n->msg = (m); \
         _n->t = time(NULL); \
+        _n->action = (NULL); \
         /* add the event */ \
         opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
                        OPAL_EV_WRITE, orte_notifier_base_log, (_n)); \
@@ -128,18 +140,21 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
         opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
     } while(0);
 
-#define ORTE_NOTIFIER_REPORT_STATE(j, st, m) \
+#define ORTE_NOTIFIER_JOB_STATE(j, st, m) \
     do { \
         orte_notifier_request_t *_n; \
-        opal_output_verbose(2, orte_notifier_base_framework.framework_output, \
-                            "%s notifier:report:event[%s:%d] for job %s state %s", \
-                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, \
+        opal_output_verbose(2, orte_notifier_debug_output, \
+                            "%s notifier[%s:%d] job %s state %s", \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
+                            __FILE__, __LINE__, \
                             ORTE_JOBID_PRINT((j)->jobid), \
                             orte_job_state_to_str(st)); \
         _n = OBJ_NEW(orte_notifier_request_t); \
         _n->jdata = (j); \
+        _n->state = (st); \
         _n->msg = (m); \
         _n->t = time(NULL); \
+        _n->action = (NULL); \
         /* add the event */ \
         opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
                        OPAL_EV_WRITE, orte_notifier_base_report, (_n)); \
@@ -147,6 +162,32 @@ typedef void (*orte_notifier_base_module_report_fn_t)(orte_notifier_request_t *r
         opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
     } while(0);
 
+/* Post a system event (one not tied to a job) to the notifier event
+ * base: s is the severity, m the message, and a an optional
+ * comma-delimited list of notifier modules that overrides the
+ * severity-based selection (NULL to use that selection) */
+#define ORTE_NOTIFIER_SYSTEM_EVENT(s, m, a) \
+    do { \
+        orte_notifier_request_t *_n; \
+        opal_output_verbose(2, orte_notifier_debug_output, \
+                            "%s notifier:sys:event[%s:%d] event %s", \
+                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), \
+                            __FILE__, __LINE__, \
+                            orte_notifier_base_sev2str(s)); \
+        _n = OBJ_NEW(orte_notifier_request_t); \
+        _n->jdata = (NULL); \
+        _n->state = ORTE_JOB_STATE_UNDEF; \
+        _n->msg = (m); \
+        _n->t = time(NULL); \
+        _n->severity = (s); \
+        _n->action = (a); \
+        /* add the event */ \
+        opal_event_set(orte_notifier_base.ev_base, &(_n)->ev, -1, \
+                       OPAL_EV_WRITE, orte_notifier_base_event, (_n)); \
+        opal_event_set_priority(&(_n)->ev, ORTE_ERROR_PRI); \
+        opal_event_active(&(_n)->ev, OPAL_EV_WRITE, 1); \
+    } while(0);
+
 /*
  * Ver 1.0
  */
@@ -154,6 +195,7 @@ typedef struct {
     orte_notifier_base_module_init_fn_t init;
     orte_notifier_base_module_finalize_fn_t finalize;
     orte_notifier_base_module_log_fn_t log;
+    orte_notifier_base_module_event_fn_t event;
     orte_notifier_base_module_report_fn_t report;
 } orte_notifier_base_module_t;
 
diff --git a/orte/mca/notifier/syslog/notifier_syslog_module.c b/orte/mca/notifier/syslog/notifier_syslog_module.c
index ebaa6becce..be1d34def3 100644
--- a/orte/mca/notifier/syslog/notifier_syslog_module.c
+++ b/orte/mca/notifier/syslog/notifier_syslog_module.c
@@ -45,6 +45,7 @@
 static int init(void);
 static void finalize(void);
 static void mylog(orte_notifier_request_t *req);
+static void myevent(orte_notifier_request_t *req);
 static void myreport(orte_notifier_request_t *req);
 
 /* Module def */
@@ -52,6 +53,7 @@ orte_notifier_base_module_t orte_notifier_syslog_module = {
     init,
     finalize,
     mylog,
+    myevent,
     myreport
 };
 
@@ -90,7 +92,50 @@ static void mylog(orte_notifier_request_t *req)
            (NULL == req->msg) ? "" : req->msg);
 }
 
-static void myreport(orte_notifier_request_t *req)
+/* Write a system event to syslog at the request's severity. A missing
+ * message is logged as an empty string. */
+static void myevent(orte_notifier_request_t *req)
 {
+    char tod[48];
+
+    opal_output_verbose(5, orte_notifier_base_framework.framework_output,
+                        "notifier:syslog:myevent function called with severity %d and messg %s",
+                        (int)req->severity, (NULL == req->msg) ? "" : req->msg);
+    /* ctime_r leaves the buffer unspecified on failure, so make sure
+     * we always hold a valid string */
+    if (NULL == ctime_r(&req->t, tod)) {
+        tod[0] = '\0';
+    }
+    /* trim the newline that ctime_r appends */
+    tod[strcspn(tod, "\n")] = '\0';
+
+    syslog(req->severity, "[%s]%s SET EVENT : %s", tod,
+           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+           (NULL == req->msg) ? "" : req->msg);
+}
+
+/* Write a job state report to syslog at the request's severity. A
+ * missing message is logged as an empty string. */
+static void myreport(orte_notifier_request_t *req)
+{
+    char tod[48];
+
+    opal_output_verbose(5, orte_notifier_base_framework.framework_output,
+                        "notifier:syslog:myreport function called with severity %d state %s and messg %s",
+                        (int)req->severity, orte_job_state_to_str(req->state),
+                        (NULL == req->msg) ? "" : req->msg);
+    /* ctime_r leaves the buffer unspecified on failure, so make sure
+     * we always hold a valid string */
+    if (NULL == ctime_r(&req->t, tod)) {
+        tod[0] = '\0';
+    }
+    /* trim the newline that ctime_r appends */
+    tod[strcspn(tod, "\n")] = '\0';
+
+    syslog(req->severity, "[%s]%s JOBID %s REPORTS STATE %s: %s", tod,
+           ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+           ORTE_JOBID_PRINT(req->jdata->jobid),
+           orte_job_state_to_str(req->state),
+           (NULL == req->msg) ? "" : req->msg);
 }
 