diff --git a/opal/util/opal_sos.c b/opal/util/opal_sos.c new file mode 100644 index 0000000000..fd560d85dd --- /dev/null +++ b/opal/util/opal_sos.c @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007 Los Alamos National Security, LLC. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include +#include +#ifdef HAVE_STDARG_H +#include +#endif +#ifdef HAVE_STDLIB_H +#include +#endif + +#include "opal/util/opal_sos.h" +#include "opal/constants.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/class/opal_hash_table.h" +#include "opal/util/stacktrace.h" +#include "opal/util/show_help.h" + +/** Global variables */ +opal_hash_table_t opal_sos_table; +opal_mutex_t opal_sos_table_lock; +bool opal_sos_print_low; + +/* Local variables */ +static bool opal_sos_initialized = false; +static const char *dash_line = "--------------------------------------------------------------------------"; +static const char *stackhdr = "[STACK TRACE]:\n"; + +/* Local functions */ +static void opal_sos_error_construct(opal_sos_error_t *obj); +static void opal_sos_error_destruct(opal_sos_error_t *obj); + +/** OPAL SOS callback function pointers */ +static opal_sos_print_callback_fn_t cur_print_callback; +static opal_sos_reporter_callback_fn_t cur_reporter_callback; +/* static opal_sos_print_callback_fn_t prev_print_callback; */ +static opal_sos_reporter_callback_fn_t prev_reporter_callback; + +OBJ_CLASS_INSTANCE(opal_sos_error_t, + opal_object_t, + opal_sos_error_construct, + opal_sos_error_destruct); + +/** + * Constructor + */ +static void opal_sos_error_construct(opal_sos_error_t *obj) +{ + obj->errnum = 0; + obj->file = NULL; + obj->line = 0; + obj->func = NULL; + obj->msg = NULL; + obj->prev = obj->next = OPAL_SOS_ERR_BASE; +} + +/** + * Destructor + */ +static void opal_sos_error_destruct(opal_sos_error_t *obj) +{ + if (NULL != obj->file) { + free(obj->file); + } + + if (NULL != obj->func) { + free(obj->func); + } + + if (NULL != obj->msg) { + free(obj->msg); + } +} + +/** + * Initialize the OPAL SOS interface + * + */ +void opal_sos_init(void) +{ + int value; + + if (opal_sos_initialized) { + return; + } + + mca_base_param_reg_int_name("opal", "sos_print_low", + "Set to non-zero to enable the print-at-bottom" + " preference for OPAL SOS. Enabling this option prints" + " out the errors, warnings or info messages as" + " soon as they are encountered.", + false, false, (int)false, &value); + + opal_sos_print_low = OPAL_INT_TO_BOOL(value); + + OBJ_CONSTRUCT(&opal_sos_table, opal_hash_table_t); + opal_hash_table_init(&opal_sos_table, OPAL_SOS_ERR_TABLE_SIZE); + OBJ_CONSTRUCT(&opal_sos_table_lock, opal_mutex_t); + + opal_sos_reg_reporter_callback(opal_sos_print_error, &prev_reporter_callback); + opal_sos_initialized = true; + return; +} + +/** + * Finalize the OPAL SOS interface + * + */ +void opal_sos_finalize(void) +{ + OBJ_DESTRUCT(&opal_sos_table); + OBJ_DESTRUCT(&opal_sos_table_lock); + opal_sos_initialized = false; + return; +} + +/** + * Free all the SOS errors represented by the error code pointed to by \c errnum + * + */ +void opal_sos_free(int *errnum) +{ + opal_sos_error_t *opal_error, *attached_error; + int err, attached_errnum; + + if (NULL == errnum) { + return; + } else if (true == OPAL_SOS_IS_NATIVE(*errnum)) { + return; + } else { + err = *errnum; + } + + *errnum = OPAL_SOS_GET_ERROR_CODE(err); + + do { + /* Look for attached errors */ + if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(err))) { + OPAL_THREAD_LOCK(&opal_sos_table_lock); + if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, + attached_errnum, + (void **)&attached_error)) { + goto cleanup; + } + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + + /* If there's an attached error trace, free it! */ + if (NULL != attached_error) { + attached_errnum = attached_error->errnum; + opal_sos_free(&attached_errnum); + } + } + + OPAL_THREAD_LOCK(&opal_sos_table_lock); + if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, + OPAL_SOS_GET_INDEX(err), + (void **)&opal_error)) { + goto cleanup; + } + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + if (NULL == opal_error) { + goto cleanup; + } + + opal_sos_error_destruct(opal_error); + /* Remove the entry from the SOS table */ + OPAL_THREAD_LOCK(&opal_sos_table_lock); + opal_hash_table_remove_value_uint32(&opal_sos_table, OPAL_SOS_GET_INDEX(err)); + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + + err = opal_error->prev; + } while (OPAL_SOS_ERR_BASE != err); + +cleanup: + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); +} + +opal_sos_error_t * +opal_sos_build_error(int errnum, bool show_stack, const char *errmsg, ...) +{ + opal_sos_error_t *opal_error; + char *stackframe, msg[OPAL_SOS_MAX_ERR_LEN]; + va_list arglist; + int ret_errno = 0, len; + + if (!opal_sos_initialized) { + opal_sos_init(); + } + + opal_error = OBJ_NEW(opal_sos_error_t); + if (NULL == opal_error) { + return NULL; /* OPAL_ERR_OUT_OF_RESOURCE */ + } + + va_start(arglist, errmsg); + len = vsnprintf(msg, OPAL_SOS_MAX_ERR_LEN, errmsg, arglist); + va_end(arglist); + + if ((true == show_stack) && + (NULL != (stackframe = opal_stackframe_output_string()))) { + len += strlen(stackhdr) + strlen(stackframe) + 2; + if (len > OPAL_SOS_MAX_ERR_LEN) + len = OPAL_SOS_MAX_ERR_LEN; + + opal_error->msg = (char *) malloc(len); + if (NULL == opal_error->msg) { + return NULL; + } + snprintf(opal_error->msg, len, "%s\n%s%s", msg, stackhdr, stackframe); + } else { + opal_error->msg = strdup(msg); + } + + /* Check if errnum is a native error code and encode it into + the encoded error code if it is native */ + if (OPAL_SOS_IS_NATIVE(errnum)) { + OPAL_SOS_SET_ERROR_CODE(ret_errno, errnum); + } else { + /* Extract the native error code from the encoded error and + encode it back again into the newly encoded error code */ + OPAL_SOS_SET_ERROR_CODE(ret_errno, OPAL_SOS_GET_ERROR_CODE(errnum)); + opal_error->prev = errnum; + } + + opal_error->errnum = ret_errno; + return opal_error; +} + +int opal_sos_reporter(const char *file, int line, const char *func, + opal_sos_severity_t severity, opal_sos_error_t *opal_error) +{ + opal_sos_error_t *prev_error; + int ret_errno = 0, hash; + + if (NULL == opal_error) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + + /* Doing more strict validation here since if either of the file, + * func or msg are not known we replace it by to avoid any issues + * during dss pack/unpack + */ + opal_error->file = (NULL != file)?strdup(file):strdup(""); + opal_error->func = (NULL != func)?strdup(func):strdup(""); + opal_error->line = line; + + ret_errno = opal_error->errnum; + /* Encode the severity level into the return error code */ + OPAL_SOS_SET_SEVERITY(ret_errno, severity); + hash = opal_sos_hash_error(opal_error); + OPAL_SOS_SET_INDEX(ret_errno, hash); + opal_error->errnum = ret_errno; + + if (opal_sos_print_low) { + opal_sos_report_error(opal_error); + } + + /* Add the error object to the error table */ + OPAL_THREAD_LOCK(&opal_sos_table_lock); + + if (OPAL_SUCCESS != + opal_hash_table_set_value_uint32(&opal_sos_table, + OPAL_SOS_GET_INDEX(ret_errno), + (void *)opal_error)) { + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + OBJ_DESTRUCT(opal_error); + return OPAL_ERROR; + } + + /* Get the previous error in the error call stack and update + its next error pointer */ + prev_error = NULL; + opal_hash_table_get_value_uint32(&opal_sos_table, + OPAL_SOS_GET_INDEX(opal_error->prev), + (void **)&prev_error); + if (NULL != prev_error) { + prev_error->next = opal_error->errnum; + } + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + + return ret_errno; +} + +void +opal_sos_report_error(opal_sos_error_t *error) +{ + int severity, errnum, ret; + char *pretty_error; + + if (NULL == error) + return; + + severity = OPAL_SOS_GET_SEVERITY(error->errnum); + + /* An OPAL SOS encoded error number holds no meaning outside + * the context of Open MPI. We convert it back to the native + * error code before reporting it. */ + if (true == OPAL_SOS_IS_NATIVE(error->errnum)) { + errnum = error->errnum; + } else { + errnum = OPAL_SOS_GET_ERROR_CODE(error->errnum); + } + + /* Prettify the error for printing it locally */ + ret = opal_sos_prettify_error(error->msg, &pretty_error); + + (*cur_reporter_callback)(severity, errnum, "<%s> at %s:%d:%s():\n%s", + opal_sos_severity2str(severity), error->file, + error->line, error->func, + ((0 > ret) ? error->msg : pretty_error)); + + if (ret > 0) { + free(pretty_error); + } + + /* Call the previous reporter callback which should be the selected + * ORTE notifier components */ + if (NULL != prev_reporter_callback) { + prev_reporter_callback(severity, errnum, "<%s> at %s:%d:%s():\n%s", + opal_sos_severity2str(severity), error->file, + error->line, error->func, error->msg); + } +} + +void opal_sos_print(int errnum, bool show_history) +{ + opal_sos_error_t *opal_error, *prev_opal_error, *attached_error; + int tmp, attached_errnum, prev_severity, severity; + + opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); + tmp = errnum; + prev_opal_error = NULL; + do { + /* If there is an error attached to this error, print it out. */ + if (0 != (attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(errnum))) { + OPAL_THREAD_LOCK(&opal_sos_table_lock); + if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table, + attached_errnum, + (void **)&attached_error)) { + goto cleanup; + } + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + + if (NULL != attached_error) { + opal_sos_print(attached_error->errnum, show_history); + } + } + + OPAL_THREAD_LOCK(&opal_sos_table_lock); + if (OPAL_SUCCESS != + opal_hash_table_get_value_uint32(&opal_sos_table, + OPAL_SOS_GET_INDEX(errnum), + (void **)&opal_error)) { + goto cleanup; + } + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); + if (NULL == opal_error) { + return; + } + + if (NULL != prev_opal_error) { + prev_severity = OPAL_SOS_GET_SEVERITY(prev_opal_error->errnum); + severity = OPAL_SOS_GET_SEVERITY(errnum); + + /* If show_history is enabled, or if the preceeding error + was of higher severity, then report the error */ + if (show_history || (prev_severity <= severity)) + /* Print the error denoted by errnum. */ + opal_sos_report_error(prev_opal_error); + } + + prev_opal_error = opal_error; + /* Get the previous error */ + errnum = opal_error->prev; + /* Terminating condition */ + if (OPAL_SOS_ERR_BASE == errnum) { + opal_sos_report_error(opal_error); + } + } while (errnum != OPAL_SOS_ERR_BASE); + opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line); + errnum = tmp; + return; + +cleanup: + OPAL_THREAD_UNLOCK(&opal_sos_table_lock); +} + +void opal_sos_print_error(opal_sos_severity_t severity, int errnum, const char *errmsg, ...) +{ + va_list arglist; + va_start(arglist, errmsg); + opal_show_vhelp("opal_sos_reporter.txt", "general message", false, arglist); + va_end(arglist); +} + +inline void opal_sos_log(int errnum) +{ + opal_sos_print(errnum, false); + opal_sos_free(&errnum); +} + +int opal_sos_prettify_error(const char *error, char **pretty_error) +{ + char *str, *token, *saveptr, *errdup; + const char *prefix = "\n| | "; + int len, plen, left; + + if (NULL == error) { + return OPAL_ERROR; + } + + *pretty_error = (char *) malloc(OPAL_SOS_MAX_ERR_LEN); + if (NULL == *pretty_error) { + return OPAL_ERR_OUT_OF_RESOURCE; + } + *(*pretty_error) = '\0'; + + plen = strlen(prefix); + + if (NULL != (errdup = strdup(error))) { + for (str = errdup, len = 0; len < OPAL_SOS_MAX_ERR_LEN; str = NULL) { + if (NULL == (token = strtok_r(str, "\n", &saveptr))) { + break; + } + + left = strlen(token); + if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { + left = OPAL_SOS_MAX_ERR_LEN - len; + } + strncat(*pretty_error, token, left); + len += left; + + left = plen; + if ((len + left) > OPAL_SOS_MAX_ERR_LEN) { + left = OPAL_SOS_MAX_ERR_LEN - len; + } + strncat(*pretty_error, prefix, left); + len += left; + } + free(errdup); + errdup = NULL; + } + + return len; +} + +const char *opal_sos_severity2str(opal_sos_severity_t severity) +{ + switch(severity) { + case OPAL_SOS_SEVERITY_EMERG: return "EMERGENCY"; + case OPAL_SOS_SEVERITY_ALERT: return "ALERT MESSAGE"; + case OPAL_SOS_SEVERITY_CRIT: return "CRITICAL MESSAGE"; + case OPAL_SOS_SEVERITY_ERROR: return "ERROR"; + case OPAL_SOS_SEVERITY_WARN: return "WARNING"; + case OPAL_SOS_SEVERITY_NOTICE: return "NOTICE"; + case OPAL_SOS_SEVERITY_INFO: return "INFO MESSAGE"; + case OPAL_SOS_SEVERITY_DEBUG: return "DEBUG MESSAGE"; + default: return "UNKNOWN ERROR"; + } +} + +int opal_sos_hash_error(opal_sos_error_t *error) +{ + int hash, c; + char *msg; + + /* Naive string hash function to create a key based on the error + details, namely length of the file name, length of the function + name and the sum of the characters in the error message */ + + hash = error->errnum; + if (NULL != error->file) { + hash += strlen(error->file); + } + if (NULL != error->func) { + hash += strlen(error->func); + } + if (NULL != error->msg) { + msg = error->msg; + while ('\0' != (c = *msg++)) { + hash += c; + } + } + + return (hash & (OPAL_SOS_ERR_TABLE_SIZE - 1)); +} + +int opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, + opal_sos_print_callback_fn_t *prev_func) +{ + /* Preserve the previous print callback */ + *prev_func = cur_print_callback; + + /* Update the current print callback */ + cur_print_callback = new_func; + return OPAL_SUCCESS; +} + +int opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, + opal_sos_reporter_callback_fn_t *prev_func) +{ + /* Preserve the previous reporter callback */ + *prev_func = cur_reporter_callback; + + /* Update the current reporter callback */ + cur_reporter_callback = new_func; + return OPAL_SUCCESS; +} diff --git a/opal/util/opal_sos.h b/opal/util/opal_sos.h new file mode 100644 index 0000000000..53f4c8e503 --- /dev/null +++ b/opal/util/opal_sos.h @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OPAL_SOS_H +#define OPAL_SOS_H + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifdef HAVE_SYSLOG_H +#include +#endif + +#include "opal/class/opal_object.h" +#include "opal/class/opal_hash_table.h" +#include "opal/threads/mutex.h" +#include "opal/util/output.h" + +#ifdef __STDC_VERSION__ +# if __STDC_VERSION__ < 199901L +# if __GNUC__ >= 2 +# define OPAL_SOS_FUNCTION __FUNCTION__ +# else +# define OPAL_SOS_FUNCION "" +# endif +# else +# define OPAL_SOS_FUNCTION __func__ +# endif +#else +# define OPAL_SOS_FUNCTION __func__ +#endif + +/* Internal use only */ +#define OPAL_SOS_ERR_BASE OPAL_SUCCESS + +/** + * Size of the OPAL SOS error table. + * + * Since the index into the error table that is encoded in the error + * code is 9-bit long, setting a higher value than (1 << 9) would make + * no difference at all. + */ +#define OPAL_SOS_ERR_TABLE_SIZE 512 + +/** + * Maximum length for the error string stored per error code in the + * OPAL SOS error table. + */ +#define OPAL_SOS_MAX_ERR_LEN 1024 + +/** + * Reports an error to OPAL SOS reporter. + * + * Encodes an informational message with severity \c severity and + * other passed arguments like errnum, errmsg etc. It also remembers + * the line number, file name and the function name where the error + * has occurred. + * If the MCA parameter \c opal_sos_print_low is set, the error message + * is displayed on stderr using the "show help" subsystem. By default, + * informational messages are not printed out on stderr. + * If \c show_stack is set, the stacktrace is saved and/or printed + * along with the corresponding \c errmsg. + */ +#define OPAL_SOS_REPORT(severity, arg) opal_sos_reporter(__FILE__, __LINE__, \ + OPAL_SOS_FUNCTION, \ + severity, \ + opal_sos_build_error arg) + +/** + * Print or store an event with the maximum severity (EMERG). + */ +#define OPAL_SOS_EMERG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_EMERG, arg) + +/** + * Report an event of severity "ALERT". + */ +#define OPAL_SOS_ALERT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ALERT, arg) + +/** + * Report events with severity marked as "CRITICAL". + */ +#define OPAL_SOS_CRIT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_CRIT, arg) + +/** + * Prints and/or logs an error. + * This function can be used to log or print error events. + */ +#define OPAL_SOS_ERROR(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ERROR, arg) + +/** + * Prints and/or logs a warning. + * + * This function is similar to OPAL_SOS_INFO but with a higher + * severity. These events are printed out on the output stream + * by default. + */ +#define OPAL_SOS_WARN(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_WARN, arg) + +/** + * Report an error event with severity "NOTICE". + */ +#define OPAL_SOS_NOTICE(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_NOTICE,arg) + +/** + * Prints or logs an informational message in the OPAL SOS framework. + * Events with this severity are not printed, by default. However, + * they are still stored in the SOS table. + */ +#define OPAL_SOS_INFO(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_INFO, arg) + +/** + * Log debug events in the SOS framework. + */ +#define OPAL_SOS_DEBUG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_DEBUG, arg) + +/** + * Frees all the (entire stack of) OPAL SOS error objects associated + * with the encoded error code obtained after dereferencing the + * pointer \c errnum. + */ +#define OPAL_SOS_FREE(perrnum) opal_sos_free(perrnum) + +/** + * Print the warnings/errors/informational messages previously logged + * in to the SOS framework. + * + * This function prints the error details encoded by \c errnum. + * If \c show_history is true, the entire history for the error + * represented by \c errnum is printed on the output stream. + */ +#define OPAL_SOS_PRINT(errnum, show_history) \ + opal_sos_print(errnum, show_history) + +/** + * Attach the history from one error code to another error code + * Returns the target encoded error \c errtgt with history of \c + * errnum associated to it. + */ +#define OPAL_SOS_ATTACH(errtgt, errnum) \ + (errtgt = -((-errtgt & ~0xFF80000L) | \ + ((OPAL_SOS_GET_INDEX(errnum) & 0x1FFL) * 0x80000L))) + +/** + * Returns the index of the error attached to errnum using OPAL_SOS_ATTACH(). + */ +#define OPAL_SOS_GET_ATTACHED_INDEX(errnum) ((int) ((-errnum & 0xFF80000L) >> 19)) + +/** + * Returns the native error code for the given encoded error code \c + * errnum. \c errnum can be a native error code itself. + */ +#define OPAL_SOS_GET_ERROR_CODE(errnum) ((int) -(-errnum & 0x3FFL)) + +/** + * Sets the native error code for the potentially encoded error code. + * + * The lower 10 bits are reserved for the native error code. This + * macro sets the lower 10 bits of errnum to nativeerr. + */ +#define OPAL_SOS_SET_ERROR_CODE(errnum, nativeerr) \ + (errnum = -((-errnum & ~0x3FFL) | (-nativeerr & 0x3FFL))) + +/** + * Macro to check if the error encoded by \c errnum is a native error + * or an OPAL SOS encoded error. + */ +#define OPAL_SOS_IS_NATIVE(errnum) ((-errnum & ~0x3FFL) == 0) + +/** + * Returns the severity level for the potentially encoded error code. + * + * The severity is encoded in the last three bits of the first nibble. + */ +#define OPAL_SOS_GET_SEVERITY(errnum) ((int)((-errnum >> 28) & 0x7L)) + +/** + * Sets the severity level for the given error code \c errnum. + * + * This macros do not do strict error checking of the specified + * severity levels. + */ +#define OPAL_SOS_SET_SEVERITY(errnum, severity) \ + (errnum = -((-errnum & ~0x70000000L) | ((severity & 0x7L) * 0x10000000L))) + +/** + * Macro to get the encoded error severity level as a string. + * + * This macro accepts the argument \c severity and calls the corresponding + * function opal_sos_severity2str to convert it to a string. The result + * is returned in a static buffer that should not be freed with free(). + */ +#define OPAL_SOS_SEVERITY2STR(severity) opal_sos_severity2str(severity) + +/** + * Log an encoded error \c errnum. + * + * This macro prints out and consequently frees the entire stack of + * errors associated with the \c errnum. + */ +#define OPAL_SOS_LOG(errnum) opal_sos_log(errnum) + +/** + * \internal + * Returns the index into the error table of the error encoded by \c errnum. + * + * The index is 9-bit long stored from bit 11 to bit 20 in the encoded + * error code. + */ +#define OPAL_SOS_GET_INDEX(errnum) ((int)((-errnum & 0x7FC00L) >> 10)) + +/** + * \internal + * Sets the index into the error table for the error encoded by \c errnum. + */ +#define OPAL_SOS_SET_INDEX(errnum, index) \ + (errnum = -((-errnum & ~0x7FC00L) | ((index & 0x1FFL) * 0x400L))) + +BEGIN_C_DECLS + +/** This MCA parameter sos_print_low can be set to non-zero to enable + * the print-at-bottom preference for OPAL SOS. */ +OPAL_DECLSPEC extern bool opal_sos_print_low; + +/* Severity levels for OPAL SOS */ +typedef enum { + OPAL_SOS_SEVERITY_EMERG = LOG_EMERG, + OPAL_SOS_SEVERITY_ALERT = LOG_ALERT, + OPAL_SOS_SEVERITY_CRIT = LOG_CRIT, + OPAL_SOS_SEVERITY_ERROR = LOG_ERR, + OPAL_SOS_SEVERITY_WARN = LOG_WARNING, + OPAL_SOS_SEVERITY_NOTICE = LOG_NOTICE, + OPAL_SOS_SEVERITY_INFO = LOG_INFO, + OPAL_SOS_SEVERITY_DEBUG = LOG_DEBUG +} opal_sos_severity_t; + +typedef struct opal_sos_error_t { + /** Class parent */ + opal_object_t super; + + /** + * The encoded error code for a given type of error. + * + * errnum encodes a native error code (lower 10 bits) with the + * current severity (higher 2 bits) and an index into the error + * table along with the associated error, if there is one. + */ + int errnum; + + /** File in which the error occured */ + char *file; + + /** Line number on which the error was encountered */ + int line; + + /** This is an optional parameter that indicates the function in + which the error occured */ + char *func; + + /** The actual error message or string for the error indicated by + \c errnum */ + char *msg; + + /** Encoded error numbers of the previous and the next error. + These are used are used to maintain the history of an error. + The complete history of an error can be printed later using + OPAL_SOS_PRINT() */ + int prev; + int next; +} opal_sos_error_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_sos_error_t); + +/** + * Signature for OPAL SOS print function callback type. + */ +typedef void (*opal_sos_print_callback_fn_t) (int errcode); + +/** + * Signature for OPAL SOS reporter function callback type. + */ +typedef void (*opal_sos_reporter_callback_fn_t) (opal_sos_severity_t severity, int errcode, + const char *msg, ...) + #if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR + __opal_attribute_format__(__printf__, 3, 4) + #endif + ; + +/** + * A global handle that points to the local OPAL SOS table. + * This is used by the notifier components to reference the local OPAL + * SOS table, especially for packing/unpacking and sending it over to + * the HNP. + */ +OPAL_DECLSPEC extern opal_hash_table_t opal_sos_table; + +/** + * A global handle that points to the OPAL SOS table lock. + * + */ +OPAL_DECLSPEC extern opal_mutex_t opal_sos_table_lock; + +/** + * \internal + * + * Initialize OPAL SOS. + * + * This function initializes and sets up the structures required to + * track the data handled by OPAL SOS. It is invoked by + * opal_util(). + */ +void opal_sos_init(void); + +/** + * \internal + * + * Shut down OPAL SOS. + * + * Invoked by opal_finalize() to deallocate the structures needed by + * OPAL SOS. + */ +void opal_sos_finalize(void); + +/** + * Prints or relays the error locally or using the selected notifier + * components. + */ +void +opal_sos_report_error(opal_sos_error_t *error); + +/** + * Builds an OPAL SOS error object given the parameters errnum, + * show_stack and errmsg. + * NOTE: This function only partially populates the SOS error object + * structure, setting the error message details but nothing about where + * the error occurred. Filling up the rest of the error object is left + * to OPAL SOS reporter which then handles the error appropriately. + * + * @param errnum + * @param show_stack + * @param errmsg + * + * @return + */ +OPAL_DECLSPEC opal_sos_error_t * +opal_sos_build_error(int errnum, bool show_stack, + const char *errmsg, ...) + #if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR + __opal_attribute_format__(__printf__, 3, 4) + #endif + ; + +/** + * OPAL SOS reporter logs the error in the OPAL SOS error table or + * prints it out depending on the associated reporter callback. It can + * also relay the error messages to the selected notifier components + * using the OPAL SOS reporter callback interface. + * + * @param file + * @param line + * @param func + * @param opal_error + * + * @return encoded error code + */ +OPAL_DECLSPEC int opal_sos_reporter(const char *file, int line, const char *func, + opal_sos_severity_t severity, + opal_sos_error_t *opal_error); + +/** + * Prints the error encoded by the error number \c errnum + * + * @param errnum + * @param show_history + * + */ +OPAL_DECLSPEC void opal_sos_print(int errnum, bool show_history); + +OPAL_DECLSPEC int opal_sos_prettify_error(const char *error, char **pretty_error); + +/** + * Prints a single error represented by the OPAL SOS error object + * opal_sos_error_t. + */ +OPAL_DECLSPEC void opal_sos_print_error(opal_sos_severity_t severity, + int errnum, const char *errmsg, ...) + #if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR + __opal_attribute_format__(__printf__, 3, 4) + #endif + ; + +/** + * Frees the error object represented by the error code \c errnum. + */ +OPAL_DECLSPEC void opal_sos_free(int *errnum); + +/** + * Logs (prints and frees) the error object represented by \c errnum. + */ +OPAL_DECLSPEC void opal_sos_log(int errnum); + +/** + * Returns the OPAL SOS severity level as a string. + * + */ +const char *opal_sos_severity2str(opal_sos_severity_t severity); + +/** + * \internal + * Return a unique key into the hash table (opal_sos_error_table) + * depending on the type and location of the error. + * + */ +int opal_sos_hash_error(opal_sos_error_t *error); + +/** + * Registers a print callback function for OPAL_SOS_PRINT() + */ +OPAL_DECLSPEC int +opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func, + opal_sos_print_callback_fn_t *prev_func); + +/** + * Registers a reporter callback function for OPAL_SOS_INFO(), + * OPAL_SOS_WARN() and OPAL_SOS_ERROR() + */ +OPAL_DECLSPEC int +opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func, + opal_sos_reporter_callback_fn_t *prev_func); + +END_C_DECLS + +#endif /* OPAL_SOS_H */ diff --git a/opal/util/opal_sos_reporter.txt b/opal/util/opal_sos_reporter.txt new file mode 100644 index 0000000000..8b68da7602 --- /dev/null +++ b/opal/util/opal_sos_reporter.txt @@ -0,0 +1,36 @@ +# -*- text -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for OPAL SOS error messages. +# +# FORMAT: +# filename:linenum:functionname +# error message string +# stacktrace (optional) +[general message] +| |--<%s> at %s:%d:%s(): +| | %s +# +[msg header] +| %s +# +# We marshall all the parameters into a single message when we +# relay it to the notifier. +[notifier message] +%s diff --git a/test/util/opal_sos.c b/test/util/opal_sos.c new file mode 100644 index 0000000000..a6e0f7eedf --- /dev/null +++ b/test/util/opal_sos.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2005 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include +#include +#include +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#ifdef HAVE_NETINET_IN_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#include + +#include "support.h" +#include "opal/constants.h" +#include "opal/runtime/opal.h" +#include "opal/util/opal_sos.h" +#include "opal/util/show_help.h" +#include "opal/util/output.h" +#include "orte/runtime/runtime.h" +#include "orte/constants.h" +#include "ompi/constants.h" + +static bool opal_sos_test(void); + +int +main(int argc, char *argv[]) +{ + opal_init(&argc, &argv); + test_init("opal_sos test suite"); + opal_sos_test(); + opal_finalize(); + test_finalize(); + return 0; +} + +/** OPAL_SOS_* macro test */ +static bool opal_sos_test(void) +{ + int errnum1 = 0, errnum2 = 0; + char *err_str; + + /* Checking for the correctness of GET_ and SET_ error code + * operations */ + errnum1 = OPAL_SOS_GET_ERROR_CODE(OMPI_ERR_OUT_OF_RESOURCE); + test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE == errnum1); + + OPAL_SOS_SET_ERROR_CODE(errnum1, OMPI_ERR_IN_ERRNO); + test_verify("failed", OMPI_ERR_IN_ERRNO == + OPAL_SOS_GET_ERROR_CODE(errnum1)); + + /* Check if OMPI_ERR_OUT_OF_RESOURCE is a native error code or + * not. Since OMPI_ERR_OUT_OF_RESOURCE is native, this should + * return true. */ + test_verify("failed", true == + OPAL_SOS_IS_NATIVE(OMPI_ERR_OUT_OF_RESOURCE)); + + test_verify("failed", true == OPAL_SOS_IS_NATIVE(errnum1)); + + /* Encode a native error (OMPI_ERR_OUT_OF_RESOURCE) by + * logging it in the SOS framework using one of the SOS + * reporter macros. This returns an encoded error code + * (errnum1) with information about the native error such + * as the severity, the native error code, the attached + * error index etc. */ + errnum1 = OPAL_SOS_INFO((OMPI_ERR_OUT_OF_RESOURCE, false, + "Error %d: out of resource", + OMPI_ERR_OUT_OF_RESOURCE)); + + /* Check if errnum1 is native or not. This should return false */ + test_verify("failed", false == OPAL_SOS_IS_NATIVE(errnum1)); + test_verify("failed", + OPAL_SOS_SEVERITY_INFO == OPAL_SOS_GET_SEVERITY(errnum1)); + + /* Extract the native error code out of errnum1. This should + * return the encoded native error code associated with errnum1 + * (i.e. OMPI_ERR_OUT_OF_RESOURCE). */ + test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE == + OPAL_SOS_GET_ERROR_CODE(errnum1)); + + /* We log another error event as a child of the previous error + * errnum1. In the process, we decide to raise the severity + * level from INFO to WARN. */ + err_str = opal_output_string(0, 0, "my error string -100"); + errnum1 = OPAL_SOS_WARN((errnum1, false, err_str)); + test_verify("failed", + OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum1)); + + test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE == + OPAL_SOS_GET_ERROR_CODE(errnum1)); + free(err_str); + + /* Let's report another event with severity ERROR using + * OPAL_SOS_ERROR() and in effect promote errnum1 to + * severity 'ERROR'. */ + err_str = opal_show_help_string("help-opal-util.txt", + "stacktrace signal override", + false, 10, 10, 10, "15"); + errnum1 = OPAL_SOS_ERROR((errnum1, false, err_str)); + test_verify("failed", + OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum1)); + free(err_str); + + /* Check the native code associated with the previously encoded + * error. This should still return (OMPI_ERR_OUT_OF_RESOURCE) + * since the entire error history originates from the native + * error OMPI_ERR_OUT_OF_RESOURCE */ + test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE == + OPAL_SOS_GET_ERROR_CODE(errnum1)); + + /* We start off another error history stack originating with a + * native error, ORTE_ERR_FATAL. */ + asprintf(&err_str, "Fatal error occurred in ORTE %d", errnum1); + errnum2 = OPAL_SOS_ERROR((ORTE_ERR_FATAL, true, err_str)); + free(err_str); + test_verify("failed", + OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum2)); + test_verify("failed", OMPI_ERR_FATAL == + OPAL_SOS_GET_ERROR_CODE(errnum2)); + + /* Registering another error with severity ERROR. + * There is no change in the severity */ + errnum2 = OPAL_SOS_WARN((errnum2, false, "this process must die.")); + test_verify("failed", + OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum2)); + test_verify("failed", OMPI_ERR_FATAL == + OPAL_SOS_GET_ERROR_CODE(errnum2)); + + /* We attach the two error traces originating from errnum1 + * and errnum2. The "attached error index" in errnum1 is + * set to errnum2 to indicate that the two error stacks + * are forked down from this point on. */ + OPAL_SOS_ATTACH(errnum1, errnum2); + + /* Print out the entire error event history originating from errnum1 */ +#if 0 + printf("<------ BEGIN output of OPAL SOS error message ------->\n"); + OPAL_SOS_PRINT(errnum1, true); + printf("<------ END output of OPAL SOS error message ------->\n"); +#endif + test_success(); + + /* Cleanup */ + OPAL_SOS_FREE(&errnum1); + OPAL_SOS_FREE(&errnum2); + + return true; +}