1
1

Merge OPAL SOS into the trunk.

The OPAL SOS framework tries to meet the following objectives:

 * reduce the cascading error messages and the amount of code needed to print an error message.
 * build and aggregate stacks of encountered errors and associate related individual errors with each other.
 * allow registration of custom callbacks to intercept error events.

For more information, refer to
https://svn.open-mpi.org/trac/ompi/wiki/ErrorMessages

This commit was SVN r23158.
Этот коммит содержится в:
Abhishek Kulkarni 2010-05-17 22:51:52 +00:00
родитель 9c5860706f
Коммит 4e33e6aeaa
4 изменённых файлов: 1187 добавлений и 0 удалений

531
opal/util/opal_sos.c Обычный файл
Просмотреть файл

@ -0,0 +1,531 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <errno.h>
#include <stdio.h>
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#include "opal/util/opal_sos.h"
#include "opal/constants.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/class/opal_hash_table.h"
#include "opal/util/stacktrace.h"
#include "opal/util/show_help.h"
/** Global variables */
/* Table of recorded SOS error objects, keyed by the index encoded in the error code */
opal_hash_table_t opal_sos_table;
/* Lock taken around accesses to opal_sos_table */
opal_mutex_t opal_sos_table_lock;
/* Mirrors the "sos_print_low" MCA parameter: report errors as soon as they are recorded */
bool opal_sos_print_low;
/* Local variables */
/* Set by opal_sos_init(), cleared by opal_sos_finalize() */
static bool opal_sos_initialized = false;
/* Separator line passed to the "msg header" help topic by opal_sos_print() */
static const char *dash_line = "--------------------------------------------------------------------------";
/* Header inserted between an error message and its stack trace */
static const char *stackhdr = "[STACK TRACE]:\n";
/* Local functions */
static void opal_sos_error_construct(opal_sos_error_t *obj);
static void opal_sos_error_destruct(opal_sos_error_t *obj);
/** OPAL SOS callback function pointers */
/* Current print callback; only stored by opal_sos_reg_print_callback() in this file */
static opal_sos_print_callback_fn_t cur_print_callback;
/* Current reporter callback, invoked by opal_sos_report_error() */
static opal_sos_reporter_callback_fn_t cur_reporter_callback;
/* static opal_sos_print_callback_fn_t prev_print_callback; */
/* Reporter callback that was installed before opal_sos_init() registered
   opal_sos_print_error; also invoked by opal_sos_report_error() when non-NULL */
static opal_sos_reporter_callback_fn_t prev_reporter_callback;
/* Class definition: opal_sos_error_t derives from opal_object_t and uses
   the constructor/destructor defined below */
OBJ_CLASS_INSTANCE(opal_sos_error_t,
opal_object_t,
opal_sos_error_construct,
opal_sos_error_destruct);
/**
 * Constructor: put a new SOS error object into its empty state.
 *
 * @param obj  Error object being constructed
 */
static void opal_sos_error_construct(opal_sos_error_t *obj)
{
    /* No message or source location has been recorded yet */
    obj->msg = NULL;
    obj->func = NULL;
    obj->file = NULL;
    obj->line = 0;
    obj->errnum = 0;

    /* The object is not linked into any error history */
    obj->next = OPAL_SOS_ERR_BASE;
    obj->prev = OPAL_SOS_ERR_BASE;
}
/**
 * Destructor: release the strings owned by an SOS error object.
 *
 * The pointers are reset to NULL after being freed so that a second
 * destructor run on the same object (or a stray read of a destructed
 * object) cannot double-free or touch released memory.
 *
 * @param obj  Error object being destructed
 */
static void opal_sos_error_destruct(opal_sos_error_t *obj)
{
    /* free(NULL) is a no-op, so no guards are needed */
    free(obj->file);
    obj->file = NULL;

    free(obj->func);
    obj->func = NULL;

    free(obj->msg);
    obj->msg = NULL;
}
/**
 * Initialize the OPAL SOS interface.
 *
 * Registers the "sos_print_low" MCA parameter, constructs the error
 * table and its lock, and installs opal_sos_print_error() as the
 * current reporter callback (the callback that was installed before
 * is kept in prev_reporter_callback). Calling this function again
 * while already initialized is a no-op.
 */
void opal_sos_init(void)
{
    /* Start from the registered default so that a failed registration
       cannot leave us reading an indeterminate value */
    int value = (int) false;

    if (opal_sos_initialized) {
        return;
    }

    mca_base_param_reg_int_name("opal", "sos_print_low",
                                "Set to non-zero to enable the print-at-bottom"
                                " preference for OPAL SOS. Enabling this option prints"
                                " out the errors, warnings or info messages as"
                                " soon as they are encountered.",
                                false, false, (int)false, &value);
    opal_sos_print_low = OPAL_INT_TO_BOOL(value);

    OBJ_CONSTRUCT(&opal_sos_table, opal_hash_table_t);
    opal_hash_table_init(&opal_sos_table, OPAL_SOS_ERR_TABLE_SIZE);
    OBJ_CONSTRUCT(&opal_sos_table_lock, opal_mutex_t);

    opal_sos_reg_reporter_callback(opal_sos_print_error, &prev_reporter_callback);

    opal_sos_initialized = true;
}
/**
 * Finalize the OPAL SOS interface.
 *
 * Destructs the error table and its lock. Does nothing if OPAL SOS
 * was never initialized (or has already been finalized), because
 * destructing objects that were not constructed is invalid.
 *
 * NOTE(review): error objects still stored in the table are not
 * released here; callers are expected to have freed them with
 * opal_sos_free() -- confirm whether a table sweep is wanted.
 */
void opal_sos_finalize(void)
{
    if (!opal_sos_initialized) {
        return;
    }

    OBJ_DESTRUCT(&opal_sos_table);
    OBJ_DESTRUCT(&opal_sos_table_lock);
    opal_sos_initialized = false;
}
/**
 * Free all the SOS errors represented by the error code pointed to by
 * \c errnum.
 *
 * Walks the history of the encoded error (following the \c prev
 * links), removes every visited entry from the SOS table and releases
 * it. If the code carries an attached error index, the attached error
 * history is freed first. On return, \c *errnum holds the native
 * error code that was encoded in the original value.
 *
 * Nothing is done when \c errnum is NULL or already a native code.
 *
 * @param errnum  Pointer to the encoded error code (in/out)
 */
void opal_sos_free(int *errnum)
{
    opal_sos_error_t *opal_error, *attached_error;
    int err, prev, rc, attached_errnum;

    if (NULL == errnum || OPAL_SOS_IS_NATIVE(*errnum)) {
        return;
    }

    err = *errnum;
    *errnum = OPAL_SOS_GET_ERROR_CODE(err);

    do {
        /* Look for attached errors */
        attached_errnum = OPAL_SOS_GET_ATTACHED_INDEX(err);
        if (0 != attached_errnum) {
            attached_error = NULL;
            OPAL_THREAD_LOCK(&opal_sos_table_lock);
            rc = opal_hash_table_get_value_uint32(&opal_sos_table,
                                                  attached_errnum,
                                                  (void **)&attached_error);
            OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
            if (OPAL_SUCCESS != rc) {
                return;
            }

            /* If there's an attached error trace, free it! The lock is
               not held here because the recursive call takes it again. */
            if (NULL != attached_error) {
                attached_errnum = attached_error->errnum;
                opal_sos_free(&attached_errnum);
            }
        }

        /* Look up and unlink the entry under a single lock acquisition,
           so every lock has exactly one matching unlock and no other
           thread can find the entry between lookup and removal. */
        opal_error = NULL;
        OPAL_THREAD_LOCK(&opal_sos_table_lock);
        rc = opal_hash_table_get_value_uint32(&opal_sos_table,
                                              OPAL_SOS_GET_INDEX(err),
                                              (void **)&opal_error);
        if (OPAL_SUCCESS == rc && NULL != opal_error) {
            opal_hash_table_remove_value_uint32(&opal_sos_table,
                                                OPAL_SOS_GET_INDEX(err));
        }
        OPAL_THREAD_UNLOCK(&opal_sos_table_lock);

        if (OPAL_SUCCESS != rc || NULL == opal_error) {
            return;
        }

        /* Remember the link before the object goes away, then release
           the object itself (not just the strings it owns). */
        prev = opal_error->prev;
        OBJ_RELEASE(opal_error);
        err = prev;
    } while (OPAL_SOS_ERR_BASE != err);
}
/**
 * Build an SOS error object from an error code and a printf-style
 * message.
 *
 * Only the message and the encoded native error code (plus the link
 * to the previous error, when \c errnum is itself an encoded error)
 * are filled in; file, line, function, severity and table index are
 * left to opal_sos_reporter(). OPAL SOS is initialized on demand.
 *
 * @param errnum      Native error code, or an encoded error code whose
 *                    history this new error extends
 * @param show_stack  If true, append the current stack trace (when one
 *                    can be obtained) to the message
 * @param errmsg      printf-style format string for the message
 *
 * @return The new error object, or NULL if memory could not be
 *         allocated
 */
opal_sos_error_t *
opal_sos_build_error(int errnum, bool show_stack, const char *errmsg, ...)
{
    opal_sos_error_t *opal_error;
    char *stackframe, msg[OPAL_SOS_MAX_ERR_LEN];
    va_list arglist;
    int ret_errno = 0, len = 0;

    if (!opal_sos_initialized) {
        opal_sos_init();
    }

    opal_error = OBJ_NEW(opal_sos_error_t);
    if (NULL == opal_error) {
        return NULL; /* OPAL_ERR_OUT_OF_RESOURCE */
    }

    /* Format the message; a NULL format or an output error yields an
       empty message instead of an undefined buffer */
    msg[0] = '\0';
    if (NULL != errmsg) {
        va_start(arglist, errmsg);
        len = vsnprintf(msg, sizeof(msg), errmsg, arglist);
        va_end(arglist);
    }
    if (len < 0) {
        msg[0] = '\0';
        len = 0;
    } else if (len >= OPAL_SOS_MAX_ERR_LEN) {
        /* vsnprintf reports the untruncated length */
        len = OPAL_SOS_MAX_ERR_LEN - 1;
    }

    if ((true == show_stack) &&
        (NULL != (stackframe = opal_stackframe_output_string()))) {
        /* message + '\n' + header + trace + terminating NUL */
        len += strlen(stackhdr) + strlen(stackframe) + 2;
        if (len > OPAL_SOS_MAX_ERR_LEN) {
            len = OPAL_SOS_MAX_ERR_LEN;
        }

        opal_error->msg = (char *) malloc(len);
        if (NULL != opal_error->msg) {
            snprintf(opal_error->msg, len, "%s\n%s%s", msg, stackhdr, stackframe);
        }
        /* NOTE(review): stackframe is not freed here; confirm whether
           opal_stackframe_output_string() hands ownership to the caller */
    } else {
        opal_error->msg = strdup(msg);
    }

    if (NULL == opal_error->msg) {
        /* Do not leak the half-built object */
        OBJ_RELEASE(opal_error);
        return NULL;
    }

    /* Check if errnum is a native error code and encode it into
       the encoded error code if it is native */
    if (OPAL_SOS_IS_NATIVE(errnum)) {
        OPAL_SOS_SET_ERROR_CODE(ret_errno, errnum);
    } else {
        /* Extract the native error code from the encoded error and
           encode it back again into the newly encoded error code */
        OPAL_SOS_SET_ERROR_CODE(ret_errno, OPAL_SOS_GET_ERROR_CODE(errnum));
        opal_error->prev = errnum;
    }
    opal_error->errnum = ret_errno;

    return opal_error;
}
/**
 * Record an error object in the SOS table and return its encoded
 * error code.
 *
 * Completes the object built by opal_sos_build_error() with the source
 * location, encodes the severity and the table index into the error
 * code, optionally reports the error right away (when
 * opal_sos_print_low is set), stores the object in the SOS table and
 * links it to the previous error of its history.
 *
 * @param file        Source file of the report (may be NULL)
 * @param line        Source line of the report
 * @param func        Function name of the report (may be NULL)
 * @param severity    Severity to encode into the returned code
 * @param opal_error  Error object from opal_sos_build_error()
 *
 * @return The encoded error code; OPAL_ERR_OUT_OF_RESOURCE if
 *         \c opal_error is NULL; OPAL_ERROR if the object could not
 *         be stored (the object is released in that case)
 */
int opal_sos_reporter(const char *file, int line, const char *func,
                      opal_sos_severity_t severity, opal_sos_error_t *opal_error)
{
    opal_sos_error_t *prev_error;
    int ret_errno = 0, hash;

    if (NULL == opal_error) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Doing more strict validation here since if either of the file,
     * func or msg are not known we replace it by <unknown> to avoid any issues
     * during dss pack/unpack
     */
    opal_error->file = (NULL != file) ? strdup(file) : strdup("<unknown>");
    opal_error->func = (NULL != func) ? strdup(func) : strdup("<unknown>");
    opal_error->line = line;

    ret_errno = opal_error->errnum;
    /* Encode the severity level into the return error code */
    OPAL_SOS_SET_SEVERITY(ret_errno, severity);
    hash = opal_sos_hash_error(opal_error);
    OPAL_SOS_SET_INDEX(ret_errno, hash);
    opal_error->errnum = ret_errno;

    if (opal_sos_print_low) {
        opal_sos_report_error(opal_error);
    }

    /* Add the error object to the error table */
    OPAL_THREAD_LOCK(&opal_sos_table_lock);
    if (OPAL_SUCCESS !=
        opal_hash_table_set_value_uint32(&opal_sos_table,
                                         OPAL_SOS_GET_INDEX(ret_errno),
                                         (void *)opal_error)) {
        OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
        /* The object came from OBJ_NEW, so it must be released (not
           merely destructed) or its memory is leaked */
        OBJ_RELEASE(opal_error);
        return OPAL_ERROR;
    }

    /* Get the previous error in the error call stack and update
       its next error pointer. Skip the lookup when there is no
       previous error: index 0 would otherwise be looked up and an
       unrelated entry could get relinked. */
    if (OPAL_SOS_ERR_BASE != opal_error->prev) {
        prev_error = NULL;
        opal_hash_table_get_value_uint32(&opal_sos_table,
                                         OPAL_SOS_GET_INDEX(opal_error->prev),
                                         (void **)&prev_error);
        if (NULL != prev_error) {
            prev_error->next = opal_error->errnum;
        }
    }
    OPAL_THREAD_UNLOCK(&opal_sos_table_lock);

    return ret_errno;
}
/**
 * Report a single SOS error through the registered reporter callbacks.
 *
 * The current reporter callback receives a prettified copy of the
 * message (or the raw message if prettifying failed); the previously
 * registered reporter callback, if any, receives the raw message.
 *
 * @param error  Error object to report; NULL is ignored
 */
void
opal_sos_report_error(opal_sos_error_t *error)
{
    int severity, errnum, ret;
    char *pretty_error = NULL;

    if (NULL == error) {
        return;
    }

    severity = OPAL_SOS_GET_SEVERITY(error->errnum);

    /* An OPAL SOS encoded error number holds no meaning outside
     * the context of Open MPI. We convert it back to the native
     * error code before reporting it. */
    if (true == OPAL_SOS_IS_NATIVE(error->errnum)) {
        errnum = error->errnum;
    } else {
        errnum = OPAL_SOS_GET_ERROR_CODE(error->errnum);
    }

    /* Prettify the error for printing it locally */
    ret = opal_sos_prettify_error(error->msg, &pretty_error);
    if (NULL != cur_reporter_callback) {
        (*cur_reporter_callback)(severity, errnum, "<%s> at %s:%d:%s():\n%s",
                                 opal_sos_severity2str(severity), error->file,
                                 error->line, error->func,
                                 ((0 > ret || NULL == pretty_error) ?
                                  error->msg : pretty_error));
    }

    /* pretty_error starts out NULL, so this also covers the case of an
       empty message (ret == 0), which used to leak the buffer */
    free(pretty_error);

    /* Call the previous reporter callback which should be the selected
     * ORTE notifier components */
    if (NULL != prev_reporter_callback) {
        prev_reporter_callback(severity, errnum, "<%s> at %s:%d:%s():\n%s",
                               opal_sos_severity2str(severity), error->file,
                               error->line, error->func, error->msg);
    }
}
/**
 * Print the error encoded by \c errnum, walking back through its
 * history via the \c prev links.
 *
 * Attached error histories are printed first (recursively). An error
 * in the history is reported when \c show_history is set or when its
 * severity value is less than or equal to that of the error preceding
 * it in the history; the oldest error is always reported. The output
 * is framed by the "msg header" help topic. If a table lookup fails,
 * printing stops without emitting the closing frame.
 *
 * @param errnum        Encoded error code
 * @param show_history  Report every error in the history
 */
void opal_sos_print(int errnum, bool show_history)
{
    opal_sos_error_t *current, *attached, *newer = NULL;
    int attached_idx, newer_severity, current_severity;

    opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line);

    for (;;) {
        /* If there is an error attached to this error, print it out. */
        attached_idx = OPAL_SOS_GET_ATTACHED_INDEX(errnum);
        if (0 != attached_idx) {
            OPAL_THREAD_LOCK(&opal_sos_table_lock);
            if (OPAL_SUCCESS != opal_hash_table_get_value_uint32(&opal_sos_table,
                                                                 attached_idx,
                                                                 (void **)&attached)) {
                OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
                return;
            }
            OPAL_THREAD_UNLOCK(&opal_sos_table_lock);

            if (NULL != attached) {
                opal_sos_print(attached->errnum, show_history);
            }
        }

        /* Fetch the error object for the code we are currently at */
        OPAL_THREAD_LOCK(&opal_sos_table_lock);
        if (OPAL_SUCCESS !=
            opal_hash_table_get_value_uint32(&opal_sos_table,
                                             OPAL_SOS_GET_INDEX(errnum),
                                             (void **)&current)) {
            OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
            return;
        }
        OPAL_THREAD_UNLOCK(&opal_sos_table_lock);

        if (NULL == current) {
            return;
        }

        if (NULL != newer) {
            newer_severity = OPAL_SOS_GET_SEVERITY(newer->errnum);
            current_severity = OPAL_SOS_GET_SEVERITY(errnum);

            /* Report the error visited in the previous iteration if the
               whole history was requested or the severity check passes */
            if (show_history || (newer_severity <= current_severity)) {
                opal_sos_report_error(newer);
            }
        }

        newer = current;

        /* Step to the previous error of the history */
        errnum = current->prev;

        /* End of the history: the oldest error is always reported */
        if (OPAL_SOS_ERR_BASE == errnum) {
            opal_sos_report_error(current);
            break;
        }
    }

    opal_show_help("opal_sos_reporter.txt", "msg header", false, dash_line);
}
/**
 * Default reporter callback: print one error through the show-help
 * subsystem.
 *
 * The variadic arguments are handed to the "general message" topic of
 * opal_sos_reporter.txt; \c errmsg itself only anchors the argument
 * list and is not used as the format.
 *
 * @param severity  Severity of the error (unused here)
 * @param errnum    Native error code (unused here)
 * @param errmsg    Format string supplied by the caller (see above)
 */
void opal_sos_print_error(opal_sos_severity_t severity, int errnum, const char *errmsg, ...)
{
    va_list help_args;

    (void) severity;
    (void) errnum;

    va_start(help_args, errmsg);
    opal_show_vhelp("opal_sos_reporter.txt", "general message", false, help_args);
    va_end(help_args);
}
/**
 * Log an encoded error: print it (without the full history) and then
 * free every error object associated with it.
 *
 * This function is part of the exported interface, so it is defined as
 * a plain external function; an `inline` specifier here would be
 * misleading and behaves differently between C dialects.
 *
 * @param errnum  Encoded error code
 */
void opal_sos_log(int errnum)
{
    opal_sos_print(errnum, false);
    opal_sos_free(&errnum);
}
int opal_sos_prettify_error(const char *error, char **pretty_error)
{
char *str, *token, *saveptr, *errdup;
const char *prefix = "\n| | ";
int len, plen, left;
if (NULL == error) {
return OPAL_ERROR;
}
*pretty_error = (char *) malloc(OPAL_SOS_MAX_ERR_LEN);
if (NULL == *pretty_error) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
*(*pretty_error) = '\0';
plen = strlen(prefix);
if (NULL != (errdup = strdup(error))) {
for (str = errdup, len = 0; len < OPAL_SOS_MAX_ERR_LEN; str = NULL) {
if (NULL == (token = strtok_r(str, "\n", &saveptr))) {
break;
}
left = strlen(token);
if ((len + left) > OPAL_SOS_MAX_ERR_LEN) {
left = OPAL_SOS_MAX_ERR_LEN - len;
}
strncat(*pretty_error, token, left);
len += left;
left = plen;
if ((len + left) > OPAL_SOS_MAX_ERR_LEN) {
left = OPAL_SOS_MAX_ERR_LEN - len;
}
strncat(*pretty_error, prefix, left);
len += left;
}
free(errdup);
errdup = NULL;
}
return len;
}
/**
 * Map an SOS severity level to its printable name.
 *
 * @param severity  Severity level
 *
 * @return A constant string naming the level ("UNKNOWN ERROR" for an
 *         unrecognized value); it must not be freed
 */
const char *opal_sos_severity2str(opal_sos_severity_t severity)
{
    const char *label;

    switch (severity) {
    case OPAL_SOS_SEVERITY_EMERG:
        label = "EMERGENCY";
        break;
    case OPAL_SOS_SEVERITY_ALERT:
        label = "ALERT MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_CRIT:
        label = "CRITICAL MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_ERROR:
        label = "ERROR";
        break;
    case OPAL_SOS_SEVERITY_WARN:
        label = "WARNING";
        break;
    case OPAL_SOS_SEVERITY_NOTICE:
        label = "NOTICE";
        break;
    case OPAL_SOS_SEVERITY_INFO:
        label = "INFO MESSAGE";
        break;
    case OPAL_SOS_SEVERITY_DEBUG:
        label = "DEBUG MESSAGE";
        break;
    default:
        label = "UNKNOWN ERROR";
        break;
    }

    return label;
}
/**
 * Compute the SOS table index for an error object.
 *
 * Naive hash: the encoded error number, plus the lengths of the file
 * and function names, plus the sum of the characters of the message,
 * reduced to the range [0, OPAL_SOS_ERR_TABLE_SIZE - 1] by masking.
 *
 * @param error  Error object to hash
 *
 * @return Index into the SOS error table
 */
int opal_sos_hash_error(opal_sos_error_t *error)
{
    const char *cursor;
    int key;

    key = error->errnum;

    if (NULL != error->file) {
        key += strlen(error->file);
    }

    if (NULL != error->func) {
        key += strlen(error->func);
    }

    if (NULL != error->msg) {
        for (cursor = error->msg; '\0' != *cursor; ++cursor) {
            key += *cursor;
        }
    }

    return key & (OPAL_SOS_ERR_TABLE_SIZE - 1);
}
/**
 * Install a new print callback.
 *
 * @param new_func   Callback to install
 * @param prev_func  If non-NULL, receives the callback that was
 *                   installed before; pass NULL if it is not needed
 *
 * @return OPAL_SUCCESS
 */
int opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func,
                                opal_sos_print_callback_fn_t *prev_func)
{
    /* Preserve the previous print callback */
    if (NULL != prev_func) {
        *prev_func = cur_print_callback;
    }

    /* Update the current print callback */
    cur_print_callback = new_func;

    return OPAL_SUCCESS;
}
/**
 * Install a new reporter callback.
 *
 * @param new_func   Callback to install
 * @param prev_func  If non-NULL, receives the callback that was
 *                   installed before; pass NULL if it is not needed
 *
 * @return OPAL_SUCCESS
 */
int opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func,
                                   opal_sos_reporter_callback_fn_t *prev_func)
{
    /* Preserve the previous reporter callback */
    if (NULL != prev_func) {
        *prev_func = cur_reporter_callback;
    }

    /* Update the current reporter callback */
    cur_reporter_callback = new_func;

    return OPAL_SUCCESS;
}

448
opal/util/opal_sos.h Обычный файл
Просмотреть файл

@ -0,0 +1,448 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_SOS_H
#define OPAL_SOS_H
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#ifdef HAVE_SYSLOG_H
#include <syslog.h>
#endif
#include "opal/class/opal_object.h"
#include "opal/class/opal_hash_table.h"
#include "opal/threads/mutex.h"
#include "opal/util/output.h"
/*
 * OPAL_SOS_FUNCTION expands to the name of the enclosing function:
 * __func__ where C99 (or an unspecified standard version) is in use,
 * __FUNCTION__ on pre-C99 GNU compilers, and the literal "<unknown>"
 * when neither is available.
 */
#ifdef __STDC_VERSION__
#  if __STDC_VERSION__ < 199901L
#    if defined(__GNUC__) && (__GNUC__ >= 2)
#      define OPAL_SOS_FUNCTION __FUNCTION__
#    else
#      define OPAL_SOS_FUNCTION "<unknown>"
#    endif
#  else
#    define OPAL_SOS_FUNCTION __func__
#  endif
#else
#  define OPAL_SOS_FUNCTION __func__
#endif
/* Internal use only: value used in the prev/next links of an error
 * object to mean "no further error in this history". */
#define OPAL_SOS_ERR_BASE OPAL_SUCCESS
/**
 * Size of the OPAL SOS error table.
 *
 * Since the index into the error table that is encoded in the error
 * code is 9 bits long, setting a higher value than (1 << 9) would make
 * no difference at all. opal_sos_hash_error() reduces its hash with
 * (OPAL_SOS_ERR_TABLE_SIZE - 1) as a bit mask, so this value is
 * expected to be a power of two.
 */
#define OPAL_SOS_ERR_TABLE_SIZE 512
/**
 * Maximum length for the error string stored per error code in the
 * OPAL SOS error table.
 */
#define OPAL_SOS_MAX_ERR_LEN 1024
/**
 * Reports an error to the OPAL SOS reporter.
 *
 * Encodes an informational message with severity \c severity and
 * the arguments given in \c arg. \c arg is the complete, parenthesized
 * argument list of opal_sos_build_error(), i.e.
 * (errnum, show_stack, errmsg, ...). The macro also records the line
 * number, file name and function name where it is expanded, and
 * evaluates to the encoded error code returned by opal_sos_reporter().
 *
 * If the MCA parameter \c opal_sos_print_low is set, the error message
 * is reported immediately through the reporter callback (by default
 * the "show help" subsystem). By default, messages are only stored.
 * If \c show_stack is set, the stack trace is saved and/or printed
 * along with the corresponding \c errmsg.
 */
#define OPAL_SOS_REPORT(severity, arg) opal_sos_reporter(__FILE__, __LINE__, \
OPAL_SOS_FUNCTION, \
severity, \
opal_sos_build_error arg)
/**
 * Report an event with the maximum severity (EMERG).
 */
#define OPAL_SOS_EMERG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_EMERG, arg)
/**
 * Report an event of severity "ALERT".
 */
#define OPAL_SOS_ALERT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ALERT, arg)
/**
 * Report an event of severity "CRITICAL".
 */
#define OPAL_SOS_CRIT(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_CRIT, arg)
/**
 * Report an event of severity "ERROR".
 */
#define OPAL_SOS_ERROR(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_ERROR, arg)
/**
 * Report an event of severity "WARNING".
 *
 * This macro is similar to OPAL_SOS_INFO but reports with a higher
 * severity.
 */
#define OPAL_SOS_WARN(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_WARN, arg)
/**
 * Report an event of severity "NOTICE".
 */
#define OPAL_SOS_NOTICE(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_NOTICE,arg)
/**
 * Report an informational message (severity "INFO") to the OPAL SOS
 * framework. The event is stored in the SOS table like any other.
 */
#define OPAL_SOS_INFO(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_INFO, arg)
/**
 * Report a debug event (severity "DEBUG") to the SOS framework.
 */
#define OPAL_SOS_DEBUG(arg) OPAL_SOS_REPORT(OPAL_SOS_SEVERITY_DEBUG, arg)
/**
 * Frees all the (entire stack of) OPAL SOS error objects associated
 * with the encoded error code obtained after dereferencing the
 * pointer \c perrnum. The pointed-to value is replaced by the native
 * error code.
 */
#define OPAL_SOS_FREE(perrnum) opal_sos_free(perrnum)
/**
 * Print the warnings/errors/informational messages previously logged
 * in to the SOS framework.
 *
 * This macro prints the error details encoded by \c errnum.
 * If \c show_history is true, the entire history for the error
 * represented by \c errnum is printed on the output stream.
 */
#define OPAL_SOS_PRINT(errnum, show_history) \
opal_sos_print(errnum, show_history)
/*
 * Layout of an encoded error code. All fields live in the NEGATED
 * value of the code (native error codes are negative):
 *
 *   bits  0- 9 (0x000003FF)  native error code
 *   bits 10-18 (0x0007FC00)  index into the SOS error table
 *   bits 19-27 (0x0FF80000)  index of the attached error (0 = none)
 *   bits 28-30 (0x70000000)  severity
 *
 * Every macro argument is parenthesized so that expressions such as
 * "a - b" can be passed safely. The SET/ATTACH macros assign to their
 * first argument, which must therefore be a modifiable int lvalue.
 */
/**
 * Attach the history from one error code to another error code.
 * Stores the table index of \c errnum as the "attached index" of
 * \c errtgt and evaluates to the updated \c errtgt.
 */
#define OPAL_SOS_ATTACH(errtgt, errnum)                                  \
    ((errtgt) = -((-(errtgt) & ~0xFF80000L) |                             \
                  ((OPAL_SOS_GET_INDEX(errnum) & 0x1FFL) * 0x80000L)))
/**
 * Returns the index of the error attached to errnum using OPAL_SOS_ATTACH().
 * A value of 0 means that no error is attached.
 */
#define OPAL_SOS_GET_ATTACHED_INDEX(errnum) ((int) ((-(errnum) & 0xFF80000L) >> 19))
/**
 * Returns the native error code for the given encoded error code \c
 * errnum. \c errnum can be a native error code itself.
 */
#define OPAL_SOS_GET_ERROR_CODE(errnum) ((int) -(-(errnum) & 0x3FFL))
/**
 * Sets the native error code for the potentially encoded error code.
 *
 * The lower 10 bits are reserved for the native error code. This
 * macro sets the lower 10 bits of errnum to nativeerr.
 */
#define OPAL_SOS_SET_ERROR_CODE(errnum, nativeerr)                        \
    ((errnum) = -((-(errnum) & ~0x3FFL) | (-(nativeerr) & 0x3FFL)))
/**
 * Macro to check if the error encoded by \c errnum is a native error
 * or an OPAL SOS encoded error. Evaluates to non-zero for a native
 * code, i.e. when nothing is set above the lower 10 bits.
 */
#define OPAL_SOS_IS_NATIVE(errnum) ((-(errnum) & ~0x3FFL) == 0)
/**
 * Returns the severity level for the potentially encoded error code.
 *
 * The severity is encoded in bits 28-30.
 */
#define OPAL_SOS_GET_SEVERITY(errnum) ((int)((-(errnum) >> 28) & 0x7L))
/**
 * Sets the severity level for the given error code \c errnum.
 *
 * This macro does not do strict error checking of the specified
 * severity level; only its lower three bits are used.
 */
#define OPAL_SOS_SET_SEVERITY(errnum, severity)                           \
    ((errnum) = -((-(errnum) & ~0x70000000L) | (((severity) & 0x7L) * 0x10000000L)))
/**
 * Macro to get the encoded error severity level as a string.
 *
 * This macro accepts the argument \c severity and calls the corresponding
 * function opal_sos_severity2str to convert it to a string. The result
 * is a constant string that should not be freed with free().
 */
#define OPAL_SOS_SEVERITY2STR(severity) opal_sos_severity2str(severity)
/**
 * Log an encoded error \c errnum.
 *
 * This macro prints out and consequently frees the entire stack of
 * errors associated with the \c errnum.
 */
#define OPAL_SOS_LOG(errnum) opal_sos_log(errnum)
/**
 * \internal
 * Returns the index into the error table of the error encoded by \c errnum.
 *
 * The index is 9 bits long and stored in bits 10-18 of the encoded
 * error code.
 */
#define OPAL_SOS_GET_INDEX(errnum) ((int)((-(errnum) & 0x7FC00L) >> 10))
/**
 * \internal
 * Sets the index into the error table for the error encoded by \c errnum.
 * Only the lower 9 bits of \c index are used.
 */
#define OPAL_SOS_SET_INDEX(errnum, index)                                 \
    ((errnum) = -((-(errnum) & ~0x7FC00L) | (((index) & 0x1FFL) * 0x400L)))
BEGIN_C_DECLS
/** Set from the MCA parameter sos_print_low: non-zero enables the
 * print-at-bottom preference for OPAL SOS, i.e. errors are reported
 * as soon as they are recorded. */
OPAL_DECLSPEC extern bool opal_sos_print_low;
/* Severity levels for OPAL SOS. The values are the LOG_* priority
 * constants (expected to come from <syslog.h>, which this header
 * includes when HAVE_SYSLOG_H is defined). */
typedef enum {
OPAL_SOS_SEVERITY_EMERG = LOG_EMERG,
OPAL_SOS_SEVERITY_ALERT = LOG_ALERT,
OPAL_SOS_SEVERITY_CRIT = LOG_CRIT,
OPAL_SOS_SEVERITY_ERROR = LOG_ERR,
OPAL_SOS_SEVERITY_WARN = LOG_WARNING,
OPAL_SOS_SEVERITY_NOTICE = LOG_NOTICE,
OPAL_SOS_SEVERITY_INFO = LOG_INFO,
OPAL_SOS_SEVERITY_DEBUG = LOG_DEBUG
} opal_sos_severity_t;
/** One recorded error event, stored in the OPAL SOS error table. */
typedef struct opal_sos_error_t {
/** Class parent */
opal_object_t super;
/**
 * The encoded error code for a given type of error.
 *
 * errnum encodes a native error code (lower 10 bits) together with
 * the severity (3 bits, see OPAL_SOS_GET_SEVERITY), the index into
 * the error table and the index of the attached error, if there is
 * one.
 */
int errnum;
/** File in which the error occurred (owned by this object) */
char *file;
/** Line number on which the error was encountered */
int line;
/** Function in which the error occurred (owned by this object) */
char *func;
/** The actual error message or string for the error indicated by
\c errnum (owned by this object) */
char *msg;
/** Encoded error numbers of the previous and the next error.
These are used to maintain the history of an error; the value
OPAL_SOS_ERR_BASE marks the end of the history.
The complete history of an error can be printed later using
OPAL_SOS_PRINT() */
int prev;
int next;
} opal_sos_error_t;
OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_sos_error_t);
/**
 * Signature for OPAL SOS print function callback type.
 *
 * @param errcode  Error code handed to the callback
 */
typedef void (*opal_sos_print_callback_fn_t) (int errcode);
/**
 * Signature for OPAL SOS reporter function callback type.
 *
 * @param severity  Severity of the reported error
 * @param errcode   Native error code of the reported error
 * @param msg       printf-style format string, followed by its arguments
 */
typedef void (*opal_sos_reporter_callback_fn_t) (opal_sos_severity_t severity, int errcode,
const char *msg, ...)
#if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 3, 4)
#endif
;
/**
 * The local OPAL SOS error table.
 * This is used by the notifier components to reference the local OPAL
 * SOS table, especially for packing/unpacking and sending it over to
 * the HNP.
 */
OPAL_DECLSPEC extern opal_hash_table_t opal_sos_table;
/**
 * The lock protecting the OPAL SOS table.
 */
OPAL_DECLSPEC extern opal_mutex_t opal_sos_table_lock;
/**
 * \internal
 *
 * Initialize OPAL SOS.
 *
 * This function initializes and sets up the structures required to
 * track the data handled by OPAL SOS. It is also called on demand by
 * opal_sos_build_error() when OPAL SOS has not been initialized yet.
 * NOTE(review): the original comment says it is invoked by
 * opal_util() -- confirm the caller's name.
 */
void opal_sos_init(void);
/**
 * \internal
 *
 * Shut down OPAL SOS.
 *
 * Invoked by opal_finalize() to deallocate the structures needed by
 * OPAL SOS.
 */
void opal_sos_finalize(void);
/**
 * Prints or relays the error locally or using the selected notifier
 * components, by invoking the registered reporter callbacks.
 *
 * @param error  Error object to report (NULL is ignored)
 */
void
opal_sos_report_error(opal_sos_error_t *error);
/**
 * Builds an OPAL SOS error object given the parameters errnum,
 * show_stack and errmsg.
 * NOTE: This function only partially populates the SOS error object
 * structure, setting the error message details but nothing about where
 * the error occurred. Filling up the rest of the error object is left
 * to OPAL SOS reporter which then handles the error appropriately.
 *
 * @param errnum      Native error code, or an encoded error code whose
 *                    history the new error extends
 * @param show_stack  Whether to append a stack trace to the message
 * @param errmsg      printf-style format string, followed by its arguments
 *
 * @return The new error object, or NULL on allocation failure
 */
OPAL_DECLSPEC opal_sos_error_t *
opal_sos_build_error(int errnum, bool show_stack,
const char *errmsg, ...)
#if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 3, 4)
#endif
;
/**
 * OPAL SOS reporter logs the error in the OPAL SOS error table or
 * prints it out depending on the associated reporter callback. It can
 * also relay the error messages to the selected notifier components
 * using the OPAL SOS reporter callback interface.
 *
 * @param file        Source file where the error is reported
 * @param line        Source line where the error is reported
 * @param func        Function in which the error is reported
 * @param severity    Severity to encode into the returned error code
 * @param opal_error  Error object built by opal_sos_build_error()
 *
 * @return encoded error code
 */
OPAL_DECLSPEC int opal_sos_reporter(const char *file, int line, const char *func,
opal_sos_severity_t severity,
opal_sos_error_t *opal_error);
/**
 * Prints the error encoded by the error number \c errnum
 *
 * @param errnum        Encoded error code
 * @param show_history  If true, print the entire history of the error
 *
 */
OPAL_DECLSPEC void opal_sos_print(int errnum, bool show_history);
/**
 * Produces a newly allocated copy of \c error formatted for printing
 * and stores it in \c *pretty_error.
 *
 * @param error         Message to format
 * @param pretty_error  Receives the formatted copy
 *
 * @return The length of the formatted string, or an OPAL error code
 *         (OPAL_ERROR, OPAL_ERR_OUT_OF_RESOURCE) on failure
 */
OPAL_DECLSPEC int opal_sos_prettify_error(const char *error, char **pretty_error);
/**
 * Prints a single error through the "show help" subsystem. This is the
 * reporter callback installed by opal_sos_init(); it has the
 * opal_sos_reporter_callback_fn_t signature.
 */
OPAL_DECLSPEC void opal_sos_print_error(opal_sos_severity_t severity,
int errnum, const char *errmsg, ...)
#if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR
__opal_attribute_format__(__printf__, 3, 4)
#endif
;
/**
 * Frees the error objects represented by the error code \c *errnum
 * and replaces \c *errnum by the native error code.
 */
OPAL_DECLSPEC void opal_sos_free(int *errnum);
/**
 * Logs (prints and frees) the error object represented by \c errnum.
 */
OPAL_DECLSPEC void opal_sos_log(int errnum);
/**
 * Returns the OPAL SOS severity level as a constant string.
 *
 */
const char *opal_sos_severity2str(opal_sos_severity_t severity);
/**
 * \internal
 * Return a key into the hash table (opal_sos_table) computed from the
 * error number, the file and function names and the message of the
 * error. Different errors may map to the same key.
 *
 */
int opal_sos_hash_error(opal_sos_error_t *error);
/**
 * Registers a print callback function for OPAL_SOS_PRINT().
 * The previously registered callback is returned in \c *prev_func.
 */
OPAL_DECLSPEC int
opal_sos_reg_print_callback(opal_sos_print_callback_fn_t new_func,
opal_sos_print_callback_fn_t *prev_func);
/**
 * Registers a reporter callback function for OPAL_SOS_INFO(),
 * OPAL_SOS_WARN() and OPAL_SOS_ERROR().
 * The previously registered callback is returned in \c *prev_func.
 */
OPAL_DECLSPEC int
opal_sos_reg_reporter_callback(opal_sos_reporter_callback_fn_t new_func,
opal_sos_reporter_callback_fn_t *prev_func);
END_C_DECLS
#endif /* OPAL_SOS_H */

36
opal/util/opal_sos_reporter.txt Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for OPAL SOS error messages.
#
# FORMAT:
# <severity> filename:linenum:functionname
# error message string
# stacktrace (optional)
[general message]
| |--<%s> at %s:%d:%s():
| | %s
#
[msg header]
| %s
#
# We marshall all the parameters into a single message when we
# relay it to the notifier.
[notifier message]
%s

172
test/util/opal_sos.c Обычный файл
Просмотреть файл

@ -0,0 +1,172 @@
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "ompi_config.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include <errno.h>
#include "support.h"
#include "opal/constants.h"
#include "opal/runtime/opal.h"
#include "opal/util/opal_sos.h"
#include "opal/util/show_help.h"
#include "opal/util/output.h"
#include "orte/runtime/runtime.h"
#include "orte/constants.h"
#include "ompi/constants.h"
static bool opal_sos_test(void);
/* Test driver: bring up OPAL and the test harness, run the OPAL SOS
 * macro test, then shut both down again. */
int
main(int argc, char *argv[])
{
    opal_init(&argc, &argv);
    test_init("opal_sos test suite");

    /* Individual checks are recorded through test_verify() */
    (void) opal_sos_test();

    opal_finalize();
    test_finalize();

    return 0;
}
/** OPAL_SOS_* macro test */
static bool opal_sos_test(void)
{
int errnum1 = 0, errnum2 = 0;
char *err_str;
/* Checking for the correctness of GET_ and SET_ error code
* operations */
errnum1 = OPAL_SOS_GET_ERROR_CODE(OMPI_ERR_OUT_OF_RESOURCE);
test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE == errnum1);
OPAL_SOS_SET_ERROR_CODE(errnum1, OMPI_ERR_IN_ERRNO);
test_verify("failed", OMPI_ERR_IN_ERRNO ==
OPAL_SOS_GET_ERROR_CODE(errnum1));
/* Check if OMPI_ERR_OUT_OF_RESOURCE is a native error code or
* not. Since OMPI_ERR_OUT_OF_RESOURCE is native, this should
* return true. */
test_verify("failed", true ==
OPAL_SOS_IS_NATIVE(OMPI_ERR_OUT_OF_RESOURCE));
test_verify("failed", true == OPAL_SOS_IS_NATIVE(errnum1));
/* Encode a native error (OMPI_ERR_OUT_OF_RESOURCE) by
* logging it in the SOS framework using one of the SOS
* reporter macros. This returns an encoded error code
* (errnum1) with information about the native error such
* as the severity, the native error code, the attached
* error index etc. */
errnum1 = OPAL_SOS_INFO((OMPI_ERR_OUT_OF_RESOURCE, false,
"Error %d: out of resource",
OMPI_ERR_OUT_OF_RESOURCE));
/* Check if errnum1 is native or not. This should return false */
test_verify("failed", false == OPAL_SOS_IS_NATIVE(errnum1));
test_verify("failed",
OPAL_SOS_SEVERITY_INFO == OPAL_SOS_GET_SEVERITY(errnum1));
/* Extract the native error code out of errnum1. This should
* return the encoded native error code associated with errnum1
* (i.e. OMPI_ERR_OUT_OF_RESOURCE). */
test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE ==
OPAL_SOS_GET_ERROR_CODE(errnum1));
/* We log another error event as a child of the previous error
* errnum1. In the process, we decide to raise the severity
* level from INFO to WARN. */
err_str = opal_output_string(0, 0, "my error string -100");
errnum1 = OPAL_SOS_WARN((errnum1, false, err_str));
test_verify("failed",
OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum1));
test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE ==
OPAL_SOS_GET_ERROR_CODE(errnum1));
free(err_str);
/* Let's report another event with severity ERROR using
* OPAL_SOS_ERROR() and in effect promote errnum1 to
* severity 'ERROR'. */
err_str = opal_show_help_string("help-opal-util.txt",
"stacktrace signal override",
false, 10, 10, 10, "15");
errnum1 = OPAL_SOS_ERROR((errnum1, false, err_str));
test_verify("failed",
OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum1));
free(err_str);
/* Check the native code associated with the previously encoded
* error. This should still return (OMPI_ERR_OUT_OF_RESOURCE)
* since the entire error history originates from the native
* error OMPI_ERR_OUT_OF_RESOURCE */
test_verify("failed", OMPI_ERR_OUT_OF_RESOURCE ==
OPAL_SOS_GET_ERROR_CODE(errnum1));
/* We start off another error history stack originating with a
* native error, ORTE_ERR_FATAL. */
asprintf(&err_str, "Fatal error occurred in ORTE %d", errnum1);
errnum2 = OPAL_SOS_ERROR((ORTE_ERR_FATAL, true, err_str));
free(err_str);
test_verify("failed",
OPAL_SOS_SEVERITY_ERROR == OPAL_SOS_GET_SEVERITY(errnum2));
test_verify("failed", OMPI_ERR_FATAL ==
OPAL_SOS_GET_ERROR_CODE(errnum2));
/* Registering another error with severity ERROR.
* There is no change in the severity */
errnum2 = OPAL_SOS_WARN((errnum2, false, "this process must die."));
test_verify("failed",
OPAL_SOS_SEVERITY_WARN == OPAL_SOS_GET_SEVERITY(errnum2));
test_verify("failed", OMPI_ERR_FATAL ==
OPAL_SOS_GET_ERROR_CODE(errnum2));
/* We attach the two error traces originating from errnum1
* and errnum2. The "attached error index" in errnum1 is
* set to errnum2 to indicate that the two error stacks
* are forked down from this point on. */
OPAL_SOS_ATTACH(errnum1, errnum2);
/* Print out the entire error event history originating from errnum1 */
#if 0
printf("<------ BEGIN output of OPAL SOS error message ------->\n");
OPAL_SOS_PRINT(errnum1, true);
printf("<------ END output of OPAL SOS error message ------->\n");
#endif
test_success();
/* Cleanup */
OPAL_SOS_FREE(&errnum1);
OPAL_SOS_FREE(&errnum2);
return true;
}