1
1

Update the notifier framework in prep for move to v1.3. Add an API to handle the case where error messages have been expressed via "show_help" so they can look similar to what was presented to users. Add three key calls in the openib btl to drop messages into syslog.

This will sit in trunk for a few days - would like to actually see some errors reported to syslog before moving the code to 1.3

This commit was SVN r19986.
Этот коммит содержится в:
Ralph Castain 2008-11-12 18:03:51 +00:00
родитель a48b2d45be
Коммит ce26e3a2fb
3 изменённых файлов: 69 добавлений и 30 удалений

Просмотреть файл

@ -64,6 +64,7 @@ const char *ibv_get_sysfs_path(void);
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/notifier.h"
#include "ompi/proc/proc.h" #include "ompi/proc/proc.h"
#include "ompi/mca/pml/pml.h" #include "ompi/mca/pml/pml.h"
@ -2842,6 +2843,23 @@ error:
"status number %d for wr_id %llu opcode %d qp_idx %d", "status number %d for wr_id %llu opcode %d qp_idx %d",
cq_name[cq], btl_openib_component_status_to_string(wc->status), cq_name[cq], btl_openib_component_status_to_string(wc->status),
wc->status, wc->wr_id, wc->opcode, qp)); wc->status, wc->wr_id, wc->opcode, qp));
if (NULL == remote_proc) {
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error "
"communicating to unknown proc/node:\n\tpolling %s with status %s "
"status number %d for wr_id %llu opcode %d qp_idx %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
cq_name[cq], btl_openib_component_status_to_string(wc->status),
wc->status, wc->wr_id, wc->opcode, qp);
} else {
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error while "
"communicating to proc %s on node %s:\n\tpolling %s with status %s "
"status number %d for wr_id %llu opcode %d qp_idx %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
ORTE_NAME_PRINT(&remote_proc->proc_name),
(NULL == remote_proc->proc_hostname) ? "UNKNOWN" : remote_proc->proc_hostname,
cq_name[cq], btl_openib_component_status_to_string(wc->status),
wc->status, wc->wr_id, wc->opcode, qp);
}
} }
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status || if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
@ -2860,11 +2878,23 @@ error:
"srq rnr retry exceeded", true, "srq rnr retry exceeded", true,
orte_process_info.nodename, device_name, orte_process_info.nodename, device_name,
peer_hostname); peer_hostname);
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
"help-mpi-btl-openib.txt",
BTL_OPENIB_QP_TYPE_PP(qp) ?
"pp rnr retry exceeded" :
"srq rnr retry exceeded",
orte_process_info.nodename, device_name,
peer_hostname);
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) { } else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
orte_show_help("help-mpi-btl-openib.txt", orte_show_help("help-mpi-btl-openib.txt",
"pp retry exceeded", true, "pp retry exceeded", true,
orte_process_info.nodename, orte_process_info.nodename,
device_name, peer_hostname); device_name, peer_hostname);
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
"help-mpi-btl-openib.txt",
"pp retry exceeded",
orte_process_info.nodename,
device_name, peer_hostname);
} }
} }

Просмотреть файл

@ -40,34 +40,38 @@
#include <syslog.h> #include <syslog.h>
#endif /* HAVE_SYSLOG_H */ #endif /* HAVE_SYSLOG_H */
#ifdef HAVE_STDARG_H
#include <stdarg.h>
#endif /* HAVE_STDARG_H */
#include "opal/mca/mca.h" #include "opal/mca/mca.h"
BEGIN_C_DECLS BEGIN_C_DECLS
/* define priorities - this will eventually be replaced by OPAL_SOS priorities */
#define ORTE_NOTIFIER_INFRA LOG_CRIT
#define ORTE_NOTIFIER_WARNING LOG_WARNING
/* /*
* Component functions - all MUST be provided! * Component functions - all MUST be provided!
*/ */
/* initialize the selected module */ /* initialize the selected module */
typedef int (*orte_notifier_base_module_init_fn_t)(void); typedef int (*orte_notifier_base_module_init_fn_t)(void);
/* finalize the selected module */ /* finalize the selected module */
typedef void (*orte_notifier_base_module_finalize_fn_t)(void); typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
/* Log a failure message */ /* Log a failure message */
typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *message, ...); typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *msg, ...);
/* Log a failure that is based upon a show_help message */
typedef void (*orte_notifier_base_module_log_show_help_fn_t)(int priority, const char *file, const char *topic, ...);
/* /*
* Ver 1.0 * Ver 1.0
*/ */
struct orte_notifier_base_module_1_0_0_t { struct orte_notifier_base_module_1_0_0_t {
orte_notifier_base_module_init_fn_t init; orte_notifier_base_module_init_fn_t init;
orte_notifier_base_module_finalize_fn_t finalize; orte_notifier_base_module_finalize_fn_t finalize;
orte_notifier_base_module_log_fn_t log; orte_notifier_base_module_log_fn_t log;
orte_notifier_base_module_log_show_help_fn_t log_help;
}; };
typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t; typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t;

Просмотреть файл

@ -19,7 +19,6 @@
#include "orte_config.h" #include "orte_config.h"
#include "orte/constants.h" #include "orte/constants.h"
#include "orte/types.h"
#include <string.h> #include <string.h>
#ifdef HAVE_SYS_TIME_H #ifdef HAVE_SYS_TIME_H
@ -32,22 +31,7 @@
#include <stdarg.h> #include <stdarg.h>
#endif #endif
#include "opal/threads/condition.h" #include "opal/util/show_help.h"
#include "opal/util/bit_ops.h"
#include "opal/class/opal_hash_table.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/proc_info.h"
#include "orte/orted/orted.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/notifier/base/base.h" #include "orte/mca/notifier/base/base.h"
#include "notifier_syslog.h" #include "notifier_syslog.h"
@ -57,12 +41,14 @@
static int init(void); static int init(void);
static void finalize(void); static void finalize(void);
static void mylog(int priority, const char *msg, ...); static void mylog(int priority, const char *msg, ...);
static void myhelplog(int priority, const char *filename, const char *topic, ...);
/* Module def */ /* Module def */
orte_notifier_base_module_t orte_notifier_syslog_module = { orte_notifier_base_module_t orte_notifier_syslog_module = {
init, init,
finalize, finalize,
mylog mylog,
myhelplog
}; };
@ -70,7 +56,7 @@ static int init(void) {
int opts; int opts;
opts = LOG_CONS | LOG_PID | LOG_SYSLOG; opts = LOG_CONS | LOG_PID | LOG_SYSLOG;
openlog("OpenMPI Error Report:", opts, LOG_USER); openlog("Open MPI Error Report:", opts, LOG_USER);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
@ -88,3 +74,22 @@ static void mylog(int priority, const char *msg, ...)
vsyslog(priority, msg, arglist); vsyslog(priority, msg, arglist);
va_end(arglist); va_end(arglist);
} }
/*
 * Log a failure that is described by a show_help topic.
 *
 * Renders the help message identified by (filename, topic) with the
 * caller's variadic arguments via opal_show_help_vstring() and sends the
 * resulting text to syslog at the given priority.
 *
 * priority - syslog priority passed through unchanged to syslog()
 * filename - show_help file that contains the topic
 * topic    - topic within that file; also the last named argument, so the
 *            variadic arguments are whatever the topic's text consumes
 *
 * Nothing is logged when opal_show_help_vstring() returns NULL.
 */
static void myhelplog(int priority, const char *filename, const char *topic, ...)
{
    va_list arglist;
    char *output;

    /* NOTE(review): the 'false' argument presumably suppresses the error
     * header/banner that show_help adds for terminal output - confirm
     * against opal/util/show_help.h */
    va_start(arglist, topic);
    output = opal_show_help_vstring(filename, topic, false, arglist);
    va_end(arglist);

    /* if nothing came back, then nothing to do */
    if (NULL == output) {
        return;
    }

    /* The rendered text is data, not a format: it may contain '%'
     * characters (from the help file or from substituted arguments), so it
     * must never be handed to syslog() as the format string. */
    syslog(priority, "%s", output);
    free(output);
}