Update the notifier framework in prep for move to v1.3. Add an API to handle the case where error messages have been expressed via "show_help" so they can look similar to what was presented to users. Add three key calls in the openib btl to drop messages into syslog.
This will sit in trunk for a few days - would like to actually see some errors reported to syslog before moving the code to 1.3. This commit was SVN r19986.
Этот коммит содержится в:
родитель
a48b2d45be
Коммит
ce26e3a2fb
@ -64,6 +64,7 @@ const char *ibv_get_sysfs_path(void);
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
|
||||
#include "ompi/proc/proc.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
@ -2842,6 +2843,23 @@ error:
|
||||
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id, wc->opcode, qp));
|
||||
if (NULL == remote_proc) {
|
||||
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error "
|
||||
"communicating to unknown proc/node:\n\tpolling %s with status %s "
|
||||
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id, wc->opcode, qp);
|
||||
} else {
|
||||
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error while "
|
||||
"communicating to proc %s on node %s:\n\tpolling %s with status %s "
|
||||
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
|
||||
ORTE_NAME_PRINT(&remote_proc->proc_name),
|
||||
(NULL == remote_proc->proc_hostname) ? "UNKNOWN" : remote_proc->proc_hostname,
|
||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||
wc->status, wc->wr_id, wc->opcode, qp);
|
||||
}
|
||||
}
|
||||
|
||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||
@ -2860,11 +2878,23 @@ error:
|
||||
"srq rnr retry exceeded", true,
|
||||
orte_process_info.nodename, device_name,
|
||||
peer_hostname);
|
||||
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
|
||||
"help-mpi-btl-openib.txt",
|
||||
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||
"pp rnr retry exceeded" :
|
||||
"srq rnr retry exceeded",
|
||||
orte_process_info.nodename, device_name,
|
||||
peer_hostname);
|
||||
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||
orte_show_help("help-mpi-btl-openib.txt",
|
||||
"pp retry exceeded", true,
|
||||
orte_process_info.nodename,
|
||||
device_name, peer_hostname);
|
||||
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
|
||||
"help-mpi-btl-openib.txt",
|
||||
"pp retry exceeded",
|
||||
orte_process_info.nodename,
|
||||
device_name, peer_hostname);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -40,34 +40,38 @@
|
||||
#include <syslog.h>
|
||||
#endif /* HAVE_SYSLOG_H */
|
||||
|
||||
#ifdef HAVE_STDARG_H
|
||||
#include <stdarg.h>
|
||||
#endif /* HAVE_STDARG_H */
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/* define priorities - this will eventually be replaced by OPAL_SOS priorities */
|
||||
#define ORTE_NOTIFIER_INFRA LOG_CRIT
|
||||
#define ORTE_NOTIFIER_WARNING LOG_WARNING
|
||||
|
||||
/*
|
||||
* Component functions - all MUST be provided!
|
||||
*/
|
||||
|
||||
/* initialize the selected module */
|
||||
typedef int (*orte_notifier_base_module_init_fn_t)(void);
|
||||
|
||||
|
||||
/* finalize the selected module */
|
||||
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
|
||||
|
||||
/* Log a failure message */
|
||||
typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *message, ...);
|
||||
typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *msg, ...);
|
||||
|
||||
/* Log a failure that is based upon a show_help message */
|
||||
typedef void (*orte_notifier_base_module_log_show_help_fn_t)(int priority, const char *file, const char *topic, ...);
|
||||
|
||||
/*
|
||||
* Ver 1.0
|
||||
*/
|
||||
struct orte_notifier_base_module_1_0_0_t {
|
||||
orte_notifier_base_module_init_fn_t init;
|
||||
orte_notifier_base_module_finalize_fn_t finalize;
|
||||
orte_notifier_base_module_log_fn_t log;
|
||||
orte_notifier_base_module_init_fn_t init;
|
||||
orte_notifier_base_module_finalize_fn_t finalize;
|
||||
orte_notifier_base_module_log_fn_t log;
|
||||
orte_notifier_base_module_log_show_help_fn_t log_help;
|
||||
};
|
||||
|
||||
typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t;
|
||||
|
@ -19,7 +19,6 @@
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#include <string.h>
|
||||
#ifdef HAVE_SYS_TIME_H
|
||||
@ -32,22 +31,7 @@
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
#include "opal/threads/condition.h"
|
||||
#include "opal/util/bit_ops.h"
|
||||
#include "opal/class/opal_hash_table.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/orted/orted.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "opal/util/show_help.h"
|
||||
|
||||
#include "orte/mca/notifier/base/base.h"
|
||||
#include "notifier_syslog.h"
|
||||
@ -57,12 +41,14 @@
|
||||
static int init(void);
|
||||
static void finalize(void);
|
||||
static void mylog(int priority, const char *msg, ...);
|
||||
static void myhelplog(int priority, const char *filename, const char *topic, ...);
|
||||
|
||||
/* Module def */
|
||||
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
||||
init,
|
||||
finalize,
|
||||
mylog
|
||||
init,
|
||||
finalize,
|
||||
mylog,
|
||||
myhelplog
|
||||
};
|
||||
|
||||
|
||||
@ -70,7 +56,7 @@ static int init(void) {
|
||||
int opts;
|
||||
|
||||
opts = LOG_CONS | LOG_PID | LOG_SYSLOG;
|
||||
openlog("OpenMPI Error Report:", opts, LOG_USER);
|
||||
openlog("Open MPI Error Report:", opts, LOG_USER);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -88,3 +74,22 @@ static void mylog(int priority, const char *msg, ...)
|
||||
vsyslog(priority, msg, arglist);
|
||||
va_end(arglist);
|
||||
}
|
||||
|
||||
/*
 * Log a failure that is described by a show_help topic.
 *
 * Renders the help message identified by (filename, topic) with the
 * supplied variadic arguments via opal_show_help_vstring() and sends the
 * resulting text to syslog at the given priority.
 *
 * priority - syslog priority to log at
 * filename - name of the help file containing the topic
 * topic    - topic within the help file to render
 * ...      - arguments consumed by the topic's format
 *
 * If opal_show_help_vstring() returns NULL, nothing is logged.
 */
static void myhelplog(int priority, const char *filename, const char *topic, ...)
{
    va_list arglist;
    char *output;

    va_start(arglist, topic);
    /* NOTE(review): the 'false' flag presumably suppresses the decorative
     * error header around the message - confirm against
     * opal_show_help_vstring(). */
    output = opal_show_help_vstring(filename, topic, false, arglist);
    va_end(arglist);

    /* if nothing came back, then nothing to do */
    if (NULL == output) {
        return;
    }

    /* The rendered text is data, not a format: any '%' left in it (e.g.
     * from a substituted hostname or a literal percent in the help file)
     * must not be interpreted as a conversion specification. */
    syslog(priority, "%s", output);
    free(output);
}
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user