Update the notifier framework in prep for move to v1.3. Add an API to handle the case where error messages have been expressed via "show_help" so they can look similar to what was presented to users. Add three key calls in the openib btl to drop messages into syslog.
This will sit in trunk for a few days - would like to actually see some errors reported to syslog before moving the code to 1.3. This commit was SVN r19986.
Этот коммит содержится в:
родитель
a48b2d45be
Коммит
ce26e3a2fb
@ -64,6 +64,7 @@ const char *ibv_get_sysfs_path(void);
|
|||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include "orte/util/name_fns.h"
|
#include "orte/util/name_fns.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
|
#include "orte/mca/notifier/notifier.h"
|
||||||
|
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
#include "ompi/mca/pml/pml.h"
|
#include "ompi/mca/pml/pml.h"
|
||||||
@ -2842,6 +2843,23 @@ error:
|
|||||||
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||||
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||||
wc->status, wc->wr_id, wc->opcode, qp));
|
wc->status, wc->wr_id, wc->opcode, qp));
|
||||||
|
if (NULL == remote_proc) {
|
||||||
|
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error "
|
||||||
|
"communicating to unknown proc/node:\n\tpolling %s with status %s "
|
||||||
|
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
|
||||||
|
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||||
|
wc->status, wc->wr_id, wc->opcode, qp);
|
||||||
|
} else {
|
||||||
|
orte_notifier.log(ORTE_NOTIFIER_INFRA, "Proc %s on node %s encountered IB error while "
|
||||||
|
"communicating to proc %s on node %s:\n\tpolling %s with status %s "
|
||||||
|
"status number %d for wr_id %llu opcode %d qp_idx %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), orte_process_info.nodename,
|
||||||
|
ORTE_NAME_PRINT(&remote_proc->proc_name),
|
||||||
|
(NULL == remote_proc->proc_hostname) ? "UNKNOWN" : remote_proc->proc_hostname,
|
||||||
|
cq_name[cq], btl_openib_component_status_to_string(wc->status),
|
||||||
|
wc->status, wc->wr_id, wc->opcode, qp);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
if (IBV_WC_RNR_RETRY_EXC_ERR == wc->status ||
|
||||||
@ -2860,11 +2878,23 @@ error:
|
|||||||
"srq rnr retry exceeded", true,
|
"srq rnr retry exceeded", true,
|
||||||
orte_process_info.nodename, device_name,
|
orte_process_info.nodename, device_name,
|
||||||
peer_hostname);
|
peer_hostname);
|
||||||
|
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
|
||||||
|
"help-mpi-btl-openib.txt",
|
||||||
|
BTL_OPENIB_QP_TYPE_PP(qp) ?
|
||||||
|
"pp rnr retry exceeded" :
|
||||||
|
"srq rnr retry exceeded",
|
||||||
|
orte_process_info.nodename, device_name,
|
||||||
|
peer_hostname);
|
||||||
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
} else if (IBV_WC_RETRY_EXC_ERR == wc->status) {
|
||||||
orte_show_help("help-mpi-btl-openib.txt",
|
orte_show_help("help-mpi-btl-openib.txt",
|
||||||
"pp retry exceeded", true,
|
"pp retry exceeded", true,
|
||||||
orte_process_info.nodename,
|
orte_process_info.nodename,
|
||||||
device_name, peer_hostname);
|
device_name, peer_hostname);
|
||||||
|
orte_notifier.log_help(ORTE_NOTIFIER_INFRA,
|
||||||
|
"help-mpi-btl-openib.txt",
|
||||||
|
"pp retry exceeded",
|
||||||
|
orte_process_info.nodename,
|
||||||
|
device_name, peer_hostname);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,34 +40,38 @@
|
|||||||
#include <syslog.h>
|
#include <syslog.h>
|
||||||
#endif /* HAVE_SYSLOG_H */
|
#endif /* HAVE_SYSLOG_H */
|
||||||
|
|
||||||
#ifdef HAVE_STDARG_H
|
|
||||||
#include <stdarg.h>
|
|
||||||
#endif /* HAVE_STDARG_H */
|
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
|
|
||||||
BEGIN_C_DECLS
|
BEGIN_C_DECLS
|
||||||
|
|
||||||
|
/* define priorities - this will eventually be replaced by OPAL_SOS priorities */
|
||||||
|
#define ORTE_NOTIFIER_INFRA LOG_CRIT
|
||||||
|
#define ORTE_NOTIFIER_WARNING LOG_WARNING
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Component functions - all MUST be provided!
|
* Component functions - all MUST be provided!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* initialize the selected module */
|
/* initialize the selected module */
|
||||||
typedef int (*orte_notifier_base_module_init_fn_t)(void);
|
typedef int (*orte_notifier_base_module_init_fn_t)(void);
|
||||||
|
|
||||||
/* finalize the selected module */
|
/* finalize the selected module */
|
||||||
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
|
typedef void (*orte_notifier_base_module_finalize_fn_t)(void);
|
||||||
|
|
||||||
/* Log a failure message */
|
/* Log a failure message */
|
||||||
typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *message, ...);
|
typedef void (*orte_notifier_base_module_log_fn_t)(int priority, const char *msg, ...);
|
||||||
|
|
||||||
|
/* Log a failure that is based upon a show_help message */
|
||||||
|
typedef void (*orte_notifier_base_module_log_show_help_fn_t)(int priority, const char *file, const char *topic, ...);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ver 1.0
|
* Ver 1.0
|
||||||
*/
|
*/
|
||||||
struct orte_notifier_base_module_1_0_0_t {
|
struct orte_notifier_base_module_1_0_0_t {
|
||||||
orte_notifier_base_module_init_fn_t init;
|
orte_notifier_base_module_init_fn_t init;
|
||||||
orte_notifier_base_module_finalize_fn_t finalize;
|
orte_notifier_base_module_finalize_fn_t finalize;
|
||||||
orte_notifier_base_module_log_fn_t log;
|
orte_notifier_base_module_log_fn_t log;
|
||||||
|
orte_notifier_base_module_log_show_help_fn_t log_help;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t;
|
typedef struct orte_notifier_base_module_1_0_0_t orte_notifier_base_module_1_0_0_t;
|
||||||
|
@ -19,7 +19,6 @@
|
|||||||
|
|
||||||
#include "orte_config.h"
|
#include "orte_config.h"
|
||||||
#include "orte/constants.h"
|
#include "orte/constants.h"
|
||||||
#include "orte/types.h"
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#ifdef HAVE_SYS_TIME_H
|
#ifdef HAVE_SYS_TIME_H
|
||||||
@ -32,22 +31,7 @@
|
|||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "opal/threads/condition.h"
|
#include "opal/util/show_help.h"
|
||||||
#include "opal/util/bit_ops.h"
|
|
||||||
#include "opal/class/opal_hash_table.h"
|
|
||||||
#include "opal/dss/dss.h"
|
|
||||||
|
|
||||||
|
|
||||||
#include "orte/mca/errmgr/errmgr.h"
|
|
||||||
#include "orte/mca/ess/ess.h"
|
|
||||||
#include "orte/mca/odls/odls_types.h"
|
|
||||||
#include "orte/mca/rml/rml.h"
|
|
||||||
#include "orte/util/name_fns.h"
|
|
||||||
#include "orte/util/show_help.h"
|
|
||||||
#include "orte/util/proc_info.h"
|
|
||||||
#include "orte/orted/orted.h"
|
|
||||||
#include "orte/runtime/orte_wait.h"
|
|
||||||
#include "orte/runtime/orte_globals.h"
|
|
||||||
|
|
||||||
#include "orte/mca/notifier/base/base.h"
|
#include "orte/mca/notifier/base/base.h"
|
||||||
#include "notifier_syslog.h"
|
#include "notifier_syslog.h"
|
||||||
@ -57,12 +41,14 @@
|
|||||||
static int init(void);
|
static int init(void);
|
||||||
static void finalize(void);
|
static void finalize(void);
|
||||||
static void mylog(int priority, const char *msg, ...);
|
static void mylog(int priority, const char *msg, ...);
|
||||||
|
static void myhelplog(int priority, const char *filename, const char *topic, ...);
|
||||||
|
|
||||||
/* Module def */
|
/* Module def */
|
||||||
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
orte_notifier_base_module_t orte_notifier_syslog_module = {
|
||||||
init,
|
init,
|
||||||
finalize,
|
finalize,
|
||||||
mylog
|
mylog,
|
||||||
|
myhelplog
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -70,7 +56,7 @@ static int init(void) {
|
|||||||
int opts;
|
int opts;
|
||||||
|
|
||||||
opts = LOG_CONS | LOG_PID | LOG_SYSLOG;
|
opts = LOG_CONS | LOG_PID | LOG_SYSLOG;
|
||||||
openlog("OpenMPI Error Report:", opts, LOG_USER);
|
openlog("Open MPI Error Report:", opts, LOG_USER);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
@ -88,3 +74,22 @@ static void mylog(int priority, const char *msg, ...)
|
|||||||
vsyslog(priority, msg, arglist);
|
vsyslog(priority, msg, arglist);
|
||||||
va_end(arglist);
|
va_end(arglist);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Log a failure that is expressed as a show_help topic.
 *
 * The (filename, topic) pair and the trailing variadic arguments are passed
 * through to opal_show_help_vstring(), and the string it returns is written
 * to syslog at the given priority and then freed.  If no string comes back,
 * nothing is logged.
 *
 * priority  syslog priority to log at
 * filename  name of the help file containing the topic
 * topic     topic within the help file
 * ...       arguments forwarded to opal_show_help_vstring()
 */
static void myhelplog(int priority, const char *filename, const char *topic, ...)
{
    va_list arglist;
    char *output;

    va_start(arglist, topic);
    output = opal_show_help_vstring(filename, topic, false, arglist);
    va_end(arglist);

    /* if nothing came back, then nothing to do */
    if (NULL == output) {
        return;
    }

    /* The rendered text is data, not a format string: it may contain '%'
     * characters, so it must never be handed to syslog() as the format
     * argument.  Pass it through an explicit "%s" instead. */
    syslog(priority, "%s", output);
    free(output);
}
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user