Update the FTB notifier with respect to the events decided by the CIFTS working group
This commit was SVN r24001.
Этот коммит содержится в:
родитель
ac2768ca7c
Коммит
d1a4cc33dd
@ -15,23 +15,15 @@ start
|
|||||||
|
|
||||||
ftb.mpi.openmpi
|
ftb.mpi.openmpi
|
||||||
|
|
||||||
MPI_INIT INFO
|
FTB_MPI_PROCS_DEAD ERROR
|
||||||
MPI_FINALIZE INFO
|
FTB_MPI_PROCS_UNREACHABLE ERROR
|
||||||
MPI_NODE_DEAD ERROR
|
FTB_MPI_PROCS_COMM_ERROR WARN
|
||||||
MPI_NODE_RESTORED INFO
|
FTB_MPI_PROCS_MIGRATED INFO
|
||||||
MPI_RANK_DEAD ERROR
|
FTB_MPI_PROCS_MIGRATE_FAIL ERROR
|
||||||
MPI_RANK_RESTORED INFO
|
FTB_MPI_PROCS_CKPTED INFO
|
||||||
MPI_NODE_MIGRATE_DONE INFO
|
FTB_MPI_PROCS_CKPT_FAIL ERROR
|
||||||
MPI_JOB_ABORT_CMD ERROR
|
FTB_MPI_PROCS_RESTARTED INFO
|
||||||
MPI_JOB_RESUME_CMD INFO
|
FTB_MPI_PROCS_RESTART_FAIL ERROR
|
||||||
MPI_JOB_ABORTED ERROR
|
FTB_MPI_PROCS_ABORTED ERROR
|
||||||
MPI_JOB_RESUMED INFO
|
|
||||||
MPI_MSG_CORRUPT ERROR
|
|
||||||
MPI_IFACE_DEAD ERROR
|
|
||||||
MPI_IFACE_RESTORED ERROR
|
|
||||||
MPI_UNKNOWN_ERROR ERROR
|
|
||||||
MPI_OUT_OF_RESOURCE ERROR
|
|
||||||
MPI_NODE_UNREACHABLE ERROR
|
|
||||||
MPI_COMM_FAILURE ERROR
|
|
||||||
|
|
||||||
end
|
end
|
@ -53,8 +53,8 @@ typedef enum {
|
|||||||
FTB_EVENT_RESPONSE = 2
|
FTB_EVENT_RESPONSE = 2
|
||||||
} ftb_event_type_t;
|
} ftb_event_type_t;
|
||||||
|
|
||||||
/* Macro that returns FTB event name given the FTB event code */
|
/* Returns the FTB event name (as a string) given the event code */
|
||||||
#define FTB_ERROR(errnum) #errnum
|
#define FTB_EVENT(errnum) #errnum
|
||||||
|
|
||||||
END_C_DECLS
|
END_C_DECLS
|
||||||
|
|
||||||
|
@ -119,30 +119,31 @@ static const char* get_ftb_event_severity(orte_notifier_base_severity_t severity
|
|||||||
static const char* get_ftb_event_name(int errnum)
|
static const char* get_ftb_event_name(int errnum)
|
||||||
{
|
{
|
||||||
/* If it is an OMPI error, translate it to an equivalent FTB event */
|
/* If it is an OMPI error, translate it to an equivalent FTB event */
|
||||||
if (OPAL_SUCCESS > errnum) {
|
if (ORTE_SUCCESS > errnum) {
|
||||||
switch (errnum) {
|
switch (errnum) {
|
||||||
case ORTE_ERR_OUT_OF_RESOURCE:
|
|
||||||
case ORTE_ERR_TEMP_OUT_OF_RESOURCE:
|
case ORTE_SNAPC_CKPT_STATE_ESTABLISHED:
|
||||||
return FTB_ERROR(MPI_OUT_OF_RESOURCE);
|
case ORTE_SNAPC_CKPT_STATE_RECOVERED:
|
||||||
|
return FTB_EVENT(FTB_MPI_PROCS_CKPTED);
|
||||||
|
|
||||||
|
case ORTE_SNAPC_CKPT_STATE_NO_CKPT:
|
||||||
|
case ORTE_SNAPC_CKPT_STATE_ERROR:
|
||||||
|
return FTB_EVENT(FTB_MPI_PROCS_CKPT_FAIL);
|
||||||
|
|
||||||
case ORTE_ERR_CONNECTION_REFUSED:
|
case ORTE_ERR_CONNECTION_REFUSED:
|
||||||
case ORTE_ERR_CONNECTION_FAILED:
|
case ORTE_ERR_CONNECTION_FAILED:
|
||||||
case ORTE_ERR_UNREACH:
|
case ORTE_ERR_UNREACH:
|
||||||
return FTB_ERROR(MPI_NODE_DEAD);
|
return FTB_EVENT(FTB_MPI_PROCS_UNREACHABLE);
|
||||||
|
|
||||||
case ORTE_ERR_COMM_FAILURE:
|
case ORTE_ERR_COMM_FAILURE:
|
||||||
return FTB_ERROR(MPI_COMM_FAILURE);
|
return FTB_EVENT(FTB_MPI_PROCS_COMM_ERROR);
|
||||||
|
|
||||||
case ORTE_ERR_PROC_DEAD:
|
|
||||||
return FTB_ERROR(MPI_RANK_DEAD);
|
|
||||||
|
|
||||||
case ORTE_ERR_FATAL:
|
|
||||||
default:
|
default:
|
||||||
return FTB_ERROR(MPI_UNKNOWN_ERROR);
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return FTB_ERROR(MPI_UNKNOWN_ERROR);
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcode, char *payload)
|
static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcode, char *payload)
|
||||||
@ -162,12 +163,14 @@ static void publish_ftb_event(orte_notifier_base_severity_t severity, int errcod
|
|||||||
|
|
||||||
/* Publish the event to the Fault Tolerant Backplane */
|
/* Publish the event to the Fault Tolerant Backplane */
|
||||||
event_name = get_ftb_event_name(errcode);
|
event_name = get_ftb_event_name(errcode);
|
||||||
|
if (NULL != event_name) {
|
||||||
ret = FTB_Publish(ftb_client_handle, event_name, &eprop, &ehandle);
|
ret = FTB_Publish(ftb_client_handle, event_name, &eprop, &ehandle);
|
||||||
if (FTB_SUCCESS != ret) {
|
if (FTB_SUCCESS != ret) {
|
||||||
orte_show_help("help-orte-notifier-ftb.txt", "publish failed", true,
|
orte_show_help("help-orte-notifier-ftb.txt", "publish failed", true,
|
||||||
"FTB_Publish() failed", ret, get_ftb_event_severity(severity),
|
"FTB_Publish() failed", ret, get_ftb_event_severity(severity),
|
||||||
event_name, payload, errcode);
|
event_name, payload, errcode);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
|
static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const char *msg,
|
||||||
@ -175,7 +178,6 @@ static void ftb_log(orte_notifier_base_severity_t severity, int errcode, const c
|
|||||||
{
|
{
|
||||||
char *payload;
|
char *payload;
|
||||||
|
|
||||||
/* If there was a message, output it */
|
|
||||||
vasprintf(&payload, msg, ap);
|
vasprintf(&payload, msg, ap);
|
||||||
if (NULL != payload) {
|
if (NULL != payload) {
|
||||||
publish_ftb_event(severity, errcode, payload);
|
publish_ftb_event(severity, errcode, payload);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user