Add an opal/errhandler so opal-level errors can be up-leveled
This commit is contained in:
parent
efc4c93d7a
commit
a2243dcddd
@ -10,6 +10,7 @@
|
||||
# Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
# All rights reserved.
|
||||
# Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
@ -75,6 +76,7 @@ nobase_opal_HEADERS = $(headers)
|
||||
endif
|
||||
|
||||
include class/Makefile.am
|
||||
include errhandler/Makefile.am
|
||||
include memoryhooks/Makefile.am
|
||||
include runtime/Makefile.am
|
||||
include threads/Makefile.am
|
||||
|
17
opal/errhandler/Makefile.am
Normal file
17
opal/errhandler/Makefile.am
Normal file
@ -0,0 +1,17 @@
|
||||
# -*- makefile -*-
|
||||
#
|
||||
# Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# This makefile.am does not stand on its own - it is included from opal/Makefile.am
|
||||
|
||||
# Append the errhandler public header to the shared "headers" list of the
# including makefile (that list feeds nobase_opal_HEADERS there).
headers += \
|
||||
errhandler/opal_errhandler.h
|
||||
|
||||
# Append the errhandler implementation to the source list of the
# lib@OPAL_LIB_PREFIX@open_pal libtool library.
lib@OPAL_LIB_PREFIX@open_pal_la_SOURCES += \
|
||||
errhandler/opal_errhandler.c
|
34
opal/errhandler/opal_errhandler.c
Normal file
34
opal/errhandler/opal_errhandler.c
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/errhandler/opal_errhandler.h"
|
||||
|
||||
/*
 * Currently registered error handler and the opaque pointer that is
 * handed back to it on invocation.  Both are NULL while no handler is
 * registered.
 *
 * These are deliberately file-local: they are not declared in
 * opal_errhandler.h, all access goes through the register / deregister /
 * invoke functions below, and generic names such as "errhandler" and
 * "cbdata" must not leak into the library's global symbol namespace.
 */
static opal_errhandler_fn_t errhandler = NULL;
static void *cbdata = NULL;
|
||||
|
||||
/**
 * Record the error handler to be used by opal_invoke_errhandler().
 *
 * Any previously stored handler and callback data are overwritten.
 *
 * @param newerr  handler function to store (NULL means "no handler")
 * @param cbd     opaque pointer passed back to the handler as its
 *                last argument on every invocation
 */
void opal_register_errhandler(opal_errhandler_fn_t newerr, void *cbd)
{
    /* store the user data together with the handler it belongs to */
    cbdata = cbd;
    errhandler = newerr;
}
|
||||
|
||||
void opal_deregister_errhandler(void)
|
||||
{
|
||||
errhandler = NULL;
|
||||
cbdata = NULL;
|
||||
}
|
||||
|
||||
/**
 * Report an error to the registered handler, if there is one.
 *
 * @param status  error code forwarded unchanged to the handler
 * @param proc    process pointer forwarded unchanged to the handler
 *                (callers in this code base may pass NULL)
 */
void opal_invoke_errhandler(int status, opal_proc_t *proc)
{
    /* nothing registered - silently ignore the report */
    if (NULL == errhandler) {
        return;
    }

    /* hand the error up, along with the data given at registration */
    errhandler(status, proc, cbdata);
}
|
25
opal/errhandler/opal_errhandler.h
Normal file
25
opal/errhandler/opal_errhandler.h
Normal file
@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#ifndef OPAL_ERRHANDLER_H
|
||||
#define OPAL_ERRHANDLER_H
|
||||
|
||||
#include "opal_config.h"
|
||||
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
/**
 * Signature of an OPAL error handler callback.
 *
 * @param status  error code passed to opal_invoke_errhandler()
 * @param proc    process pointer passed to opal_invoke_errhandler()
 * @param cbdata  opaque pointer supplied to opal_register_errhandler()
 */
typedef void (*opal_errhandler_fn_t)(int status, opal_proc_t *proc, void *cbdata);
|
||||
|
||||
/**
 * Store the error handler (and the opaque cbdata to pass back to it)
 * that opal_invoke_errhandler() will call. Replaces any handler stored
 * by an earlier call.
 */
OPAL_DECLSPEC void opal_register_errhandler(opal_errhandler_fn_t errhandler, void *cbdata);
|
||||
|
||||
/**
 * Clear the stored error handler and its cbdata; subsequent
 * opal_invoke_errhandler() calls do nothing until a handler is
 * registered again.
 */
OPAL_DECLSPEC void opal_deregister_errhandler(void);
|
||||
|
||||
/**
 * Call the stored error handler with the given status and proc plus the
 * cbdata supplied at registration. Does nothing when no handler is stored.
 */
OPAL_DECLSPEC void opal_invoke_errhandler(int status, opal_proc_t *proc);
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -17,7 +17,6 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/mca_base_framework.h"
|
||||
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
@ -31,11 +30,6 @@ OPAL_DECLSPEC int opal_pmix_base_select(void);
|
||||
|
||||
OPAL_DECLSPEC extern bool opal_pmix_base_allow_delayed_server;
|
||||
|
||||
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err);
|
||||
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void);
|
||||
OPAL_DECLSPEC void opal_pmix_base_errhandler(int error);
|
||||
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif
|
||||
|
@ -2,7 +2,7 @@
|
||||
/*
|
||||
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -34,25 +34,6 @@
|
||||
|
||||
#define OPAL_PMI_PAD 10
|
||||
|
||||
static opal_pmix_errhandler_fn_t errhandler = NULL;
|
||||
|
||||
void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err)
|
||||
{
|
||||
errhandler = err;
|
||||
}
|
||||
|
||||
void opal_pmix_base_errhandler(int error)
|
||||
{
|
||||
if (NULL != errhandler) {
|
||||
errhandler(error);
|
||||
}
|
||||
}
|
||||
|
||||
void opal_pmix_base_deregister_handler(void)
|
||||
{
|
||||
errhandler = NULL;
|
||||
}
|
||||
|
||||
static char* setup_key(const opal_process_name_t* name, const char *key, int pmix_keylen_max);
|
||||
static char *pmi_encode(const void *val, size_t vallen);
|
||||
static uint8_t *pmi_decode (const char *data, size_t *retlen);
|
||||
|
@ -79,9 +79,7 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
|
||||
NULL,
|
||||
cray_spawn,
|
||||
cray_job_connect,
|
||||
cray_job_disconnect,
|
||||
NULL,
|
||||
NULL
|
||||
cray_job_disconnect
|
||||
};
|
||||
|
||||
// usage accounting
|
||||
|
@ -93,9 +93,7 @@ const opal_pmix_base_module_t opal_pmix_native_module = {
|
||||
native_get_attr_nb,
|
||||
native_spawn,
|
||||
native_job_connect,
|
||||
native_job_disconnect,
|
||||
opal_pmix_base_register_handler,
|
||||
opal_pmix_base_deregister_handler
|
||||
native_job_disconnect
|
||||
};
|
||||
|
||||
// local variables
|
||||
@ -196,7 +194,7 @@ static int native_init(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* we will connect on first send */
|
||||
/* we will connect on first send */
|
||||
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -21,6 +21,7 @@
|
||||
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
#include "opal/errhandler/opal_errhandler.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
#include "opal/mca/pmix/base/base.h"
|
||||
@ -45,7 +46,7 @@ typedef enum {
|
||||
#define PMIX_NATIVE_ABNORMAL_TERM \
|
||||
do { \
|
||||
mca_pmix_native_component.state = PMIX_USOCK_FAILED; \
|
||||
opal_pmix_base_errhandler(OPAL_ERR_COMM_FAILURE); \
|
||||
opal_invoke_errhandler(OPAL_ERR_COMM_FAILURE, NULL); \
|
||||
} while(0);
|
||||
|
||||
/* define a command type for communicating to the
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2014 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
|
||||
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
@ -21,7 +21,6 @@
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/runtime/opal.h"
|
||||
#include "opal/mca/dstore/dstore.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/util/error.h"
|
||||
#include "opal/util/proc.h"
|
||||
|
||||
@ -283,9 +282,6 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
|
||||
#define OPAL_FENCE(p, s, cf, cd) \
|
||||
opal_pmix.fence((p), (s));
|
||||
|
||||
/* callback handler for errors */
|
||||
typedef void (*opal_pmix_errhandler_fn_t)(int error);
|
||||
|
||||
/**** DEFINE THE PUBLIC API'S ****
|
||||
**** NOTE THAT WE DO NOT HAVE A 1:1 MAPPING OF APIs ****
|
||||
**** HERE TO THOSE CURRENTLY DEFINED BY PMI AS WE ****
|
||||
@ -402,12 +398,6 @@ typedef int (*opal_pmix_base_module_job_connect_fn_t)(const char jobId[]);
|
||||
typedef int (*opal_pmix_base_module_job_disconnect_fn_t)(const char jobId[]);
|
||||
|
||||
|
||||
/* register an errhandler to report loss of connection to the server */
|
||||
typedef void (*opal_pmix_base_module_register_fn_t)(opal_pmix_errhandler_fn_t errhandler);
|
||||
|
||||
/* deregister the errhandler */
|
||||
typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
|
||||
|
||||
/*
|
||||
* the standard public API data structure
|
||||
*/
|
||||
@ -431,9 +421,6 @@ typedef struct {
|
||||
opal_pmix_base_module_spawn_fn_t spawn;
|
||||
opal_pmix_base_module_job_connect_fn_t job_connect;
|
||||
opal_pmix_base_module_job_disconnect_fn_t job_disconnect;
|
||||
/* register the errhandler */
|
||||
opal_pmix_base_module_register_fn_t register_errhandler;
|
||||
opal_pmix_base_module_deregister_fn_t deregister_errhandler;
|
||||
} opal_pmix_base_module_t;
|
||||
|
||||
typedef struct {
|
||||
|
@ -73,9 +73,7 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
|
||||
NULL,
|
||||
s1_spawn,
|
||||
s1_job_connect,
|
||||
s1_job_disconnect,
|
||||
NULL,
|
||||
NULL
|
||||
s1_job_disconnect
|
||||
};
|
||||
|
||||
// usage accounting
|
||||
|
@ -81,9 +81,7 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
|
||||
NULL,
|
||||
s2_spawn,
|
||||
s2_job_connect,
|
||||
s2_job_disconnect,
|
||||
NULL,
|
||||
NULL
|
||||
s2_job_disconnect
|
||||
};
|
||||
|
||||
// usage accounting
|
||||
|
@ -9,6 +9,7 @@
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2015 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -26,7 +27,7 @@
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "opal/errhandler/opal_errhandler.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
@ -43,17 +44,17 @@
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code);
|
||||
static int abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code);
|
||||
|
||||
/******************
|
||||
* HNP module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
@ -68,7 +69,7 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = {
|
||||
};
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
static void pmix_error(int error)
|
||||
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
|
||||
{
|
||||
/* push it into our event base */
|
||||
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
|
||||
@ -77,25 +78,20 @@ static void pmix_error(int error)
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
static int init(void)
|
||||
{
|
||||
/* setup state machine to trap proc errors */
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
||||
|
||||
/* register an errhandler with the PMIx framework so
|
||||
* we can know of loss of connection to the server */
|
||||
if (NULL != opal_pmix.register_errhandler) {
|
||||
opal_pmix.register_errhandler(pmix_error);
|
||||
}
|
||||
/* register an errhandler */
|
||||
opal_register_errhandler(pmix_error, NULL);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
if (NULL != opal_pmix.deregister_errhandler) {
|
||||
opal_pmix.deregister_errhandler();
|
||||
}
|
||||
opal_deregister_errhandler();
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -108,15 +104,15 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
opal_pointer_array_t errors;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_app: proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&caddy->name),
|
||||
orte_proc_state_to_str(caddy->proc_state)));
|
||||
"%s errmgr:default_app: proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&caddy->name),
|
||||
orte_proc_state_to_str(caddy->proc_state)));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
if (orte_finalizing) {
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
@ -141,8 +137,8 @@ static void proc_errors(int fd, short args, void *cbdata)
|
||||
/* flag that we must abnormally terminate as far as the
|
||||
* RTE is concerned
|
||||
*/
|
||||
orte_abnormal_term_ordered = true;
|
||||
} else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||
orte_abnormal_term_ordered = true;
|
||||
} else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
|
||||
/* we need to die, so mark us so */
|
||||
orte_abnormal_term_ordered = true;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user