1
1

Add an opal/errhandler so opal-level errors can be up-leveled

This commit is contained in:
Ralph Castain 2015-07-11 07:09:11 -07:00
parent efc4c93d7a
commit a2243dcddd
13 changed files with 110 additions and 81 deletions

View File

@ -10,6 +10,7 @@
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
@ -75,6 +76,7 @@ nobase_opal_HEADERS = $(headers)
endif
include class/Makefile.am
include errhandler/Makefile.am
include memoryhooks/Makefile.am
include runtime/Makefile.am
include threads/Makefile.am

View File

@ -0,0 +1,17 @@
# -*- makefile -*-
#
# Copyright (c) 2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This makefile.am does not stand on its own - it is included from opal/Makefile.am
# Add the errhandler public header to the header list of the including makefile
headers += \
errhandler/opal_errhandler.h
# Add the errhandler implementation to the OPAL library sources
lib@OPAL_LIB_PREFIX@open_pal_la_SOURCES += \
errhandler/opal_errhandler.c

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "opal_config.h"
#include "opal/errhandler/opal_errhandler.h"
/*
 * The currently registered OPAL-level error handler and the opaque pointer
 * that is handed back to it on every invocation.  Both are NULL while no
 * handler is registered.
 *
 * These are implementation details of this file: the header does not
 * declare them, and all access goes through the register / deregister /
 * invoke functions below.  They therefore get internal linkage instead of
 * exporting the very generic names "errhandler" and "cbdata" from the
 * library, where they could clash with identically named symbols elsewhere.
 */
static opal_errhandler_fn_t errhandler = NULL;
static void *cbdata = NULL;
/*
 * Install newerr as the registered error handler, replacing whatever was
 * registered before.  cbd is stored untouched and later passed to the
 * handler as its final argument.
 */
void opal_register_errhandler(opal_errhandler_fn_t newerr, void *cbd)
{
    errhandler = newerr;
    cbdata     = cbd;
}
/*
 * Forget the registered error handler and its callback data by resetting
 * both to NULL.
 */
void opal_deregister_errhandler(void)
{
    errhandler = NULL;
    cbdata     = NULL;
}
/*
 * Report an error to the registered handler, if there is one.
 *
 * status and proc are forwarded unchanged, followed by the callback data
 * that was supplied at registration time.  With no handler registered the
 * call is a no-op.
 */
void opal_invoke_errhandler(int status, opal_proc_t *proc)
{
    /* nobody is listening - nothing to do */
    if (NULL == errhandler) {
        return;
    }

    errhandler(status, proc, cbdata);
}

View File

@ -0,0 +1,25 @@
/*
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#ifndef OPAL_ERRHANDLER_H
#define OPAL_ERRHANDLER_H

#include "opal_config.h"

#include "opal/util/proc.h"

/* keep C linkage when this header is pulled into a C++ translation unit */
BEGIN_C_DECLS

/**
 * Signature of an OPAL-level error handler.
 *
 * @param status  error code being reported
 * @param proc    process the error relates to; callers may pass NULL
 * @param cbdata  the pointer that was given to opal_register_errhandler()
 */
typedef void (*opal_errhandler_fn_t)(int status, opal_proc_t *proc, void *cbdata);

/**
 * Register the error handler, replacing any previously registered one.
 *
 * @param errhandler  function to call from opal_invoke_errhandler()
 * @param cbdata      opaque pointer passed back to the handler on each call
 */
OPAL_DECLSPEC void opal_register_errhandler(opal_errhandler_fn_t errhandler, void *cbdata);

/**
 * Remove the registered error handler and its callback data.
 */
OPAL_DECLSPEC void opal_deregister_errhandler(void);

/**
 * Call the registered error handler, if any, with the given status and
 * process.  Does nothing when no handler is registered.
 *
 * @param status  error code to report
 * @param proc    process the error relates to; may be NULL
 */
OPAL_DECLSPEC void opal_invoke_errhandler(int status, opal_proc_t *proc);

END_C_DECLS

#endif

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -17,7 +17,6 @@
#include "opal/mca/mca.h"
#include "opal/mca/base/mca_base_framework.h"
#include "opal/mca/pmix/pmix.h"
BEGIN_C_DECLS
@ -31,11 +30,6 @@ OPAL_DECLSPEC int opal_pmix_base_select(void);
OPAL_DECLSPEC extern bool opal_pmix_base_allow_delayed_server;
OPAL_DECLSPEC void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err);
OPAL_DECLSPEC void opal_pmix_base_deregister_handler(void);
OPAL_DECLSPEC void opal_pmix_base_errhandler(int error);
END_C_DECLS
#endif

View File

@ -2,7 +2,7 @@
/*
* Copyright (c) 2012-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
@ -34,25 +34,6 @@
#define OPAL_PMI_PAD 10
/* error callback currently registered with the PMIx base; NULL when none is set */
static opal_pmix_errhandler_fn_t errhandler = NULL;
/*
 * Store err as the callback to be used when an error is reported,
 * overwriting any callback stored earlier.
 */
void opal_pmix_base_register_handler(opal_pmix_errhandler_fn_t err)
{
    errhandler = err;
}
/*
 * Pass the error code to the stored callback.  When no callback is
 * stored the error is dropped without further action.
 */
void opal_pmix_base_errhandler(int error)
{
    if (NULL == errhandler) {
        return;
    }

    errhandler(error);
}
/*
 * Clear the stored error callback so that later error reports are ignored.
 */
void opal_pmix_base_deregister_handler(void)
{
    errhandler = NULL;
}
static char* setup_key(const opal_process_name_t* name, const char *key, int pmix_keylen_max);
static char *pmi_encode(const void *val, size_t vallen);
static uint8_t *pmi_decode (const char *data, size_t *retlen);

View File

@ -79,9 +79,7 @@ const opal_pmix_base_module_t opal_pmix_cray_module = {
NULL,
cray_spawn,
cray_job_connect,
cray_job_disconnect,
NULL,
NULL
cray_job_disconnect
};
// usage accounting

View File

@ -93,9 +93,7 @@ const opal_pmix_base_module_t opal_pmix_native_module = {
native_get_attr_nb,
native_spawn,
native_job_connect,
native_job_disconnect,
opal_pmix_base_register_handler,
opal_pmix_base_deregister_handler
native_job_disconnect
};
// local variables
@ -196,7 +194,7 @@ static int native_init(void)
}
}
/* we will connect on first send */
/* we will connect on first send */
return OPAL_SUCCESS;
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -21,6 +21,7 @@
#include "opal/mca/mca.h"
#include "opal/mca/event/event.h"
#include "opal/errhandler/opal_errhandler.h"
#include "opal/util/proc.h"
#include "opal/mca/pmix/base/base.h"
@ -45,7 +46,7 @@ typedef enum {
#define PMIX_NATIVE_ABNORMAL_TERM \
do { \
mca_pmix_native_component.state = PMIX_USOCK_FAILED; \
opal_pmix_base_errhandler(OPAL_ERR_COMM_FAILURE); \
opal_invoke_errhandler(OPAL_ERR_COMM_FAILURE, NULL); \
} while(0);
/* define a command type for communicating to the

View File

@ -1,6 +1,6 @@
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
* Copyright (c) 2014 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -21,7 +21,6 @@
#include "opal/dss/dss.h"
#include "opal/runtime/opal.h"
#include "opal/mca/dstore/dstore.h"
#include "opal/dss/dss.h"
#include "opal/util/error.h"
#include "opal/util/proc.h"
@ -283,9 +282,6 @@ typedef void (*opal_pmix_cbfunc_t)(int status, opal_value_t *kv, void *cbdata);
#define OPAL_FENCE(p, s, cf, cd) \
opal_pmix.fence((p), (s));
/* callback handler for errors */
typedef void (*opal_pmix_errhandler_fn_t)(int error);
/**** DEFINE THE PUBLIC API'S ****
**** NOTE THAT WE DO NOT HAVE A 1:1 MAPPING OF APIs ****
**** HERE TO THOSE CURRENTLY DEFINED BY PMI AS WE ****
@ -402,12 +398,6 @@ typedef int (*opal_pmix_base_module_job_connect_fn_t)(const char jobId[]);
typedef int (*opal_pmix_base_module_job_disconnect_fn_t)(const char jobId[]);
/* register an errhandler to report loss of connection to the server */
typedef void (*opal_pmix_base_module_register_fn_t)(opal_pmix_errhandler_fn_t errhandler);
/* deregister the errhandler */
typedef void (*opal_pmix_base_module_deregister_fn_t)(void);
/*
* the standard public API data structure
*/
@ -431,9 +421,6 @@ typedef struct {
opal_pmix_base_module_spawn_fn_t spawn;
opal_pmix_base_module_job_connect_fn_t job_connect;
opal_pmix_base_module_job_disconnect_fn_t job_disconnect;
/* register the errhandler */
opal_pmix_base_module_register_fn_t register_errhandler;
opal_pmix_base_module_deregister_fn_t deregister_errhandler;
} opal_pmix_base_module_t;
typedef struct {

View File

@ -73,9 +73,7 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
NULL,
s1_spawn,
s1_job_connect,
s1_job_disconnect,
NULL,
NULL
s1_job_disconnect
};
// usage accounting

View File

@ -81,9 +81,7 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
NULL,
s2_spawn,
s2_job_connect,
s2_job_disconnect,
NULL,
NULL
s2_job_disconnect
};
// usage accounting

View File

@ -9,6 +9,7 @@
* reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,7 +27,7 @@
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/errhandler/opal_errhandler.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
@ -43,17 +44,17 @@
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int init(void);
static int finalize(void);
static int abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs,
int error_code);
static int abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs,
int error_code);
/******************
* App module
******************/
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
orte_errmgr_base_module_t orte_errmgr_default_app_module = {
init,
finalize,
orte_errmgr_base_log,
@ -68,7 +69,7 @@ orte_errmgr_base_module_t orte_errmgr_default_app_module = {
};
static void proc_errors(int fd, short args, void *cbdata);
static void pmix_error(int error)
static void pmix_error(int error, opal_proc_t *proc, void *cbdata)
{
/* push it into our event base */
ORTE_ACTIVATE_PROC_STATE(ORTE_PROC_MY_NAME, ORTE_PROC_STATE_COMM_FAILED);
@ -77,25 +78,20 @@ static void pmix_error(int error)
/************************
* API Definitions
************************/
static int init(void)
{
static int init(void)
{
/* setup state machine to trap proc errors */
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
/* register an errhandler with the PMIx framework so
* we can know of loss of connection to the server */
if (NULL != opal_pmix.register_errhandler) {
opal_pmix.register_errhandler(pmix_error);
}
/* register an errhandler */
opal_register_errhandler(pmix_error, NULL);
return ORTE_SUCCESS;
}
static int finalize(void)
{
if (NULL != opal_pmix.deregister_errhandler) {
opal_pmix.deregister_errhandler();
}
opal_deregister_errhandler();
return ORTE_SUCCESS;
}
@ -108,15 +104,15 @@ static void proc_errors(int fd, short args, void *cbdata)
opal_pointer_array_t errors;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
"%s errmgr:default_app: proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&caddy->name),
orte_proc_state_to_str(caddy->proc_state)));
"%s errmgr:default_app: proc %s state %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&caddy->name),
orte_proc_state_to_str(caddy->proc_state)));
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
if (orte_finalizing) {
OBJ_RELEASE(caddy);
return;
}
@ -141,8 +137,8 @@ static void proc_errors(int fd, short args, void *cbdata)
/* flag that we must abnormally terminate as far as the
* RTE is concerned
*/
orte_abnormal_term_ordered = true;
} else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
orte_abnormal_term_ordered = true;
} else if (ORTE_PROC_STATE_LIFELINE_LOST == caddy->proc_state) {
/* we need to die, so mark us so */
orte_abnormal_term_ordered = true;
}