Tools require their own errmgr and state components so they can handle any errors that occur in, for example, communication.
Refs trac:3992 This commit was SVN r29972. The following Trac tickets were found above: Ticket 3992 --> https://svn.open-mpi.org/trac/ompi/ticket/3992
Этот коммит содержится в:
родитель
3aefca32b0
Коммит
bf5e314f76
34
orte/mca/errmgr/default_tool/Makefile.am
Обычный файл
34
orte/mca/errmgr/default_tool/Makefile.am
Обычный файл
@ -0,0 +1,34 @@
|
||||
#
|
||||
# Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Files that make up the default_tool errmgr component.
sources = \
        errmgr_default_tool.h \
        errmgr_default_tool_component.c \
        errmgr_default_tool.c

# The component library is produced in this directory under one of two
# names, depending on how the component is being built:
#   mca_<type>_<name>.la     for DSO builds
#   libmca_<type>_<name>.la  for static builds

if MCA_BUILD_orte_errmgr_default_tool_DSO
component_noinst =
component_install = mca_errmgr_default_tool.la
else
component_noinst = libmca_errmgr_default_tool.la
component_install =
endif

# DSO flavour: listed under the mcacomponent prefix, whose directory
# is $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_default_tool_la_SOURCES = $(sources)
mca_errmgr_default_tool_la_LDFLAGS = -module -avoid-version

# Static flavour: listed under noinst, i.e. not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_default_tool_la_SOURCES = $(sources)
libmca_errmgr_default_tool_la_LDFLAGS = -module -avoid-version
|
125
orte/mca/errmgr/default_tool/errmgr_default_tool.c
Обычный файл
125
orte/mca/errmgr/default_tool/errmgr_default_tool.c
Обычный файл
@ -0,0 +1,125 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_default_tool.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code);
|
||||
|
||||
/******************
|
||||
* default_tool module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
abort_peers,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
orte_errmgr_base_register_error_callback,
|
||||
orte_errmgr_base_execute_error_callbacks
|
||||
};
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata);
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
/* setup state machine to trap proc errors */
|
||||
orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static void proc_errors(int fd, short args, void *cbdata)
|
||||
{
|
||||
orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
|
||||
"%s errmgr:default_tool: proc %s state %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&caddy->name),
|
||||
orte_proc_state_to_str(caddy->proc_state)));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
OBJ_RELEASE(caddy);
|
||||
return;
|
||||
}
|
||||
|
||||
orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
|
||||
|
||||
OBJ_RELEASE(caddy);
|
||||
}
|
||||
|
||||
static int abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs,
|
||||
int error_code)
|
||||
{
|
||||
/* just abort */
|
||||
if (0 < opal_output_get_verbosity(orte_errmgr_base_framework.framework_output)) {
|
||||
orte_errmgr_base_abort(error_code, "%s called abort_peers",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
} else {
|
||||
orte_errmgr_base_abort(error_code, NULL);
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
35
orte/mca/errmgr/default_tool/errmgr_default_tool.h
Обычный файл
35
orte/mca/errmgr/default_tool/errmgr_default_tool.h
Обычный файл
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_default_tool_EXPORT_H
|
||||
#define MCA_ERRMGR_default_tool_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_default_tool_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_tool_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_default_tool_EXPORT_H */
|
98
orte/mca/errmgr/default_tool/errmgr_default_tool_component.c
Обычный файл
98
orte/mca/errmgr/default_tool/errmgr_default_tool_component.c
Обычный файл
@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_default_tool.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_default_tool_component_version_string =
|
||||
"ORTE ERRMGR default_tool MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_default_tool_register(void);
|
||||
static int errmgr_default_tool_open(void);
|
||||
static int errmgr_default_tool_close(void);
|
||||
static int errmgr_default_tool_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_default_tool_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"default_tool",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_default_tool_open,
|
||||
errmgr_default_tool_close,
|
||||
errmgr_default_tool_component_query,
|
||||
errmgr_default_tool_register
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static int my_priority;
|
||||
|
||||
static int errmgr_default_tool_register(void)
|
||||
{
|
||||
mca_base_component_t *c = &mca_errmgr_default_tool_component.base_version;
|
||||
|
||||
my_priority = 1000;
|
||||
(void) mca_base_component_var_register(c, "priority",
|
||||
"Priority of the default_tool errmgr component",
|
||||
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9,
|
||||
MCA_BASE_VAR_SCOPE_READONLY, &my_priority);
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_default_tool_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_default_tool_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query hook: offer the default_tool module to tool processes only.
 *
 * @param[out] module    set to &orte_errmgr_default_tool_module for a tool,
 *                       NULL otherwise
 * @param[out] priority  set to my_priority for a tool, -1 otherwise
 * @return ORTE_SUCCESS for a tool process, ORTE_ERROR otherwise.
 */
static int errmgr_default_tool_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not a tool must not select this component */
    if (!(ORTE_PROC_IS_TOOL)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high as we are the default for tools */
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_default_tool_module;
    return ORTE_SUCCESS;
}
|
@ -40,6 +40,7 @@
|
||||
#include "orte/mca/routed/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/iof/base/base.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#if OPAL_ENABLE_FT_CR == 1
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#endif
|
||||
@ -67,6 +68,18 @@ int orte_ess_base_tool_setup(void)
|
||||
orte_process_info.proc_type |= ORTE_PROC_NON_MPI;
|
||||
}
|
||||
|
||||
/* open and setup the state machine */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_state_base_open";
|
||||
goto error;
|
||||
}
|
||||
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
error = "orte_state_base_select";
|
||||
goto error;
|
||||
}
|
||||
|
||||
/* Setup the communication infrastructure */
|
||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
|
34
orte/mca/state/tool/Makefile.am
Обычный файл
34
orte/mca/state/tool/Makefile.am
Обычный файл
@ -0,0 +1,34 @@
|
||||
#
|
||||
# Copyright (c) 2013 Intel, Inc. All rights reserved
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Files that make up the tool state component.
sources = \
        state_tool.h \
        state_tool_component.c \
        state_tool.c

# The component library is produced in this directory under one of two
# names, depending on how the component is being built:
#   mca_<type>_<name>.la     for DSO builds
#   libmca_<type>_<name>.la  for static builds

if MCA_BUILD_orte_state_tool_DSO
component_noinst =
component_install = mca_state_tool.la
else
component_noinst = libmca_state_tool.la
component_install =
endif

# DSO flavour: listed under the mcacomponent prefix, whose directory
# is $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_tool_la_SOURCES = $(sources)
mca_state_tool_la_LDFLAGS = -module -avoid-version

# Static flavour: listed under noinst, i.e. not installed.
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_tool_la_SOURCES = $(sources)
libmca_state_tool_la_LDFLAGS = -module -avoid-version
|
102
orte/mca/state/tool/state_tool.c
Обычный файл
102
orte/mca/state/tool/state_tool.c
Обычный файл
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/runtime/orte_quit.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/state_private.h"
|
||||
#include "state_tool.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
/******************
|
||||
* TOOL module - just uses base functions after
|
||||
* initializing the proc state machine. Job state
|
||||
* machine is unused by tools at this
|
||||
* time.
|
||||
******************/
|
||||
orte_state_base_module_t orte_state_tool_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_state_base_activate_job_state,
|
||||
orte_state_base_add_job_state,
|
||||
orte_state_base_set_job_state_callback,
|
||||
orte_state_base_set_job_state_priority,
|
||||
orte_state_base_remove_job_state,
|
||||
orte_state_base_activate_proc_state,
|
||||
orte_state_base_add_proc_state,
|
||||
orte_state_base_set_proc_state_callback,
|
||||
orte_state_base_set_proc_state_priority,
|
||||
orte_state_base_remove_proc_state
|
||||
};
|
||||
|
||||
static void force_quit(int fd, short args, void *cbdata)
|
||||
{
|
||||
/* dont attempt to finalize as it could throw
|
||||
* us into an infinite loop on errors
|
||||
*/
|
||||
exit(orte_exit_status);
|
||||
}
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
int rc;
|
||||
|
||||
OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
|
||||
OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);
|
||||
|
||||
/* add a default error response */
|
||||
if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
|
||||
force_quit, ORTE_ERROR_PRI))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
opal_list_item_t *item;
|
||||
|
||||
/* cleanup the state machines */
|
||||
while (NULL != (item = opal_list_remove_first(&orte_proc_states))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_proc_states);
|
||||
|
||||
while (NULL != (item = opal_list_remove_first(&orte_job_states))) {
|
||||
OBJ_RELEASE(item);
|
||||
}
|
||||
OBJ_DESTRUCT(&orte_job_states);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
37
orte/mca/state/tool/state_tool.h
Обычный файл
37
orte/mca/state/tool/state_tool.h
Обычный файл
@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_STATE_TOOL_EXPORT_H
|
||||
#define MCA_STATE_TOOL_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_tool_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_tool_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_STATE_TOOL_EXPORT_H */
|
85
orte/mca/state/tool/state_tool_component.c
Обычный файл
85
orte/mca/state/tool/state_tool_component.c
Обычный файл
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright (c) 2011 Los Alamos National Security, LLC.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2013 Intel, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/state/state.h"
|
||||
#include "orte/mca/state/base/base.h"
|
||||
#include "state_tool.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_state_tool_component_version_string =
|
||||
"ORTE STATE tool MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int state_tool_open(void);
|
||||
static int state_tool_close(void);
|
||||
static int state_tool_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_state_base_component_t mca_state_tool_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component
|
||||
*/
|
||||
{
|
||||
ORTE_STATE_BASE_VERSION_1_0_0,
|
||||
/* Component name and version */
|
||||
"tool",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
state_tool_open,
|
||||
state_tool_close,
|
||||
state_tool_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
};
|
||||
|
||||
static int my_priority=1000;
|
||||
|
||||
static int state_tool_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int state_tool_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * Component query hook: offer the tool state module to tool processes only.
 *
 * @param[out] module    set to &orte_state_tool_module for a tool,
 *                       NULL otherwise
 * @param[out] priority  set to my_priority for a tool, -1 otherwise
 * @return ORTE_SUCCESS for a tool process, ORTE_ERROR otherwise.
 */
static int state_tool_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not a tool must not select this component */
    if (!(ORTE_PROC_IS_TOOL)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high as we are the default for tools */
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_state_tool_module;
    return ORTE_SUCCESS;
}
|
Загрузка…
x
Ссылка в новой задаче
Block a user