1
1

Tools require their own errmgr and state components so they can handle any errors that occur in, for example, communication.

Refs trac:3992

This commit was SVN r29972.

The following Trac tickets were found above:
  Ticket 3992 --> https://svn.open-mpi.org/trac/ompi/ticket/3992
Этот коммит содержится в:
Ralph Castain 2013-12-19 01:49:33 +00:00
родитель 3aefca32b0
Коммит bf5e314f76
9 изменённых файлов: 563 добавлений и 0 удалений

34
orte/mca/errmgr/default_tool/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2013 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the default_tool errmgr component; the same list
# feeds both the DSO and the static library targets below.
sources = \
errmgr_default_tool.h \
errmgr_default_tool_component.c \
errmgr_default_tool.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the MCA_BUILD_orte_errmgr_default_tool_DSO conditional.
if MCA_BUILD_orte_errmgr_default_tool_DSO
component_noinst =
component_install = mca_errmgr_default_tool.la
else
component_noinst = libmca_errmgr_default_tool.la
component_install =
endif
# DSO build: the component library is installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_default_tool_la_SOURCES = $(sources)
mca_errmgr_default_tool_la_LDFLAGS = -module -avoid-version
# Static build: the library is built but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_default_tool_la_SOURCES =$(sources)
libmca_errmgr_default_tool_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -0,0 +1,125 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/state/state.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_default_tool.h"
/*
 * Module functions implemented locally in this file.
 */
static int init(void);
static int finalize(void);
static int abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs,
int error_code);
/******************
 * TOOL module
 *
 * Function table for the default_tool errmgr module.  Entries are
 * positional: the local init/finalize/abort_peers implementations are
 * combined with the orte_errmgr_base_* implementations for everything
 * else.
 ******************/
orte_errmgr_base_module_t orte_errmgr_default_tool_module = {
/* local: registers proc_errors for ORTE_PROC_STATE_ERROR */
init,
/* local: nothing to clean up */
finalize,
/* base implementations for logging and aborting */
orte_errmgr_base_log,
orte_errmgr_base_abort,
/* local: aborts this process via orte_errmgr_base_abort */
abort_peers,
/* three entry points this component does not provide; the slot names
 * are not visible here -- NOTE(review): confirm them against the
 * orte_errmgr_base_module_t definition in errmgr.h */
NULL,
NULL,
NULL,
/* base implementations for the registration/callback entry points */
orte_errmgr_base_register_migration_warning,
orte_errmgr_base_register_error_callback,
orte_errmgr_base_execute_error_callbacks
};
/* State-machine callback installed by init() for process error states. */
static void proc_errors(int fd, short args, void *cbdata);
/************************
* API Definitions
************************/
/**
 * Initialize the default_tool errmgr module.
 *
 * Registers proc_errors() with the state machine as the handler for
 * ORTE_PROC_STATE_ERROR at ORTE_ERROR_PRI priority.
 *
 * @return The status reported by orte_state.add_proc_state(), so a failed
 *         registration reaches the caller instead of being silently
 *         replaced by success.  (The status is expected to be ORTE_SUCCESS
 *         when the handler was installed, matching how add_job_state is
 *         checked in the tool state component.)
 */
static int init(void)
{
    /* setup state machine to trap proc errors; hand back the registration
     * status rather than unconditionally claiming success */
    return orte_state.add_proc_state(ORTE_PROC_STATE_ERROR, proc_errors, ORTE_ERROR_PRI);
}
/**
 * Finalize the default_tool errmgr module.
 *
 * The module holds no resources of its own, so this only reports success.
 *
 * @return ORTE_SUCCESS always.
 */
static int finalize(void)
{
    /* nothing to release */
    return ORTE_SUCCESS;
}
/**
 * State-machine callback for process error states.
 *
 * Emits a verbose trace naming the affected process and its state, then
 * aborts with ORTE_ERROR_DEFAULT_EXIT_CODE unless ORTE is already
 * finalizing.  The caddy passed in is released on every path that
 * returns.
 *
 * @param fd      unused
 * @param args    unused
 * @param cbdata  the orte_state_caddy_t describing the process and state
 */
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state_info = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_tool: proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&state_info->name),
                         orte_proc_state_to_str(state_info->proc_state)));

    /* if orte is already trying to shut down, just let it; otherwise
     * this error is fatal for the tool */
    if (!orte_finalizing) {
        orte_errmgr_base_abort(ORTE_ERROR_DEFAULT_EXIT_CODE, NULL);
    }

    OBJ_RELEASE(state_info);
}
/**
 * Abort request for a set of peers.
 *
 * This component does not act on the peer list: it simply aborts the
 * calling process with the given error code.  When the errmgr framework
 * output has a positive verbosity, the abort carries a message naming
 * this process; otherwise no message is supplied.
 *
 * @param procs       peer list (not used)
 * @param num_procs   number of entries in procs (not used)
 * @param error_code  code handed to orte_errmgr_base_abort
 *
 * @return ORTE_SUCCESS after the abort call.
 */
static int abort_peers(orte_process_name_t *procs,
                       orte_std_cntr_t num_procs,
                       int error_code)
{
    /* just abort */
    if (opal_output_get_verbosity(orte_errmgr_base_framework.framework_output) <= 0) {
        orte_errmgr_base_abort(error_code, NULL);
    } else {
        orte_errmgr_base_abort(error_code, "%s called abort_peers",
                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2013 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Public declarations for the default_tool errmgr component: the
 * component descriptor and the module function table.
 */
#ifndef MCA_ERRMGR_default_tool_EXPORT_H
#define MCA_ERRMGR_default_tool_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
 * Local Component structures
 */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_default_tool_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_default_tool_module;
END_C_DECLS
#endif /* MCA_ERRMGR_default_tool_EXPORT_H */

Просмотреть файл

@ -0,0 +1,98 @@
/*
* Copyright (c) 2013 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_default_tool.h"
/*
 * Public, human-readable version banner for this component; the text is
 * completed at compile time by concatenating ORTE_VERSION.
 */
const char *orte_errmgr_default_tool_component_version_string =
"ORTE ERRMGR default_tool MCA component version " ORTE_VERSION;
/*
 * Component-level functions defined later in this file and referenced
 * from the component descriptor.
 */
static int errmgr_default_tool_register(void);
static int errmgr_default_tool_open(void);
static int errmgr_default_tool_close(void);
static int errmgr_default_tool_component_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor for the default_tool errmgr component.  It is
 * filled in positionally with the framework version, the component name
 * and version numbers, and the component-level function pointers.
 */
orte_errmgr_base_component_t mca_errmgr_default_tool_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component
 */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"default_tool",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close, query and register functions */
errmgr_default_tool_open,
errmgr_default_tool_close,
errmgr_default_tool_component_query,
errmgr_default_tool_register
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/* Selection priority reported by the query function; its default is
 * assigned in errmgr_default_tool_register(). */
static int my_priority;

/**
 * Register this component's MCA variables.
 *
 * Sets the default priority to 1000 and registers it as the integer
 * variable "priority", with my_priority as its storage.  The result of
 * the registration call is deliberately discarded.
 *
 * @return ORTE_SUCCESS always.
 */
static int errmgr_default_tool_register(void)
{
    my_priority = 1000;

    (void) mca_base_component_var_register(&mca_errmgr_default_tool_component.base_version,
                                           "priority",
                                           "Priority of the default_tool errmgr component",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY, &my_priority);

    return ORTE_SUCCESS;
}
/**
 * Component open hook; there is nothing to set up.
 *
 * @return ORTE_SUCCESS always.
 */
static int errmgr_default_tool_open(void)
{
    /* no component-level state to initialize */
    return ORTE_SUCCESS;
}
/**
 * Component close hook; there is nothing to tear down.
 *
 * @return ORTE_SUCCESS always.
 */
static int errmgr_default_tool_close(void)
{
    /* no component-level state to release */
    return ORTE_SUCCESS;
}
/**
 * Offer this component's module for selection.
 *
 * @param module    [out] set to the default_tool module when the process
 *                  is a tool, otherwise NULL
 * @param priority  [out] set to my_priority when the process is a tool,
 *                  otherwise -1
 *
 * @return ORTE_SUCCESS when ORTE_PROC_IS_TOOL holds, ORTE_ERROR otherwise.
 */
static int errmgr_default_tool_component_query(mca_base_module_t **module, int *priority)
{
    /* only tools may use this component */
    if (!(ORTE_PROC_IS_TOOL)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high as we are the default for tools */
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_errmgr_default_tool_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -40,6 +40,7 @@
#include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/iof/base/base.h"
#include "orte/mca/state/base/base.h"
#if OPAL_ENABLE_FT_CR == 1
#include "orte/mca/snapc/base/base.h"
#endif
@ -67,6 +68,18 @@ int orte_ess_base_tool_setup(void)
orte_process_info.proc_type |= ORTE_PROC_NON_MPI;
}
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_state_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_state_base_select";
goto error;
}
/* Setup the communication infrastructure */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);

34
orte/mca/state/tool/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,34 @@
#
# Copyright (c) 2013 Intel, Inc. All rights reserved
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Files that make up the tool state component; the same list feeds both
# the DSO and the static library targets below.
sources = \
state_tool.h \
state_tool_component.c \
state_tool.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
# Exactly one of component_install / component_noinst is non-empty,
# depending on the MCA_BUILD_orte_state_tool_DSO conditional.
if MCA_BUILD_orte_state_tool_DSO
component_noinst =
component_install = mca_state_tool.la
else
component_noinst = libmca_state_tool.la
component_install =
endif
# DSO build: the component library is installed into $(pkglibdir).
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_state_tool_la_SOURCES = $(sources)
mca_state_tool_la_LDFLAGS = -module -avoid-version
# Static build: the library is built but not installed (noinst).
noinst_LTLIBRARIES = $(component_noinst)
libmca_state_tool_la_SOURCES =$(sources)
libmca_state_tool_la_LDFLAGS = -module -avoid-version

102
orte/mca/state/tool/state_tool.c Обычный файл
Просмотреть файл

@ -0,0 +1,102 @@
/*
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "orte/runtime/orte_quit.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/state_private.h"
#include "state_tool.h"
/*
 * Module functions implemented locally in this file.
 */
static int init(void);
static int finalize(void);
/******************
 * TOOL module - uses the base functions for every state-machine
 * operation; only init and finalize are local.  init constructs the
 * job and proc state lists and registers a single job-state handler
 * (force_quit for ORTE_JOB_STATE_FORCED_EXIT).
 * NOTE(review): the original comment said the job state machine is
 * unused by tools at this time; the FORCED_EXIT handler is the one
 * job state registered here.
 ******************/
orte_state_base_module_t orte_state_tool_module = {
/* local setup / teardown of the state lists */
init,
finalize,
/* job state machine: base implementations */
orte_state_base_activate_job_state,
orte_state_base_add_job_state,
orte_state_base_set_job_state_callback,
orte_state_base_set_job_state_priority,
orte_state_base_remove_job_state,
/* proc state machine: base implementations */
orte_state_base_activate_proc_state,
orte_state_base_add_proc_state,
orte_state_base_set_proc_state_callback,
orte_state_base_set_proc_state_priority,
orte_state_base_remove_proc_state
};
/**
 * State callback that terminates the tool immediately.
 *
 * Exits the process with orte_exit_status.  None of the callback
 * arguments are examined.
 *
 * @param fd      unused
 * @param args    unused
 * @param cbdata  unused
 */
static void force_quit(int fd, short args, void *cbdata)
{
    /* Exit directly: attempting to finalize from here could throw us
     * into an infinite loop on errors. */
    exit(orte_exit_status);
}
/************************
* API Definitions
************************/
/**
 * Initialize the tool state module.
 *
 * Constructs the global job and proc state lists, then registers
 * force_quit() as the handler for ORTE_JOB_STATE_FORCED_EXIT at
 * ORTE_ERROR_PRI priority.
 *
 * @return ORTE_SUCCESS, or the status from add_job_state (after logging
 *         it) when the registration fails.
 */
static int init(void)
{
    int status;

    OBJ_CONSTRUCT(&orte_job_states, opal_list_t);
    OBJ_CONSTRUCT(&orte_proc_states, opal_list_t);

    /* add a default error response */
    status = orte_state.add_job_state(ORTE_JOB_STATE_FORCED_EXIT,
                                      force_quit, ORTE_ERROR_PRI);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
/**
 * Finalize the tool state module.
 *
 * Removes and releases every entry of the proc state list, destructs
 * the list, then does the same for the job state list.
 *
 * @return ORTE_SUCCESS always.
 */
static int finalize(void)
{
    opal_list_item_t *entry;

    /* cleanup the state machines: proc states first */
    for (entry = opal_list_remove_first(&orte_proc_states);
         NULL != entry;
         entry = opal_list_remove_first(&orte_proc_states)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_proc_states);

    /* ... then job states */
    for (entry = opal_list_remove_first(&orte_job_states);
         NULL != entry;
         entry = opal_list_remove_first(&orte_job_states)) {
        OBJ_RELEASE(entry);
    }
    OBJ_DESTRUCT(&orte_job_states);

    return ORTE_SUCCESS;
}

37
orte/mca/state/tool/state_tool.h Обычный файл
Просмотреть файл

@ -0,0 +1,37 @@
/*
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
 * @file
 *
 * Public declarations for the tool state component: the component
 * descriptor and the module function table.
 */
#ifndef MCA_STATE_TOOL_EXPORT_H
#define MCA_STATE_TOOL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/state/state.h"
BEGIN_C_DECLS
/*
 * Local Component structures
 */
ORTE_MODULE_DECLSPEC extern orte_state_base_component_t mca_state_tool_component;
ORTE_DECLSPEC extern orte_state_base_module_t orte_state_tool_module;
END_C_DECLS
#endif /* MCA_STATE_TOOL_EXPORT_H */

85
orte/mca/state/tool/state_tool_component.c Обычный файл
Просмотреть файл

@ -0,0 +1,85 @@
/*
* Copyright (c) 2011 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013 Intel, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/state/state.h"
#include "orte/mca/state/base/base.h"
#include "state_tool.h"
/*
 * Public, human-readable version banner for this component; the text is
 * completed at compile time by concatenating ORTE_VERSION.
 */
const char *orte_state_tool_component_version_string =
"ORTE STATE tool MCA component version " ORTE_VERSION;
/*
 * Component-level functions defined later in this file and referenced
 * from the component descriptor.
 */
static int state_tool_open(void);
static int state_tool_close(void);
static int state_tool_component_query(mca_base_module_t **module, int *priority);
/*
 * Component descriptor for the tool state component.  It is filled in
 * positionally with the framework version, the component name and
 * version numbers, and the component-level function pointers.
 */
orte_state_base_component_t mca_state_tool_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component
 */
{
ORTE_STATE_BASE_VERSION_1_0_0,
/* Component name and version */
"tool",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
state_tool_open,
state_tool_close,
state_tool_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
};
/* Selection priority reported by state_tool_component_query() when the
 * process is a tool. */
static int my_priority=1000;
/**
 * Component open hook; there is nothing to set up.
 *
 * @return ORTE_SUCCESS always.
 */
static int state_tool_open(void)
{
    /* no component-level state to initialize */
    return ORTE_SUCCESS;
}
/**
 * Component close hook; there is nothing to tear down.
 *
 * @return ORTE_SUCCESS always.
 */
static int state_tool_close(void)
{
    /* no component-level state to release */
    return ORTE_SUCCESS;
}
/**
 * Offer this component's module for selection.
 *
 * @param module    [out] set to the tool state module when the process is
 *                  a tool, otherwise NULL
 * @param priority  [out] set to my_priority when the process is a tool,
 *                  otherwise -1
 *
 * @return ORTE_SUCCESS when ORTE_PROC_IS_TOOL holds, ORTE_ERROR otherwise.
 */
static int state_tool_component_query(mca_base_module_t **module, int *priority)
{
    /* only tools may use this component */
    if (!(ORTE_PROC_IS_TOOL)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* set our priority high as we are the default for tools */
    *priority = my_priority;
    *module = (mca_base_module_t *)&orte_state_tool_module;
    return ORTE_SUCCESS;
}