1
1

Move the resilient orte errmgr code into a separate errmgr for now while it's

still unstable. Reverted errmgr modules back to the original errmgr (with the
updates since the resilient code was brought into the trunk).

This commit was SVN r24958.
Этот коммит содержится в:
Wesley Bland 2011-07-28 21:24:34 +00:00
родитель 6c879f87fb
Коммит 5fde3e0e00
28 изменённых файлов: 7288 добавлений и 1102 удалений

Просмотреть файл

@ -371,7 +371,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
/* Register errhandler callback with orte errmgr */
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
if (NULL != orte_errmgr.set_fault_callback) {
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
}
/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow

Просмотреть файл

@ -1,13 +1,9 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -26,15 +22,11 @@
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -56,22 +48,9 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* HNP module
******************/
@ -86,11 +65,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
@ -113,8 +92,6 @@ static int update_state(orte_jobid_t job,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",
@ -132,9 +109,9 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
/* the name is ours only when the jobid matches the jobid AND the vpid matches the vpid */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
return ORTE_SUCCESS;
}
@ -148,95 +125,6 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
static int post_startup(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE,
ORTE_RML_PERSISTENT,
epoch_change_recv,
NULL);
return ret;
}
static int pre_shutdown(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE);
return ret;
}
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
void epoch_change(int fd,
short event,
void *data) {
orte_message_event_t *mev = (orte_message_event_t *) data;
opal_buffer_t *buffer = mev->buffer;
orte_process_name_t *proc;
int n = 1, ret, num_dead, i;
opal_pointer_array_t *procs;
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Received epoch change notification",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
procs = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
for (i = 0; i < num_dead; i++) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc[i].epoch++;
orte_util_set_epoch(&proc[i], proc[i].epoch);
opal_pointer_array_add(procs, &proc[i]);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Epoch for %s updated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc[i])));
}
if (NULL != fault_cbfunc && 0 < num_dead) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
int ret, exit_status = ORTE_SUCCESS;
@ -278,7 +166,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
goto cleanup;
}
cleanup:
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;

12
orte/mca/errmgr/appresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

36
orte/mca/errmgr/appresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
errmgr_appresil.h \
errmgr_appresil_component.c \
errmgr_appresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_appresil_DSO
component_noinst =
component_install = mca_errmgr_appresil.la
else
component_noinst = libmca_errmgr_appresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_appresil_la_SOURCES = $(sources)
mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_appresil_la_SOURCES =$(sources)
libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version

285
orte/mca/errmgr/appresil/errmgr_appresil.c Обычный файл
Просмотреть файл

@ -0,0 +1,285 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_appresil.h"
/*
* Module functions: Global
*/
/* Entry points referenced by the orte_errmgr_appresil_module initializer. */
static int init(void);
static int finalize(void);
/* State-change handler; reacts to ORTE_PROC_STATE_COMM_FAILED reports. */
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
/* Packs the given process names and sends an abort request to the HNP. */
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
/* Post / cancel the ORTE_RML_TAG_EPOCH_CHANGE receive. */
static int post_startup(void);
static int pre_shutdown(void);
/*
* Epoch-change message path: epoch_change_recv() is the RML receive callback,
* epoch_change() is the handler it passes to ORTE_MESSAGE_EVENT.
* NOTE(review): both have external linkage although they are only referenced
* from this file in the code visible here - confirm whether they should be static.
*/
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* appresil module
*
* Offered by the appresil component only when ORTE_PROC_IS_APP is true
* (see errmgr_appresil_component_query), i.e. this is not an HNP module.
* The labels on the last five slots follow the labelled initializer of the
* same struct type in errmgr_app.c.
******************/
orte_errmgr_base_module_t orte_errmgr_appresil_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_appresil_abort_peers, /* sends the abort request to the HNP */
update_state, /* handles ORTE_PROC_STATE_COMM_FAILED */
/* three slots left unset; their names are not visible in this file - TODO confirm against errmgr.h */
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
post_startup, /* post_startup: posts the epoch-change receive */
pre_shutdown, /* pre_shutdown: cancels the epoch-change receive */
NULL, /* mark_processes_as_dead */
orte_errmgr_base_set_fault_callback, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
* API Definitions
************************/
/* Module init hook: performs no setup and always returns ORTE_SUCCESS. */
static int init(void)
{
return ORTE_SUCCESS;
}
/* Module finalize hook: performs no teardown and always returns ORTE_SUCCESS. */
static int finalize(void)
{
return ORTE_SUCCESS;
}
/*
 * React to a reported job/process state change.
 *
 * Only ORTE_PROC_STATE_COMM_FAILED is acted upon, and only while ORTE is not
 * finalizing: the route to the failed peer is deleted and the routed framework
 * is asked whether the lost route matters.
 *
 * Returns ORTE_ERR_UNRECOVERABLE when orte_routed.route_lost() does not report
 * success, ORTE_SUCCESS in every other case.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    orte_ns_cmp_bitmask_t mask = ORTE_NS_CMP_ALL;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* let a shutdown proceed undisturbed, and ignore everything but a lost connection */
    if (orte_finalizing || ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* a failure reported against our own name is ignored */
    if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }

    /* drop the route to the failed peer */
    orte_routed.delete_route(proc);

    /* check whether this was a lifeline; a non-success answer is unrecoverable */
    return (ORTE_SUCCESS == orte_routed.route_lost(proc)) ? ORTE_SUCCESS
                                                          : ORTE_ERR_UNRECOVERABLE;
}
/*
 * Post a persistent, non-blocking receive on ORTE_RML_TAG_EPOCH_CHANGE from
 * our daemon, with epoch_change_recv() as the callback.
 * Returns whatever orte_rml.recv_buffer_nb() returns.
 */
static int post_startup(void)
{
    return orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
                                   ORTE_RML_TAG_EPOCH_CHANGE,
                                   ORTE_RML_PERSISTENT,
                                   epoch_change_recv,
                                   NULL);
}
/*
 * Cancel the ORTE_RML_TAG_EPOCH_CHANGE receive posted by post_startup().
 * Returns whatever orte_rml.recv_cancel() returns.
 */
static int pre_shutdown(void)
{
    return orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
                                ORTE_RML_TAG_EPOCH_CHANGE);
}
/*
* RML receive callback registered by post_startup() for
* ORTE_RML_TAG_EPOCH_CHANGE. It does no processing of its own: sender, buffer
* and tag are handed to ORTE_MESSAGE_EVENT with epoch_change() as the handler.
* status and cbdata are not used.
*/
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
/*
 * Handler for an epoch-change notification (scheduled by epoch_change_recv()).
 *
 * The buffer carries a count of dead processes followed by that many process
 * names. For every name the epoch is incremented and recorded through
 * orte_util_set_epoch(), and the name is added to a pointer array. If a fault
 * callback is registered and at least one process was reported, the callback
 * is invoked with that array.
 *
 * fd and event are unused; data is the orte_message_event_t whose buffer holds
 * the message.
 *
 * Every exit taken after the pointer array has been created goes through the
 * cleanup label, so the name array and the pointer array are also released
 * when unpacking or allocation fails. The pointer array only references
 * entries of the name array, so the callback must not keep those pointers.
 *
 * NOTE(review): mev itself is not released anywhere in this function -
 * confirm whether the handler is expected to OBJ_RELEASE it.
 */
void epoch_change(int fd,
                  short event,
                  void *data) {
    orte_message_event_t *mev = (orte_message_event_t *) data;
    opal_buffer_t *buffer = mev->buffer;
    orte_process_name_t *proc = NULL;
    int n = 1, ret, num_dead, i;
    opal_pointer_array_t *procs;

    /* nothing to update once termination is under way */
    if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
        return;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil Received epoch change notification",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    procs = OBJ_NEW(opal_pointer_array_t);

    /* first field: number of dead processes */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* only allocate for a positive count; free(NULL) in cleanup is a no-op */
    if (0 < num_dead) {
        proc = malloc(sizeof(*proc) * (size_t) num_dead);
        if (NULL == proc) {
            opal_output(0, "%s Error allocating memory.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }
    }

    for (i = 0; i < num_dead; i++) {
        /* the count is passed by pointer, so request exactly one name each time */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        /* bump the epoch of the dead process and record it */
        proc[i].epoch++;
        orte_util_set_epoch(&proc[i], proc[i].epoch);

        opal_pointer_array_add(procs, &proc[i]);

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Epoch for %s updated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc[i])));
    }

    if (NULL != fault_cbfunc && 0 < num_dead) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        (*fault_cbfunc)(procs);
    } else {
        /* reached when no callback is registered or no process was reported */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

cleanup:
    free(proc);
    OBJ_RELEASE(procs);
}
/*
 * Ask the HNP to abort a set of processes.
 *
 * Builds a buffer holding, in order, the ORTE_DAEMON_ABORT_PROCS_CALLED
 * command, the number of processes and each process name, then sends it to
 * ORTE_PROC_MY_HNP on ORTE_RML_TAG_DAEMON.
 *
 * procs      array of process names to abort
 * num_procs  number of entries in procs
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack/send that failed.
 * The buffer is destructed on every path.
 */
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int status = ORTE_SUCCESS;
    int rc;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* the daemon command comes first */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* then the number of names that follow */
    rc = opal_dss.pack(&msg, &(num_procs), 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

    /* then the names themselves, one at a time */
    for (idx = 0; idx < num_procs; ++idx) {
        rc = opal_dss.pack(&msg, &(procs[idx]), 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            status = rc;
            goto done;
        }
    }

    /* hand the request to the HNP; only a negative return is an error */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        status = rc;
        goto done;
    }

done:
    OBJ_DESTRUCT(&msg);
    return status;
}

35
orte/mca/errmgr/appresil/errmgr_appresil.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Public declarations of the "appresil" errmgr component: the component
* descriptor and the module structure it hands out.
*/
#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H
#define MCA_ERRMGR_APPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/* Component descriptor, defined in errmgr_appresil_component.c */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component;
/* Module function table, defined in errmgr_appresil.c */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */

Просмотреть файл

@ -0,0 +1,89 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_appresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_appresil_component_version_string =
"ORTE ERRMGR appresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_appresil_open(void);
static int errmgr_appresil_close(void);
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_appresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"appresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_appresil_open,
errmgr_appresil_close,
/* Query function: offers the module only to application processes */
errmgr_appresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
0
};
/* Component open hook: registers nothing and always returns ORTE_SUCCESS. */
static int errmgr_appresil_open(void)
{
return ORTE_SUCCESS;
}
/* Component close hook: has nothing to release and always returns ORTE_SUCCESS. */
static int errmgr_appresil_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: decide whether this component offers a module.
 *
 * module    out: set to the appresil module for application processes,
 *           NULL otherwise
 * priority  out: 0 for application processes, -1 otherwise
 *
 * Returns ORTE_SUCCESS when the module is offered, ORTE_ERROR when it is not.
 */
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not an application process gets no module from us */
    if (!(ORTE_PROC_IS_APP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* deliberately low priority so that other modules rank above this one */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_appresil_module;
    return ORTE_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,8 +1,5 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -60,6 +57,10 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
@ -80,11 +81,6 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
int orte_errmgr_hnp_global_post_startup(void);
int orte_errmgr_hnp_global_pre_shutdown(void);
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc);
/* HNP Versions */
int orte_errmgr_hnp_base_global_init(void);

Просмотреть файл

@ -1,10 +1,7 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -394,7 +391,6 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *node = NULL;
bool found = false;
int num_removed = 0, num_to_remove;
orte_ns_cmp_bitmask_t mask;
if( NULL == current_global_jobdata ) {
return ORTE_SUCCESS;
@ -414,8 +410,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
item = opal_list_get_next(item) ) {
wp_item = (errmgr_autor_wp_item_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
if( wp_item->name.vpid == proc->name.vpid &&
wp_item->name.jobid == proc->name.jobid ) {
found = true;
break;
}
@ -522,7 +518,6 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -617,7 +612,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
/*
* Record the dead daemon
*/
orte_errmgr_hnp_record_dead_process(proc);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
return;
}
@ -626,7 +621,6 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_MIN;
wp->state = 0;
}
@ -635,7 +629,6 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
wp->state = 0;
}

Просмотреть файл

@ -2,9 +2,6 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -750,7 +747,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -807,7 +803,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -855,7 +850,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}

12
orte/mca/errmgr/hnpresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

40
orte/mca/errmgr/hnpresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
sources = \
errmgr_hnpresil.h \
errmgr_hnpresil_component.c \
errmgr_hnpresil.c \
errmgr_hnpresil_autor.c \
errmgr_hnpresil_crmig.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_hnpresil_DSO
component_noinst =
component_install = mca_errmgr_hnpresil.la
else
component_noinst = libmca_errmgr_hnpresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_hnpresil_la_SOURCES = $(sources)
mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_hnpresil_la_SOURCES =$(sources)
libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version

2112
orte/mca/errmgr/hnpresil/errmgr_hnpresil.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

137
orte/mca/errmgr/hnpresil/errmgr_hnpresil.h Обычный файл
Просмотреть файл

@ -0,0 +1,137 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H
#define MCA_ERRMGR_HNPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/*
* Component descriptor of the hnpresil errmgr: the base errmgr component
* followed by component-wide state flags. The fields inside the
* OPAL_ENABLE_FT_CR section exist only in builds with that option enabled;
* the option fields among them are filled from MCA parameters in the
* component open function.
*/
struct orte_errmgr_hnpresil_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
/* NOTE(review): no user of this flag is visible in this chunk; presumably set by the module code - confirm */
bool ignore_current_update;
/* cleared to false by the component open function */
bool term_in_progress;
#if OPAL_ENABLE_FT_CR
/* State of the Recovery */
bool crmig_in_progress;
bool autor_in_progress;
/* CRMig Options */
bool crmig_enabled; /* MCA param "crmig_enable", default 0 */
bool crmig_timing_enabled; /* MCA param "crmig_timing", default 0 */
/* AutoR Options */
bool autor_enabled; /* MCA param "autor_enable", default 0 */
bool autor_timing_enabled; /* MCA param "autor_timing", default 0 */
int autor_recovery_delay; /* MCA param "autor_recovery_delay", seconds, default 1 */
bool autor_skip_oldnode; /* MCA param "autor_skip_oldnode", default 1 */
#endif
};
typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component;
int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority);
void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
***************************/
int orte_errmgr_hnpresil_global_module_init(void);
int orte_errmgr_hnpresil_global_module_finalize(void);
int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_global_ft_event(int state);
int orte_errmgr_hnpresil_global_post_startup(void);
int orte_errmgr_hnpresil_global_pre_shutdown(void);
int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc);
/* hnpresil Versions */
int orte_errmgr_hnpresil_base_global_init(void);
int orte_errmgr_hnpresil_base_global_finalize(void);
int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_base_global_ft_event(int state);
#if OPAL_ENABLE_FT_CR
/* CRMig Versions */
int orte_errmgr_hnpresil_crmig_global_module_init(void);
int orte_errmgr_hnpresil_crmig_global_module_finalize(void);
int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_crmig_global_ft_event(int state);
/* AutoR Versions */
int orte_errmgr_hnpresil_autor_global_module_init(void);
int orte_errmgr_hnpresil_autor_global_module_finalize(void);
int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_autor_global_ft_event(int state);
#endif
END_C_DECLS
#endif /* MCA_ERRMGR_HNPRESIL_EXPORT_H */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,201 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnpresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_hnpresil_component_version_string =
"ORTE ERRMGR hnpresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int orte_errmgr_hnpresil_open(void);
static int orte_errmgr_hnpresil_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
* Only the embedded base component (super) is initialized explicitly; the
* remaining members of this static object are zero-initialized.
*/
orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the hnpresil component
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnpresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_errmgr_hnpresil_open,
orte_errmgr_hnpresil_close,
orte_errmgr_hnpresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
0
}
};
/*
 * Component open hook of the hnpresil errmgr.
 *
 * Registers the "priority" and "verbose" MCA parameters, selects the output
 * channel (a private one when a component verbosity is set, otherwise the
 * errmgr base channel), and, in OPAL_ENABLE_FT_CR builds, registers and reads
 * the process-migration (crmig_*) and automatic-recovery (autor_*) options.
 * Finally the in-progress flags are cleared.
 *
 * Parameter descriptions and log prefixes name this component ("hnpresil") so
 * its output cannot be mistaken for that of the separate hnp component.
 *
 * Always returns ORTE_SUCCESS.
 */
static int orte_errmgr_hnpresil_open(void)
{
    int val;

    /*
     * Standard per-component parameters; the current field values act as
     * defaults and receive the resulting values.
     */
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR hnpresil component",
                           false, false,
                           mca_errmgr_hnpresil_component.super.priority,
                           &mca_errmgr_hnpresil_component.super.priority);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR hnpresil component",
                           false, false,
                           mca_errmgr_hnpresil_component.super.verbose,
                           &mca_errmgr_hnpresil_component.super.verbose);

    /* If there is a custom verbose level for this component then use it,
     * otherwise take our parent's level and output channel
     */
    if ( 0 != mca_errmgr_hnpresil_component.super.verbose) {
        mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle,
                                  mca_errmgr_hnpresil_component.super.verbose);
    } else {
        mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output;
    }

#if OPAL_ENABLE_FT_CR
    /****************************
     * CRMig (C/R Process Migration) MCA Options
     ****************************/
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "crmig_timing",
                           "Enable Process Migration timer",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "crmig_enable",
                           "Enable Process Migration (Default: 0/off)",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val);

    /****************************
     * AutoR (Automatic Recovery) MCA Options
     ****************************/
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_timing",
                           "Enable Automatic Recovery timer",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_enable",
                           "Enable Automatic Recovery (Default: 0/off)",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_recovery_delay",
                           "Number of seconds to wait before starting to recover the job after a failure"
                           " [Default: 1 sec]",
                           false, false,
                           1, &val);
    mca_errmgr_hnpresil_component.autor_recovery_delay = val;

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_skip_oldnode",
                           "Skip the old node from failed proc, even if it is still available"
                           " [Default: Enabled]",
                           false, false,
                           1, &val);
    mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
#else
    val = 0; /* Silence compiler warning */
#endif /* OPAL_ENABLE_FT_CR */

    /*
     * Debug Output
     */
    opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open()");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: priority = %d",
                        mca_errmgr_hnpresil_component.super.priority);
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: verbosity = %d",
                        mca_errmgr_hnpresil_component.super.verbose);
#if OPAL_ENABLE_FT_CR
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: --- CR Migration Options ---");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: Process Migration = %s",
                        (mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: timing = %s",
                        (mca_errmgr_hnpresil_component.crmig_timing_enabled ? "Enabled" : "Disabled"));

    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: --- Auto. Recovery Options ---");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: Auto. Recover = %s",
                        (mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: timing = %s",
                        (mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: recover_delay = %d",
                        mca_errmgr_hnpresil_component.autor_recovery_delay);

    mca_errmgr_hnpresil_component.crmig_in_progress = false;
    mca_errmgr_hnpresil_component.autor_in_progress = false;
#endif /* OPAL_ENABLE_FT_CR */

    /* term_in_progress exists in every build, so reset it unconditionally */
    mca_errmgr_hnpresil_component.term_in_progress = false;

    return ORTE_SUCCESS;
}
/*
 * Component close hook for the hnpresil errmgr component.
 *
 * Emits a level-10 verbose trace on the component's output handle and
 * reports success; no other teardown is performed here.
 *
 * NOTE(review): the trace tag reads "errmgr:hnp" although this is the
 * hnpresil component - confirm whether the tag should be updated.
 */
static int orte_errmgr_hnpresil_close(void)
{
opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: close()");
return ORTE_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,71 @@
-*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE Errmgr HNP module.
#
[errmgr-hnp:unknown-job-error]
An error has occurred in an unknown job. This generally should not happen
except due to an internal ORTE error.
Job state: %s
This information should probably be reported to the OMPI developers.
#
[errmgr-hnp:daemon-died]
The system has lost communication with the following daemon:
Daemon: %s
Node: %s
The reason for the lost communication channel is unknown. Possible
reasons include failure of the daemon itself, failure of the
connecting fabric/switch, and loss of the host node. Please
check with your system administrator to try and determine the
source of the problem.
Your job is being terminated as a result.
#
[errmgr-hnp:cannot-relocate]
The system is unable to relocate the specified process:
Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.
[autor_recovering_job]
Notice: The processes listed below failed unexpectedly.
Using the last checkpoint to recover the job.
Please stand by.
%s
[autor_recovery_complete]
Notice: The job has been successfully recovered from the
last checkpoint.
[autor_failed_to_recover_proc]
Error: The process below has failed. There is no checkpoint available for
this job, so we are terminating the application since automatic
recovery cannot occur.
Internal Name: %s
MCW Rank: %d
[crmig_migrating_job]
Notice: A migration of this job has been requested.
The processes below will be migrated.
Please stand by.
%s
[crmig_migrated_job]
Notice: The processes have been successfully migrated to/from the specified
machines.
[crmig_no_migrating_procs]
Warning: Could not find any processes to migrate on the nodes specified.
You provided the following:
Nodes: %s
Procs: %s

Просмотреть файл

@ -3,9 +3,6 @@
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,11 +29,9 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
@ -58,9 +53,8 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
/*
* Module functions: Global
@ -85,11 +79,7 @@ static int suggest_map_targets(orte_proc_t *proc,
static int ft_event(int state);
static int post_startup(void);
static int pre_shutdown(void);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/******************
* ORTED module
@ -105,11 +95,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
mark_processes_as_dead,
orte_errmgr_base_set_fault_callback, /* Set callback function */
failure_notification
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
@ -140,29 +130,20 @@ static int update_state(orte_jobid_t job,
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
orte_ns_cmp_bitmask_t mask;
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:orted:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((NULL == proc) ? "App. Process" :
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
return ORTE_SUCCESS;
}
/*** UPDATE COMMAND FOR A JOB ***/
if (NULL == proc) {
/* this is an update for an entire job */
@ -199,7 +180,7 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == job) {
break;
@ -208,7 +189,7 @@ static int update_state(orte_jobid_t job,
if (NULL == jobdat) {
return ORTE_ERR_NOT_FOUND;
}
switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code);
@ -221,10 +202,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -261,16 +242,15 @@ static int update_state(orte_jobid_t job,
* lifeline
*/
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
return ORTE_SUCCESS;
}
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -281,25 +261,21 @@ static int update_state(orte_jobid_t job,
/* was it a daemon that failed? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
/* if all my routes are gone, then terminate ourselves */
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
if (0 == orte_routed.num_routes()) {
orte_quit();
}
}
record_dead_process(proc);
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
}
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -309,7 +285,7 @@ static int update_state(orte_jobid_t job,
/* must already be complete */
return ORTE_SUCCESS;
}
/* if there are no local procs for this job, we can
* ignore this call
*/
@ -330,15 +306,15 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
}
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
@ -364,7 +340,7 @@ static int update_state(orte_jobid_t job,
/* treat this as normal termination */
goto REPORT_STATE;
}
if (ORTE_PROC_STATE_TERMINATED < state) {
if( jobdat->enable_recovery ) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -375,8 +351,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -403,8 +379,8 @@ static int update_state(orte_jobid_t job,
}
}
}
REPORT_ABORT:
REPORT_ABORT:
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/
@ -427,8 +403,8 @@ REPORT_ABORT:
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
@ -442,7 +418,7 @@ REPORT_ABORT:
opal_list_remove_item(&orte_local_children, &child->super);
/* Decrement the number of local procs */
jobdat->num_local_procs--;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -464,15 +440,15 @@ REPORT_ABORT:
OBJ_DESTRUCT(&alert);
return rc;
}
REPORT_STATE:
REPORT_STATE:
/* find this proc in the local children so we can update its state */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
if (0 < pid) {
@ -492,7 +468,7 @@ REPORT_ABORT:
* the HNP so it is available to debuggers and anyone
* else that needs it
*/
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -509,7 +485,7 @@ REPORT_ABORT:
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack all the local child vpids and epochs */
/* pack all the local child vpids */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -546,7 +522,7 @@ REPORT_ABORT:
}
return rc;
}
/* only other state is terminated - see if anyone is left alive */
if (!any_live_children(proc->jobid)) {
/* lookup the local jobdat for this job */
@ -555,7 +531,7 @@ REPORT_ABORT:
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -577,8 +553,8 @@ REPORT_ABORT:
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
ORTE_ERROR_LOG(rc);
}
FINAL_CLEANUP:
FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -592,7 +568,7 @@ FINAL_CLEANUP:
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
if (jobdat->jobid == child->name->jobid) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
@ -601,11 +577,11 @@ FINAL_CLEANUP:
/* ensure the job's local session directory tree is removed */
orte_session_dir_cleanup(jobdat->jobid);
/* remove this job from our local job data since it is complete */
opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
OBJ_RELEASE(jobdat);
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
@ -613,7 +589,6 @@ FINAL_CLEANUP:
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
/* indicate that the job is complete */
return rc;
}
@ -639,131 +614,6 @@ int ft_event(int state)
return ORTE_SUCCESS;
}
/*
 * Post-startup hook of this errmgr module.
 * There is nothing to do at this point, so it always returns ORTE_SUCCESS.
 */
int post_startup(void) {
return ORTE_SUCCESS;
}
/*
 * Pre-shutdown hook of this errmgr module.
 * There is nothing to do at this point, so it always returns ORTE_SUCCESS.
 */
int pre_shutdown(void) {
return ORTE_SUCCESS;
}
/*
 * Record a set of processes as dead.
 *
 * For every non-NULL name in dead_procs whose epoch is not older than the
 * locally recorded epoch, the process is flagged as terminated, its epoch is
 * advanced by one, it is unlinked from the local child list (matching on
 * jobid and vpid only) and its route is deleted.  Once the whole array has
 * been processed the routing tree for our own job is refreshed and the
 * registered fault callback, if any, is invoked with the same array.
 *
 * @param dead_procs  pointer array holding orte_process_name_t* entries;
 *                    NULL slots are reported and skipped
 * @return ORTE_SUCCESS in all cases
 */
int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
    int idx;
    orte_process_name_t *victim;
    opal_list_item_t *cursor;
    orte_odls_child_t *local_child;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "ORTED %s marking procs as dead",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (idx = 0; idx < opal_pointer_array_get_size(dead_procs); idx++) {
        victim = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, idx);
        if (NULL == victim) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "ORTED %s marking %s as dead",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(victim)));

        /* a newer epoch is already on record locally - nothing to update */
        if (victim->epoch < orte_util_lookup_epoch(victim)) {
            continue;
        }

        /* flag the process as terminated, then move it to the next epoch */
        orte_util_set_proc_state(victim, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(victim, victim->epoch + 1);

        /* unlink the process from our local children, if it is one of them */
        cursor = opal_list_get_first(&orte_local_children);
        while (cursor != opal_list_get_end(&orte_local_children)) {
            local_child = (orte_odls_child_t *) cursor;
            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                                                            local_child->name, victim)) {
                opal_list_remove_item(&orte_local_children, cursor);
                break;
            }
            cursor = opal_list_get_next(cursor);
        }

        /* the routing layer must forget about this process as well */
        orte_routed.delete_route(victim);
    }

    /* let the routing module recompute its tree for our job */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    /* hand the list to whoever registered for fault notifications */
    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
/*
 * Process a failure notification message.
 *
 * The buffer carries a count followed by that many process names.  Each name
 * is unpacked into a heap copy; names whose epoch is older than the locally
 * recorded epoch are discarded as stale, the rest are collected.  The
 * collected names are handed to the errmgr (epoch/route bookkeeping) and then
 * forwarded to the local applications.
 *
 * Fixes relative to the previous version:
 *  - the stale-epoch test now inspects the unpacked name instead of an
 *    uninitialised local variable;
 *  - the name array and every heap-allocated name are released on all exit
 *    paths (error returns, stale entries, normal completion);
 *  - allocation and array-insertion failures are detected.
 *
 * @param sender  name of the process that sent the notification (used for
 *                debug output only)
 * @param buffer  message to unpack
 * @return ORTE_SUCCESS, or the first error encountered while unpacking,
 *         allocating or forwarding
 */
int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
    opal_pointer_array_t *dead_names;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed;
    int32_t i;
    orte_process_name_t *name_item;

    dead_names = OBJ_NEW(opal_pointer_array_t);

    n = 1;
    /* Get the number of failed procs (packed by the sender as ORTE_VPID) */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < num_failed; i++) {
        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        name_item = malloc(sizeof *name_item);
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            free(name_item);
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check: skip names whose epoch
         * is already behind what we have on record.
         */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
            free(name_item);
            continue;
        }

        /* the array now owns the name until cleanup */
        if (0 > opal_pointer_array_add(dead_names, name_item)) {
            free(name_item);
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    orte_errmgr.mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure. */
    ret = send_to_local_applications(dead_names);

cleanup:
    /* Walk every slot of the array: stale entries were never inserted, so
     * the count from the message is not a valid bound.  Empty slots yield
     * NULL, which free() ignores.
     */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        free(opal_pointer_array_get_item(dead_names, i));
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
/*****************
* Local Functions
*****************/
@ -771,14 +621,14 @@ static bool any_live_children(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
child->alive) {
@ -788,13 +638,13 @@ static bool any_live_children(orte_jobid_t job)
/* if we get here, then nobody is left alive from that job */
return false;
}
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
{
int rc;
/* pack the child's vpid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -829,70 +679,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = jobdat->launch_msg_recvd.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = jobdat->launch_msg_recvd.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
static bool all_children_registered(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* if this child has terminated, we consider it as having
@ -918,10 +768,10 @@ static bool all_children_registered(orte_jobid_t job)
}
}
}
/* if we get here, then everyone in the job is currently registered */
return true;
}
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
@ -929,14 +779,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
opal_list_item_t *item;
orte_odls_child_t *child;
int rc;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* pack the child's vpid - must be done in case rml_uri is NULL */
@ -944,11 +794,10 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
ORTE_ERROR_LOG(rc);
return rc;
}
/* Pack the child's epoch. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the contact info */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
@ -956,19 +805,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
}
}
}
return ORTE_SUCCESS;
}
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* set the state */
jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -997,7 +846,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* update job state */
jobdat->state = jobstate;
/* update children */
@ -1011,29 +860,28 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
int rc;
/* stop local sensors for this job */
if (ORTE_VPID_WILDCARD == vpid) {
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
return;
}
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1041,85 +889,3 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_DESTRUCT(&cmd);
OBJ_DESTRUCT(&proc);
}
/*
 * Record the failure of a single process and report it to the HNP.
 *
 * If the ODLS reports that the process already finished, nothing is done.
 * Otherwise the process is marked dead locally and a
 * ORTE_PROCESS_FAILED_NOTIFICATION command (count of one, followed by the
 * process name) is sent to the HNP on the daemon tag.
 *
 * Fixes relative to the previous version:
 *  - a buffer that failed to pack is no longer sent half-built;
 *  - a failure of the send itself is logged and returned instead of being
 *    silently dropped.
 *
 * @param proc  name of the process that died
 * @return ORTE_SUCCESS, or the pack/send error that prevented notification
 */
static int record_dead_process(orte_process_name_t *proc) {
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command = ORTE_PROCESS_FAILED_NOTIFICATION;
    int rc = ORTE_SUCCESS;
    int num_failed = 1;

    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    dead_name = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_add(dead_name, proc);

    /* Mark the process as dead */
    mark_processes_as_dead(dead_name);

    /* Build the message for the HNP: command, count, name */
    buffer = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* a negative return from send_buffer is an error; anything else is
     * normalised to ORTE_SUCCESS
     */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
    } else {
        rc = ORTE_SUCCESS;
    }

cleanup:
    OBJ_RELEASE(buffer);
    OBJ_RELEASE(dead_name);

    return rc;
}
/*
 * Forward a list of dead process names to every local application process.
 *
 * The message consists of a count followed by that many process names and is
 * delivered on ORTE_RML_TAG_EPOCH_CHANGE to all local jobs.
 *
 * Fix relative to the previous version: the count that is packed is now the
 * number of non-NULL entries.  Previously the pointer array's slot count was
 * packed while NULL slots were skipped when packing names, so the advertised
 * count could exceed the number of names actually present in the message.
 * Buffer release is also funnelled through a single exit point.
 *
 * @param dead_names  pointer array of orte_process_name_t* (NULL slots are
 *                    ignored)
 * @return ORTE_SUCCESS, or the pack/deliver error encountered
 */
int send_to_local_applications(opal_pointer_array_t *dead_names) {
    opal_buffer_t *buf;
    int ret;
    orte_process_name_t *name_item;
    int slots, count, i;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* count the names that will really be packed */
    slots = opal_pointer_array_get_size(dead_names);
    count = 0;
    for (i = 0; i < slots; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            count++;
        }
    }

    buf = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &count, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < slots; i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        if (NULL == name_item) {
            continue;
        }
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
    }

cleanup:
    OBJ_RELEASE(buf);

    return ret;
}

12
orte/mca/errmgr/ortedresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

38
orte/mca/errmgr/ortedresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-orted.txt
sources = \
errmgr_ortedresil.h \
errmgr_ortedresil_component.c \
errmgr_ortedresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_ortedresil_DSO
component_noinst =
component_install = mca_errmgr_ortedresil.la
else
component_noinst = libmca_errmgr_ortedresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_ortedresil_la_SOURCES = $(sources)
mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_ortedresil_la_SOURCES =$(sources)
libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version

1126
orte/mca/errmgr/ortedresil/errmgr_ortedresil.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */

Просмотреть файл

@ -0,0 +1,84 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_ortedresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_ortedresil_component_version_string =
"ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_ortedresil_open(void);
static int errmgr_ortedresil_close(void);
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_ortedresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"ortedresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
errmgr_ortedresil_open,
errmgr_ortedresil_close,
errmgr_ortedresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/* Component open hook: no setup is required, so it always succeeds. */
static int errmgr_ortedresil_open(void)
{
return ORTE_SUCCESS;
}
/* Component close hook: no teardown is required, so it always succeeds. */
static int errmgr_ortedresil_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the ortedresil module to daemons only.
 *
 * @param module    out: the ortedresil module when running as a daemon,
 *                  NULL otherwise
 * @param priority  out: 0 when the module is offered, -1 otherwise
 * @return ORTE_SUCCESS when the module is offered, ORTE_ERROR otherwise
 */
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority)
{
    /* anything that is not a daemon must not select this component */
    if (!(ORTE_PROC_IS_DAEMON)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_ortedresil_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,14 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE RecoS IGNORE framework.
#

Просмотреть файл

@ -240,10 +240,12 @@ int orte_ess_base_app_setup(void)
}
/* Execute the post-startup errmgr code */
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
if (NULL != orte_errmgr.post_startup) {
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
}
/* if we are an ORTE app - and not an MPI app - then
@ -278,7 +280,9 @@ error:
int orte_ess_base_app_finalize(void)
{
orte_errmgr.pre_shutdown();
if (NULL != orte_errmgr.pre_shutdown) {
orte_errmgr.pre_shutdown();
}
orte_notifier_base_close();

Просмотреть файл

@ -505,10 +505,12 @@ int orte_ess_base_orted_setup(char **hosts)
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* Execute the post-startup errmgr code */
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
if (NULL != orte_errmgr.post_startup) {
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
}
return ORTE_SUCCESS;
@ -523,7 +525,9 @@ int orte_ess_base_orted_setup(char **hosts)
int orte_ess_base_orted_finalize(void)
{
orte_errmgr.pre_shutdown();
if (NULL != orte_errmgr.pre_shutdown) {
orte_errmgr.pre_shutdown();
}
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);