1
1

Move the errmgr code back. This shouldn't cause the svn problems that I
apparently caused last time. Sorry about that. This one will just be a big
changelog.

This commit was SVN r25016.
Этот коммит содержится в:
Wesley Bland 2011-08-08 16:01:08 +00:00
родитель 09274cd047
Коммит 67feeb6aca
30 изменённых файлов: 1159 добавлений и 7306 удалений

Просмотреть файл

@ -1,9 +1,13 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -22,11 +26,15 @@
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* APP module
******************/
@ -65,11 +86,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
};
/************************
@ -92,6 +113,8 @@ static int update_state(orte_jobid_t job,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",
@ -109,9 +132,9 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
return ORTE_SUCCESS;
}
@ -125,6 +148,95 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
/*
 * Post a persistent, non-blocking RML receive on ORTE_RML_TAG_EPOCH_CHANGE
 * for messages from this process's daemon (ORTE_PROC_MY_DAEMON), with
 * epoch_change_recv as the callback.
 *
 * Returns whatever orte_rml.recv_buffer_nb() reports.
 */
static int post_startup(void)
{
    return orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
                                   ORTE_RML_TAG_EPOCH_CHANGE,
                                   ORTE_RML_PERSISTENT,
                                   epoch_change_recv,
                                   NULL);
}
/*
 * Cancel the receive on ORTE_RML_TAG_EPOCH_CHANGE from this process's
 * daemon (ORTE_PROC_MY_DAEMON).
 *
 * Returns whatever orte_rml.recv_cancel() reports.
 */
static int pre_shutdown(void)
{
    return orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
                                ORTE_RML_TAG_EPOCH_CHANGE);
}
/*
 * RML receive callback for ORTE_RML_TAG_EPOCH_CHANGE messages.
 *
 * Does no processing itself: it hands the sender, buffer and tag to
 * ORTE_MESSAGE_EVENT with epoch_change as the handler. The status and
 * cbdata arguments are not used.
 *
 * NOTE(review): ORTE_MESSAGE_EVENT presumably defers the real work to the
 * event loop so it runs outside the receive callback - confirm against the
 * macro definition.
 */
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
/*
 * Event handler for an epoch change notification.
 *
 * data is the orte_message_event_t carrying the message buffer. The buffer
 * holds a count of dead processes followed by that many process names. For
 * each name the epoch is incremented and recorded with orte_util_set_epoch(),
 * and the name is appended to a pointer array. If a fault callback is
 * registered and at least one process was reported, the callback is invoked
 * with that array.
 *
 * The fd and event arguments are not used.
 *
 * Every exit path goes through the cleanup label so that the name array, the
 * pointer array and the message event are released exactly once.
 *
 * NOTE(review): releasing mev here assumes the event object handed over by
 * ORTE_MESSAGE_EVENT is owned by the handler, as in other ORTE message-event
 * handlers - confirm against the macro definition.
 * NOTE(review): the count is unpacked as ORTE_VPID into an int - confirm the
 * sender packs it with the same type.
 */
void epoch_change(int fd,
                  short event,
                  void *data) {
    orte_message_event_t *mev = (orte_message_event_t *) data;
    opal_buffer_t *buffer = mev->buffer;
    orte_process_name_t *proc = NULL;
    opal_pointer_array_t *procs = NULL;
    int n = 1, ret, num_dead = 0, i;

    if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:app Received epoch change notification",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* Only allocate when there is something to unpack; a zero or negative
     * count leaves proc NULL and skips the loop below. */
    if (0 < num_dead) {
        proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
        if (NULL == proc) {
            opal_output(0, "%s Error allocating memory for epoch change message.",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }
    }

    procs = OBJ_NEW(opal_pointer_array_t);
    if (NULL == procs) {
        opal_output(0, "%s Error allocating memory for epoch change message.",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    for (i = 0; i < num_dead; i++) {
        /* n is an in/out count for unpack; request exactly one name each time */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        proc[i].epoch++;
        orte_util_set_epoch(&proc[i], proc[i].epoch);

        /* the array only borrows the entries; they live in proc[] */
        opal_pointer_array_add(procs, &proc[i]);

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:app Epoch for %s updated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc[i])));
    }

    if (NULL != fault_cbfunc && 0 < num_dead) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:app Calling fault callback",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        (*fault_cbfunc)(procs);
    } else {
        /* reached when no callback is registered or nothing was reported */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:app Calling fault callback failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

cleanup:
    free(proc);
    if (NULL != procs) {
        OBJ_RELEASE(procs);
    }
    OBJ_RELEASE(mev);
}
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
int ret, exit_status = ORTE_SUCCESS;
@ -166,7 +278,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
goto cleanup;
}
cleanup:
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;

Просмотреть файл

@ -13,8 +13,8 @@
*
*/
#ifndef MCA_ERRMGR_APP_EXPORT_H
#define MCA_ERRMGR_APP_EXPORT_H
#ifndef MCA_ERRMGR_app_EXPORT_H
#define MCA_ERRMGR_app_EXPORT_H
#include "orte_config.h"
@ -32,4 +32,4 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module;
END_C_DECLS
#endif /* MCA_ERRMGR_APP_EXPORT_H */
#endif /* MCA_ERRMGR_app_EXPORT_H */

Просмотреть файл

@ -59,7 +59,7 @@ orte_errmgr_base_component_t mca_errmgr_app_component =
/* opal_output handler */
-1,
/* Default priority */
10
0
};
static int errmgr_app_open(void)
@ -78,7 +78,7 @@ static int errmgr_app_component_query(mca_base_module_t **module, int *priority)
/* keep our priority low so that other modules are higher
* and will run before us
*/
*priority = 10;
*priority = 0;
*module = (mca_base_module_t *)&orte_errmgr_app_module;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

Просмотреть файл

@ -1,36 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
errmgr_appresil.h \
errmgr_appresil_component.c \
errmgr_appresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_appresil_DSO
component_noinst =
component_install = mca_errmgr_appresil.la
else
component_noinst = libmca_errmgr_appresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_appresil_la_SOURCES = $(sources)
mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_appresil_la_SOURCES =$(sources)
libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version

Просмотреть файл

@ -1,285 +0,0 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_appresil.h"
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* APPRESIL module
******************/
orte_errmgr_base_module_t orte_errmgr_appresil_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_appresil_abort_peers,
update_state,
NULL,
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
};
/************************
* API Definitions
************************/
/* Module init hook: performs no work and always returns ORTE_SUCCESS. */
static int init(void)
{
return ORTE_SUCCESS;
}
/* Module finalize hook: performs no work and always returns ORTE_SUCCESS. */
static int finalize(void)
{
return ORTE_SUCCESS;
}
/*
 * React to a reported job/process state change.
 *
 * Only ORTE_PROC_STATE_COMM_FAILED is acted upon, and only while ORTE is not
 * finalizing. A failure reported for our own name is ignored. Otherwise the
 * route to the process is deleted and the routed framework is asked whether
 * the lost route matters.
 *
 * Returns ORTE_ERR_UNRECOVERABLE when orte_routed.route_lost() does not
 * return ORTE_SUCCESS, and ORTE_SUCCESS in every other case.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    orte_ns_cmp_bitmask_t all_fields;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* nothing to do while shutting down, or for any other state */
    if (orte_finalizing || ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* a failure on our own connection is not interesting */
    all_fields = ORTE_NS_CMP_ALL;
    if (OPAL_EQUAL == orte_util_compare_name_fields(all_fields, ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }

    /* drop the route, then check whether it was a lifeline */
    orte_routed.delete_route(proc);

    return (ORTE_SUCCESS == orte_routed.route_lost(proc)) ? ORTE_SUCCESS
                                                          : ORTE_ERR_UNRECOVERABLE;
}
/*
 * Post a persistent, non-blocking RML receive on ORTE_RML_TAG_EPOCH_CHANGE
 * for messages from this process's daemon (ORTE_PROC_MY_DAEMON), with
 * epoch_change_recv as the callback.
 *
 * Returns whatever orte_rml.recv_buffer_nb() reports.
 */
static int post_startup(void)
{
    return orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
                                   ORTE_RML_TAG_EPOCH_CHANGE,
                                   ORTE_RML_PERSISTENT,
                                   epoch_change_recv,
                                   NULL);
}
/*
 * Cancel the receive on ORTE_RML_TAG_EPOCH_CHANGE from this process's
 * daemon (ORTE_PROC_MY_DAEMON).
 *
 * Returns whatever orte_rml.recv_cancel() reports.
 */
static int pre_shutdown(void)
{
    return orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
                                ORTE_RML_TAG_EPOCH_CHANGE);
}
/*
 * RML receive callback for ORTE_RML_TAG_EPOCH_CHANGE messages.
 *
 * Does no processing itself: it hands the sender, buffer and tag to
 * ORTE_MESSAGE_EVENT with epoch_change as the handler. The status and
 * cbdata arguments are not used.
 *
 * NOTE(review): ORTE_MESSAGE_EVENT presumably defers the real work to the
 * event loop so it runs outside the receive callback - confirm against the
 * macro definition.
 */
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
/*
 * Event handler for an epoch change notification.
 *
 * data is the orte_message_event_t carrying the message buffer. The buffer
 * holds a count of dead processes followed by that many process names. For
 * each name the epoch is incremented and recorded with orte_util_set_epoch(),
 * and the name is appended to a pointer array. If a fault callback is
 * registered and at least one process was reported, the callback is invoked
 * with that array.
 *
 * The fd and event arguments are not used.
 *
 * Every exit path goes through the cleanup label so that the name array, the
 * pointer array and the message event are released exactly once.
 *
 * NOTE(review): releasing mev here assumes the event object handed over by
 * ORTE_MESSAGE_EVENT is owned by the handler, as in other ORTE message-event
 * handlers - confirm against the macro definition.
 * NOTE(review): the count is unpacked as ORTE_VPID into an int - confirm the
 * sender packs it with the same type.
 */
void epoch_change(int fd,
                  short event,
                  void *data) {
    orte_message_event_t *mev = (orte_message_event_t *) data;
    opal_buffer_t *buffer = mev->buffer;
    orte_process_name_t *proc = NULL;
    opal_pointer_array_t *procs = NULL;
    int n = 1, ret, num_dead = 0, i;

    if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil Received epoch change notification",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* Only allocate when there is something to unpack; a zero or negative
     * count leaves proc NULL and skips the loop below. */
    if (0 < num_dead) {
        proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
        if (NULL == proc) {
            opal_output(0, "%s Error allocating memory for epoch change message.",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }
    }

    procs = OBJ_NEW(opal_pointer_array_t);
    if (NULL == procs) {
        opal_output(0, "%s Error allocating memory for epoch change message.",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    for (i = 0; i < num_dead; i++) {
        /* n is an in/out count for unpack; request exactly one name each time */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        proc[i].epoch++;
        orte_util_set_epoch(&proc[i], proc[i].epoch);

        /* the array only borrows the entries; they live in proc[] */
        opal_pointer_array_add(procs, &proc[i]);

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Epoch for %s updated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc[i])));
    }

    if (NULL != fault_cbfunc && 0 < num_dead) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        (*fault_cbfunc)(procs);
    } else {
        /* reached when no callback is registered or nothing was reported */
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

cleanup:
    free(proc);
    if (NULL != procs) {
        OBJ_RELEASE(procs);
    }
    OBJ_RELEASE(mev);
}
/*
 * Ask the HNP to abort a set of peer processes.
 *
 * Builds a buffer holding the ORTE_DAEMON_ABORT_PROCS_CALLED command, the
 * number of processes and then each process name, and sends it to
 * ORTE_PROC_MY_HNP on ORTE_RML_TAG_DAEMON.
 *
 * procs      array of process names to abort
 * num_procs  number of entries in procs
 *
 * Returns ORTE_SUCCESS, or the error code of the first pack/send step that
 * failed. The buffer is destructed on every path.
 */
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int rc;
    int result = ORTE_SUCCESS;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* command first */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
        goto done;
    }

    /* then the number of processes */
    rc = opal_dss.pack(&msg, &(num_procs), 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
        goto done;
    }

    /* then the names themselves */
    for (idx = 0; idx < num_procs; ++idx) {
        rc = opal_dss.pack(&msg, &(procs[idx]), 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            result = rc;
            goto done;
        }
    }

    /* hand the request to the HNP; a negative return is an error */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
    }

done:
    OBJ_DESTRUCT(&msg);
    return result;
}

Просмотреть файл

@ -1,35 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H
#define MCA_ERRMGR_APPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */

Просмотреть файл

@ -1,89 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_appresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_appresil_component_version_string =
"ORTE ERRMGR appresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_appresil_open(void);
static int errmgr_appresil_close(void);
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public component descriptor for the "appresil" errmgr component.
 * The initializer is positional: MCA base component information, MCA
 * metadata, then verbosity, output handle and default priority.
 */
orte_errmgr_base_component_t mca_errmgr_appresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"appresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
errmgr_appresil_open,
errmgr_appresil_close,
errmgr_appresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
0
};
/* Component open hook: registers nothing and always returns ORTE_SUCCESS. */
static int errmgr_appresil_open(void)
{
return ORTE_SUCCESS;
}
/* Component close hook: releases nothing and always returns ORTE_SUCCESS. */
static int errmgr_appresil_close(void)
{
return ORTE_SUCCESS;
}
/*
 * Component query: offer the appresil module only to application processes.
 *
 * module    set to the appresil module when ORTE_PROC_IS_APP holds,
 *           otherwise to NULL
 * priority  set to 0 when the module is offered, otherwise to -1
 *
 * Returns ORTE_SUCCESS when the module is offered and ORTE_ERROR otherwise.
 */
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority)
{
    /* not an application process: decline */
    if (!(ORTE_PROC_IS_APP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_appresil_module;
    return ORTE_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,5 +1,8 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -13,8 +16,8 @@
*
*/
#ifndef MCA_ERRMGR_HNP_EXPORT_H
#define MCA_ERRMGR_HNP_EXPORT_H
#ifndef MCA_ERRMGR_hnp_EXPORT_H
#define MCA_ERRMGR_hnp_EXPORT_H
#include "orte_config.h"
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
@ -81,8 +80,13 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
int orte_errmgr_hnp_global_post_startup(void);
int orte_errmgr_hnp_global_pre_shutdown(void);
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc);
/* HNP Versions */
/* hnp Versions */
int orte_errmgr_hnp_base_global_init(void);
int orte_errmgr_hnp_base_global_finalize(void);
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
@ -130,4 +134,4 @@ int orte_errmgr_hnp_autor_global_ft_event(int state);
END_C_DECLS
#endif /* MCA_ERRMGR_HNP_EXPORT_H */
#endif /* MCA_ERRMGR_hnp_EXPORT_H */

Просмотреть файл

@ -1,7 +1,10 @@
/*
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -391,6 +394,7 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *node = NULL;
bool found = false;
int num_removed = 0, num_to_remove;
orte_ns_cmp_bitmask_t mask;
if( NULL == current_global_jobdata ) {
return ORTE_SUCCESS;
@ -410,8 +414,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
item = opal_list_get_next(item) ) {
wp_item = (errmgr_autor_wp_item_t*)item;
if( wp_item->name.vpid == proc->name.vpid &&
wp_item->name.jobid == proc->name.jobid ) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
found = true;
break;
}
@ -518,6 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -612,7 +617,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
/*
* Record the dead daemon
*/
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
orte_errmgr_hnp_record_dead_process(proc);
return;
}
@ -621,6 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_MIN;
wp->state = 0;
}
@ -629,6 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
wp->state = 0;
}

Просмотреть файл

@ -20,7 +20,7 @@
* Public string for version number
*/
const char *orte_errmgr_hnp_component_version_string =
"ORTE ERRMGR Hnp MCA component version " ORTE_VERSION;
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
/*
* Local functionality
@ -61,7 +61,7 @@ orte_errmgr_hnp_component_t mca_errmgr_hnp_component = {
/* opal_output handler */
-1,
/* Default priority */
50
0
}
};

Просмотреть файл

@ -2,6 +2,9 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -747,6 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -803,6 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -850,6 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}

Просмотреть файл

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

Просмотреть файл

@ -1,40 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
sources = \
errmgr_hnpresil.h \
errmgr_hnpresil_component.c \
errmgr_hnpresil.c \
errmgr_hnpresil_autor.c \
errmgr_hnpresil_crmig.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_hnpresil_DSO
component_noinst =
component_install = mca_errmgr_hnpresil.la
else
component_noinst = libmca_errmgr_hnpresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_hnpresil_la_SOURCES = $(sources)
mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_hnpresil_la_SOURCES =$(sources)
libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,137 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H
#define MCA_ERRMGR_HNPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/*
 * Component structure for the hnpresil errmgr component: the base errmgr
 * component followed by component-wide state flags. The CRMig and AutoR
 * fields exist only when OPAL_ENABLE_FT_CR is set; ignore_current_update
 * and term_in_progress are always present.
 */
struct orte_errmgr_hnpresil_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
/* NOTE(review): the exact meaning of these two flags is not visible in
 * this header - confirm in errmgr_hnpresil.c */
bool ignore_current_update;
bool term_in_progress;
#if OPAL_ENABLE_FT_CR
/* State of the Recovery */
bool crmig_in_progress;
bool autor_in_progress;
/* CRMig Options */
bool crmig_enabled;
bool crmig_timing_enabled;
/* AutoR Options */
bool autor_enabled;
bool autor_timing_enabled;
/* NOTE(review): presumably in seconds, per the MCA parameter help text -
 * confirm */
int autor_recovery_delay;
bool autor_skip_oldnode;
#endif
};
typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component;
int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority);
void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
***************************/
int orte_errmgr_hnpresil_global_module_init(void);
int orte_errmgr_hnpresil_global_module_finalize(void);
int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_global_ft_event(int state);
int orte_errmgr_hnpresil_global_post_startup(void);
int orte_errmgr_hnpresil_global_pre_shutdown(void);
int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc);
/* hnpresil Versions */
int orte_errmgr_hnpresil_base_global_init(void);
int orte_errmgr_hnpresil_base_global_finalize(void);
int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_base_global_ft_event(int state);
#if OPAL_ENABLE_FT_CR
/* CRMig Versions */
int orte_errmgr_hnpresil_crmig_global_module_init(void);
int orte_errmgr_hnpresil_crmig_global_module_finalize(void);
int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_crmig_global_ft_event(int state);
/* AutoR Versions */
int orte_errmgr_hnpresil_autor_global_module_init(void);
int orte_errmgr_hnpresil_autor_global_module_finalize(void);
int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_autor_global_ft_event(int state);
#endif
END_C_DECLS
#endif /* MCA_ERRMGR_HNPRESIL_EXPORT_H */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,201 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnpresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_hnpresil_component_version_string =
"ORTE ERRMGR hnpresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int orte_errmgr_hnpresil_open(void);
static int orte_errmgr_hnpresil_close(void);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public component descriptor for the "hnpresil" errmgr component. Only the
 * embedded base component is initialized explicitly; the remaining fields
 * of orte_errmgr_hnpresil_component_t are left to default initialization.
 */
orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
* meta information about the hnpresil component
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnpresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions */
orte_errmgr_hnpresil_open,
orte_errmgr_hnpresil_close,
orte_errmgr_hnpresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
0
}
};
/*
 * Component open hook.
 *
 * Registers the component's MCA parameters (priority, verbose and, when
 * OPAL_ENABLE_FT_CR is set, the CRMig and AutoR options), selects the output
 * handle, logs the resulting settings and, in FT builds, resets the
 * in-progress flags. Always returns ORTE_SUCCESS; the return values of the
 * registration calls are not checked.
 *
 * NOTE(review): the help strings and log prefixes below say "hnp" although
 * this is the hnpresil component - confirm whether that is intended.
 */
static int orte_errmgr_hnpresil_open(void)
{
int val;
/*
* Register the "priority" parameter; its default is the priority the
* component structure currently holds, and the result is stored back
* into the same field.
*/
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"priority",
"Priority of the ERRMGR hnp component",
false, false,
mca_errmgr_hnpresil_component.super.priority,
&mca_errmgr_hnpresil_component.super.priority);
/* Same pattern for the "verbose" parameter */
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"verbose",
"Verbose level for the ERRMGR hnp component",
false, false,
mca_errmgr_hnpresil_component.super.verbose,
&mca_errmgr_hnpresil_component.super.verbose);
/* If there is a custom verbose level for this component then use it,
* otherwise take our parent's level and output channel
*/
if ( 0 != mca_errmgr_hnpresil_component.super.verbose) {
mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL);
opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle,
mca_errmgr_hnpresil_component.super.verbose);
} else {
mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output;
}
#if OPAL_ENABLE_FT_CR
/****************************
* CRMig (C/R Process Migration) MCA Options
****************************/
/* Each option is read into val and then converted into the bool field */
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"crmig_timing",
"Enable Process Migration timer",
false, false,
0, &val);
mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"crmig_enable",
"Enable Process Migration (Default: 0/off)",
false, false,
0, &val);
mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val);
/****************************
* AutoR (Automatic Recovery) MCA Options
****************************/
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"autor_timing",
"Enable Automatic Recovery timer",
false, false,
0, &val);
mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"autor_enable",
"Enable Automatic Recovery (Default: 0/off)",
false, false,
0, &val);
mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val);
/* Stored as a plain integer, default 1 */
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"autor_recovery_delay",
"Number of seconds to wait before starting to recover the job after a failure"
" [Default: 1 sec]",
false, false,
1, &val);
mca_errmgr_hnpresil_component.autor_recovery_delay = val;
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
"autor_skip_oldnode",
"Skip the old node from failed proc, even if it is still available"
" [Default: Enabled]",
false, false,
1, &val);
mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
#else
val = 0; /* Silence compiler warning */
#endif /* OPAL_ENABLE_FT_CR */
/*
* Debug Output
*/
opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open()");
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: priority = %d",
mca_errmgr_hnpresil_component.super.priority);
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: verbosity = %d",
mca_errmgr_hnpresil_component.super.verbose);
#if OPAL_ENABLE_FT_CR
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: --- CR Migration Options ---");
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: Process Migration = %s",
(mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: timing = %s",
(mca_errmgr_hnpresil_component.crmig_timing_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: --- Auto. Recovery Options ---");
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: Auto. Recover = %s",
(mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: timing = %s",
(mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled"));
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: open: recover_delay = %d",
mca_errmgr_hnpresil_component.autor_recovery_delay);
/* NOTE(review): term_in_progress is only reset here, inside the FT-only
 * section; if the field is declared unconditionally in the component
 * struct, non-FT builds rely on static zero-initialization - confirm. */
mca_errmgr_hnpresil_component.crmig_in_progress = false;
mca_errmgr_hnpresil_component.autor_in_progress = false;
mca_errmgr_hnpresil_component.term_in_progress = false;
#endif /* OPAL_ENABLE_FT_CR */
return ORTE_SUCCESS;
}
/*
 * Component close hook.
 *
 * Emits a verbose trace on the component's output handle and reports
 * success; nothing else is done here.
 */
static int orte_errmgr_hnpresil_close(void)
{
    opal_output_verbose(10,
                        mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnp: close()");

    return ORTE_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,71 +0,0 @@
-*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE Errmgr HNP module.
#
[errmgr-hnp:unknown-job-error]
An error has occurred in an unknown job. This generally should not happen
except due to an internal ORTE error.
Job state: %s
This information should probably be reported to the OMPI developers.
#
[errmgr-hnp:daemon-died]
The system has lost communication with the following daemon:
Daemon: %s
Node: %s
The reason for the lost communication channel is unknown. Possible
reasons include failure of the daemon itself, failure of the
connecting fabric/switch, and loss of the host node. Please
check with your system administrator to try and determine the
source of the problem.
Your job is being terminated as a result.
#
[errmgr-hnp:cannot-relocate]
The system is unable to relocate the specified process:
Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.
[autor_recovering_job]
Notice: The processes listed below failed unexpectedly.
Using the last checkpoint to recover the job.
Please stand by.
%s
[autor_recovery_complete]
Notice: The job has been successfully recovered from the
last checkpoint.
[autor_failed_to_recover_proc]
Error: The process below has failed. There is no checkpoint available for
this job, so we are terminating the application since automatic
recovery cannot occur.
Internal Name: %s
MCW Rank: %d
[crmig_migrating_job]
Notice: A migration of this job has been requested.
The processes below will be migrated.
Please stand by.
%s
[crmig_migrated_job]
Notice: The processes have been successfully migrated to/from the specified
machines.
[crmig_no_migrating_procs]
Warning: Could not find any processes to migrate on the nodes specified.
You provided the following:
Nodes: %s
Procs: %s

Просмотреть файл

@ -3,6 +3,9 @@
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -29,9 +32,11 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
@ -53,8 +58,9 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
/*
* Module functions: Global
@ -79,10 +85,14 @@ static int suggest_map_targets(orte_proc_t *proc,
static int ft_event(int state);
static int post_startup(void);
static int pre_shutdown(void);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/******************
* ORTED module
* orted module
******************/
orte_errmgr_base_module_t orte_errmgr_orted_module = {
init,
@ -95,11 +105,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
post_startup,
pre_shutdown,
mark_processes_as_dead,
orte_errmgr_base_set_fault_callback, /* Set callback function */
failure_notification
};
/************************
@ -130,20 +140,29 @@ static int update_state(orte_jobid_t job,
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
orte_ns_cmp_bitmask_t mask;
/*
* if orte is trying to shutdown, just let it
*/
if (orte_finalizing) {
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:orted:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((NULL == proc) ? "App. Process" :
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
return ORTE_SUCCESS;
}
/*** UPDATE COMMAND FOR A JOB ***/
if (NULL == proc) {
/* this is an update for an entire job */
@ -180,7 +199,7 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == job) {
break;
@ -189,7 +208,7 @@ static int update_state(orte_jobid_t job,
if (NULL == jobdat) {
return ORTE_ERR_NOT_FOUND;
}
switch (jobstate) {
case ORTE_JOB_STATE_FAILED_TO_START:
failed_start(jobdat, exit_code);
@ -202,10 +221,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -242,15 +261,16 @@ static int update_state(orte_jobid_t job,
* lifeline
*/
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
return ORTE_SUCCESS;
}
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -261,21 +281,25 @@ static int update_state(orte_jobid_t job,
/* was it a daemon that failed? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
/* if all my routes are gone, then terminate ourselves */
if (0 == orte_routed.num_routes()) {
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
orte_quit();
}
}
record_dead_process(proc);
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
}
/* lookup the local jobdat for this job */
jobdat = NULL;
for (item = opal_list_get_first(&orte_local_jobdata);
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -285,7 +309,7 @@ static int update_state(orte_jobid_t job,
/* must already be complete */
return ORTE_SUCCESS;
}
/* if there are no local procs for this job, we can
* ignore this call
*/
@ -306,15 +330,15 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid);
killprocs(proc->jobid, proc->vpid, proc->epoch);
}
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
@ -340,7 +364,7 @@ static int update_state(orte_jobid_t job,
/* treat this as normal termination */
goto REPORT_STATE;
}
if (ORTE_PROC_STATE_TERMINATED < state) {
if( jobdat->enable_recovery ) {
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -351,8 +375,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
/* see if this child has reached its local restart limit */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -379,8 +403,8 @@ static int update_state(orte_jobid_t job,
}
}
}
REPORT_ABORT:
REPORT_ABORT:
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/
@ -403,8 +427,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
@ -418,7 +442,7 @@ static int update_state(orte_jobid_t job,
opal_list_remove_item(&orte_local_children, &child->super);
/* Decrement the number of local procs */
jobdat->num_local_procs--;
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -440,15 +464,15 @@ static int update_state(orte_jobid_t job,
OBJ_DESTRUCT(&alert);
return rc;
}
REPORT_STATE:
REPORT_STATE:
/* find this proc in the local children so we can update its state */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
if (0 < pid) {
@ -468,7 +492,7 @@ REPORT_STATE:
* the HNP so it is available to debuggers and anyone
* else that needs it
*/
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted: sending contact info to HNP",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
@ -485,7 +509,7 @@ REPORT_STATE:
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack all the local child vpids */
/* pack all the local child vpids and epochs */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -522,7 +546,7 @@ REPORT_STATE:
}
return rc;
}
/* only other state is terminated - see if anyone is left alive */
if (!any_live_children(proc->jobid)) {
/* lookup the local jobdat for this job */
@ -531,7 +555,7 @@ REPORT_STATE:
item != opal_list_get_end(&orte_local_jobdata);
item = opal_list_get_next(item)) {
jobdat = (orte_odls_job_t*)item;
/* is this the specified job? */
if (jobdat->jobid == proc->jobid) {
break;
@ -553,8 +577,8 @@ REPORT_STATE:
if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) {
ORTE_ERROR_LOG(rc);
}
FINAL_CLEANUP:
FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -568,7 +592,7 @@ REPORT_STATE:
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
if (jobdat->jobid == child->name->jobid) {
opal_list_remove_item(&orte_local_children, &child->super);
OBJ_RELEASE(child);
@ -577,11 +601,11 @@ REPORT_STATE:
/* ensure the job's local session directory tree is removed */
orte_session_dir_cleanup(jobdat->jobid);
/* remove this job from our local job data since it is complete */
opal_list_remove_item(&orte_local_jobdata, &jobdat->super);
OBJ_RELEASE(jobdat);
/* send it */
if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) {
ORTE_ERROR_LOG(rc);
@ -589,6 +613,7 @@ REPORT_STATE:
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
/* indicate that the job is complete */
return rc;
}
@ -609,11 +634,137 @@ static int suggest_map_targets(orte_proc_t *proc,
return ORTE_ERR_NOT_IMPLEMENTED;
}
int ft_event(int state)
/*
 * Fault-tolerance event hook for this module.
 *
 * The state argument is not examined; the hook always reports success.
 */
static int ft_event(int state)
{
    return ORTE_SUCCESS;
}
/*
 * Post-startup hook installed in the module table.
 * There is nothing to do at this point, so simply report success.
 */
static int post_startup(void)
{
    return ORTE_SUCCESS;
}
/*
 * Pre-shutdown hook installed in the module table.
 * There is nothing to do at this point, so simply report success.
 */
static int pre_shutdown(void)
{
    return ORTE_SUCCESS;
}
/*
 * Record a set of processes as dead.
 *
 * For every non-NULL name in dead_procs whose epoch is not older than the
 * locally known epoch, this:
 *   - sets the process state to ORTE_PROC_STATE_TERMINATED,
 *   - bumps the stored epoch to (name epoch + 1),
 *   - drops the matching entry (jobid + vpid) from orte_local_children,
 *   - deletes the route to the process.
 * Afterwards the routing tree for our own job is refreshed and the
 * registered fault callback, if any, is invoked with the full list.
 *
 * @param dead_procs  pointer array of orte_process_name_t* entries
 * @return            ORTE_SUCCESS in all cases
 */
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
    int idx;
    orte_process_name_t *dead;
    opal_list_item_t *entry;
    orte_odls_child_t *local_child;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "ORTED %s marking procs as dead",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (idx = 0; idx < opal_pointer_array_get_size(dead_procs); idx++) {
        dead = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, idx);
        if (NULL == dead) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        }

        /* A name carrying an epoch older than the one we already hold
         * is skipped. */
        if (dead->epoch < orte_util_lookup_epoch(dead)) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "ORTED %s marking %s as dead",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(dead)));

        /* Flag the process as terminated and advance its epoch */
        orte_util_set_proc_state(dead, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(dead, dead->epoch + 1);

        /* If the dead process is one of our local children, forget it */
        entry = opal_list_get_first(&orte_local_children);
        while (entry != opal_list_get_end(&orte_local_children)) {
            local_child = (orte_odls_child_t *) entry;
            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                                                            local_child->name, dead)) {
                opal_list_remove_item(&orte_local_children, entry);
                OBJ_RELEASE(entry);
                break;
            }
            entry = opal_list_get_next(entry);
        }

        /* The routing layer must no longer try to reach this process */
        orte_routed.delete_route(dead);
    }

    /* Let the routing module rebuild its tree for our job */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    /* Hand the list to whoever registered interest in faults */
    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
/*
 * Handle a failure notification message.
 *
 * The buffer holds a count followed by that many process names.  Each name
 * is unpacked into a heap copy; names whose epoch is older than the one we
 * already know locally are dropped.  The remaining names are handed to
 * mark_processes_as_dead() and then forwarded to the local applications.
 *
 * All heap copies and the temporary pointer array are released on every
 * exit path, including errors.
 *
 * @param sender  name of the process that sent the notification (used for
 *                debug output only)
 * @param buffer  message to unpack
 * @return        ORTE_SUCCESS, or the first error encountered while
 *                unpacking / forwarding
 */
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
    opal_pointer_array_t *dead_names;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed = 0;
    int32_t i;
    orte_process_name_t *name_item;

    dead_names = OBJ_NEW(opal_pointer_array_t);
    if (NULL == dead_names) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    n = 1;
    /* Get the number of failed procs */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < num_failed; i++) {
        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            free(name_item);
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check.  Compare the epoch
         * carried by the unpacked name itself against our local record.
         */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
            free(name_item);
            continue;
        }

        /* A negative return means the name was not stored, so we still
         * own it and must free it here. */
        if (0 > opal_pointer_array_add(dead_names, name_item)) {
            free(name_item);
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure. */
    ret = send_to_local_applications(dead_names);

cleanup:
    /* Free every name we stored.  Walk the array itself rather than
     * counting to num_failed, because skipped names were never added. */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        free(name_item);
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
/*****************
* Local Functions
*****************/
@ -621,14 +772,14 @@ static bool any_live_children(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
child->alive) {
@ -638,13 +789,13 @@ static bool any_live_children(orte_jobid_t job)
/* if we get here, then nobody is left alive from that job */
return false;
}
static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
{
int rc;
/* pack the child's vpid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
@ -679,70 +830,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
}
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
orte_vpid_t null=ORTE_VPID_INVALID;
/* pack the jobid */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
/* if we are timing things, pack the time the launch msg for this job was recvd */
if (orte_timing) {
int64_t tmp;
tmp = jobdat->launch_msg_recvd.tv_sec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
tmp = jobdat->launch_msg_recvd.tv_usec;
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = next) {
child = (orte_odls_child_t*)item;
next = opal_list_get_next(item);
/* if this child is part of the job... */
if (child->name->jobid == jobdat->jobid) {
if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
}
/* flag that this job is complete so the receiver can know */
if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
return rc;
}
return ORTE_SUCCESS;
/*
 * Pack a state update for one job into an alert buffer.
 *
 * Layout written to the buffer:
 *   - the jobid,
 *   - when orte_timing is set, the seconds and microseconds at which the
 *     launch message for the job was received (each as OPAL_INT64),
 *   - the per-process state of every local child belonging to the job,
 *   - an ORTE_VPID_INVALID marker so the receiver knows the job's data
 *     is complete.
 *
 * @param alert   buffer to pack into
 * @param jobdat  job whose local children are reported
 * @return        ORTE_SUCCESS, or the first packing error (already logged)
 */
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
    int status;
    opal_list_item_t *cur;
    orte_odls_child_t *kid;
    orte_vpid_t end_marker = ORTE_VPID_INVALID;

    /* the jobid leads the message */
    status = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    /* when timing is enabled, include the launch message receive time */
    if (orte_timing) {
        int64_t stamp;

        stamp = jobdat->launch_msg_recvd.tv_sec;
        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }

        stamp = jobdat->launch_msg_recvd.tv_usec;
        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }
    }

    /* walk the local children, reporting those that belong to this job */
    cur = opal_list_get_first(&orte_local_children);
    while (cur != opal_list_get_end(&orte_local_children)) {
        kid = (orte_odls_child_t *) cur;
        /* pick up the successor before handling the current entry */
        cur = opal_list_get_next(cur);

        if (kid->name->jobid != jobdat->jobid) {
            continue;
        }

        status = pack_state_for_proc(alert, kid);
        if (ORTE_SUCCESS != status) {
            ORTE_ERROR_LOG(status);
            return status;
        }
    }

    /* terminate the job's data with an invalid vpid */
    status = opal_dss.pack(alert, &end_marker, 1, ORTE_VPID);
    if (ORTE_SUCCESS != status) {
        ORTE_ERROR_LOG(status);
        return status;
    }

    return ORTE_SUCCESS;
}
static bool all_children_registered(orte_jobid_t job)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* if this child has terminated, we consider it as having
@ -768,10 +919,10 @@ static bool all_children_registered(orte_jobid_t job)
}
}
}
/* if we get here, then everyone in the job is currently registered */
return true;
}
static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
@ -779,14 +930,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
opal_list_item_t *item;
orte_odls_child_t *child;
int rc;
/* the thread is locked elsewhere - don't try to do it again here */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
/* is this child part of the specified job? */
if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
/* pack the child's vpid - must be done in case rml_uri is NULL */
@ -794,10 +945,11 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
ORTE_ERROR_LOG(rc);
return rc;
}
/* Pack the child's epoch. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
return rc;
}
}
/* pack the contact info */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(rc);
@ -805,19 +957,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
}
}
}
return ORTE_SUCCESS;
}
static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* set the state */
jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -836,7 +988,7 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
}
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:orted: job %s reported incomplete start",
"%s errmgr:hnp: job %s reported incomplete start",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_JOBID_PRINT(jobdat->jobid)));
return;
@ -846,7 +998,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
{
opal_list_item_t *item;
orte_odls_child_t *child;
/* update job state */
jobdat->state = jobstate;
/* update children */
@ -860,28 +1012,29 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
int rc;
/* stop local sensors for this job */
if (ORTE_VPID_WILDCARD == vpid) {
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
return;
}
OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -889,3 +1042,85 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
OBJ_DESTRUCT(&cmd);
OBJ_DESTRUCT(&proc);
}
/*
 * Record a single process as dead and report it to the HNP.
 *
 * Nothing is done if orte_odls_base_default_check_finished() reports true
 * for the process.  Otherwise the process is marked dead locally via
 * mark_processes_as_dead(), and an ORTE_PROCESS_FAILED_NOTIFICATION
 * carrying a count of one and the process name is sent to the HNP on
 * ORTE_RML_TAG_DAEMON.
 *
 * The notification is only sent when it was packed completely; a partially
 * packed buffer is never put on the wire.
 *
 * @param proc  name of the process that died (not owned by this function)
 * @return      ORTE_SUCCESS, or the packing / send error (already logged)
 */
static int record_dead_process(orte_process_name_t *proc) {
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command;
    int rc = ORTE_SUCCESS;
    int num_failed;

    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    dead_name = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_add(dead_name, proc);

    /* Mark the process as dead */
    mark_processes_as_dead(dead_name);

    /* Send a message to the HNP */
    buffer = OBJ_NEW(opal_buffer_t);
    command = ORTE_PROCESS_FAILED_NOTIFICATION;
    num_failed = 1;

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
    } else if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
        /* a negative return from send_buffer is an error code */
        ORTE_ERROR_LOG(rc);
    } else {
        /* a non-negative return is not an error */
        rc = ORTE_SUCCESS;
    }

    OBJ_RELEASE(buffer);
    /* the array only borrowed proc, so releasing it does not free the name */
    OBJ_RELEASE(dead_name);

    return rc;
}
/*
 * Forward a list of dead process names to the local application processes.
 *
 * The message contains the number of names followed by the names
 * themselves, and is delivered to all local jobs on
 * ORTE_RML_TAG_EPOCH_CHANGE.
 *
 * NULL entries in dead_names are skipped when packing, so the count that
 * leads the message is the number of non-NULL entries rather than the raw
 * array size.  This keeps the count consistent with the names that follow.
 *
 * @param dead_names  pointer array of orte_process_name_t* entries
 * @return            ORTE_SUCCESS, or the first packing / delivery error
 *                    (already logged)
 */
static int send_to_local_applications(opal_pointer_array_t *dead_names) {
    opal_buffer_t *buf;
    int ret;
    orte_process_name_t *name_item;
    int size, i;
    int num_names = 0;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    size = opal_pointer_array_get_size(dead_names);

    /* Count only the entries that will actually be packed below */
    for (i = 0; i < size; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            num_names++;
        }
    }

    buf = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &num_names, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < size; i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        if (NULL == name_item) {
            continue;
        }
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    ret = ORTE_SUCCESS;

cleanup:
    /* single release point for the message buffer */
    OBJ_RELEASE(buf);
    return ret;
}

Просмотреть файл

@ -13,8 +13,8 @@
*
*/
#ifndef MCA_ERRMGR_ORTED_EXPORT_H
#define MCA_ERRMGR_ORTED_EXPORT_H
#ifndef MCA_ERRMGR_orted_EXPORT_H
#define MCA_ERRMGR_orted_EXPORT_H
#include "orte_config.h"
@ -32,4 +32,4 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orted_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORTED_EXPORT_H */
#endif /* MCA_ERRMGR_orted_EXPORT_H */

Просмотреть файл

@ -72,7 +72,7 @@ static int errmgr_orted_component_query(mca_base_module_t **module, int *priorit
/* keep our priority low so that other modules are higher
* and will run before us
*/
*priority = 10;
*priority = 0;
*module = (mca_base_module_t *)&orte_errmgr_orted_module;
return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,12 +0,0 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

Просмотреть файл

@ -1,38 +0,0 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-orted.txt
sources = \
errmgr_ortedresil.h \
errmgr_ortedresil_component.c \
errmgr_ortedresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_ortedresil_DSO
component_noinst =
component_install = mca_errmgr_ortedresil.la
else
component_noinst = libmca_errmgr_ortedresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_ortedresil_la_SOURCES = $(sources)
mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_ortedresil_la_SOURCES =$(sources)
libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,35 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */

Просмотреть файл

@ -1,84 +0,0 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_ortedresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_ortedresil_component_version_string =
"ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_ortedresil_open(void);
static int errmgr_ortedresil_close(void);
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
/*
 * Public component descriptor for the "ortedresil" errmgr component.
 */
orte_errmgr_base_component_t mca_errmgr_ortedresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"ortedresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close, and query functions */
errmgr_ortedresil_open,
errmgr_ortedresil_close,
errmgr_ortedresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Component open hook: nothing to set up, so just report success.
 */
static int errmgr_ortedresil_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component close hook: nothing to tear down, so just report success.
 */
static int errmgr_ortedresil_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query: offer the ortedresil module only to daemons.
 *
 * @param module    set to the ortedresil module for a daemon, NULL otherwise
 * @param priority  set to 0 for a daemon, -1 otherwise
 * @return          ORTE_SUCCESS for a daemon, ORTE_ERROR otherwise
 */
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority)
{
    /* any process that is not a daemon gets no module from us */
    if (!(ORTE_PROC_IS_DAEMON)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_ortedresil_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,14 +0,0 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE RecoS IGNORE framework.
#