1
1

Move the resilient orte errmgr code into a separate errmgr for now while it's

still unstable. Reverted errmgr modules back to the original errmgr (with the
updates since the resilient code was brought into the trunk).

This commit was SVN r24958.
Этот коммит содержится в:
Wesley Bland 2011-07-28 21:24:34 +00:00
родитель 6c879f87fb
Коммит 5fde3e0e00
28 изменённых файлов: 7288 добавлений и 1102 удалений

Просмотреть файл

@ -371,7 +371,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
}
/* Register errhandler callback with orte errmgr */
if (NULL != orte_errmgr.set_fault_callback) {
orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
}
/* Figure out the final MPI thread levels. If we were not
compiled for support for MPI threads, then don't allow

Просмотреть файл

@ -1,12 +1,8 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
*
* $COPYRIGHT$
*
@ -26,15 +22,11 @@
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
@ -56,22 +48,9 @@ static int update_state(orte_jobid_t job,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* HNP module
******************/
@ -86,11 +65,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
NULL,
NULL,
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
NULL,
orte_errmgr_base_set_fault_callback,
NULL
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
@ -113,8 +92,6 @@ static int update_state(orte_jobid_t job,
pid_t pid,
orte_exit_code_t exit_code)
{
orte_ns_cmp_bitmask_t mask;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app: job %s reported state %s"
" for proc %s state %s exit_code %d",
@ -132,9 +109,9 @@ static int update_state(orte_jobid_t job,
}
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
/* both the jobid and the vpid must match for the name to be our own */
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
return ORTE_SUCCESS;
}
@ -148,95 +125,6 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
static int post_startup(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE,
ORTE_RML_PERSISTENT,
epoch_change_recv,
NULL);
return ret;
}
static int pre_shutdown(void) {
int ret = ORTE_SUCCESS;
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
ORTE_RML_TAG_EPOCH_CHANGE);
return ret;
}
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
void epoch_change(int fd,
short event,
void *data) {
orte_message_event_t *mev = (orte_message_event_t *) data;
opal_buffer_t *buffer = mev->buffer;
orte_process_name_t *proc;
int n = 1, ret, num_dead, i;
opal_pointer_array_t *procs;
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
return;
}
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Received epoch change notification",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
procs = OBJ_NEW(opal_pointer_array_t);
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
for (i = 0; i < num_dead; i++) {
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
return;
}
proc[i].epoch++;
orte_util_set_epoch(&proc[i], proc[i].epoch);
opal_pointer_array_add(procs, &proc[i]);
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Epoch for %s updated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&proc[i])));
}
if (NULL != fault_cbfunc && 0 < num_dead) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
(*fault_cbfunc)(procs);
} else {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:app Calling fault callback failed!",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
}
free(proc);
OBJ_RELEASE(procs);
}
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
int ret, exit_status = ORTE_SUCCESS;
@ -278,7 +166,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr
goto cleanup;
}
cleanup:
cleanup:
OBJ_DESTRUCT(&buffer);
return exit_status;

12
orte/mca/errmgr/appresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

36
orte/mca/errmgr/appresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,36 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
sources = \
errmgr_appresil.h \
errmgr_appresil_component.c \
errmgr_appresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_appresil_DSO
component_noinst =
component_install = mca_errmgr_appresil.la
else
component_noinst = libmca_errmgr_appresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_appresil_la_SOURCES = $(sources)
mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_appresil_la_SOURCES =$(sources)
libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version

285
orte/mca/errmgr/appresil/errmgr_appresil.c Обычный файл
Просмотреть файл

@ -0,0 +1,285 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* All rights reserved.
*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* Copyright (c) 2004-2006 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/util/output.h"
#include "opal/dss/dss.h"
#include "opal/mca/event/event.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_appresil.h"
/*
* Module functions: Global
*/
static int init(void);
static int finalize(void);
static int update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs,
orte_std_cntr_t num_procs);
static int post_startup(void);
static int pre_shutdown(void);
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata);
void epoch_change(int fd,
short event,
void *data);
/******************
* APP (resilient) module
*
* Function table handed out by the appresil component when the
* process is an application process. Entries left NULL are
* operations this module does not implement.
******************/
orte_errmgr_base_module_t orte_errmgr_appresil_module = {
init,
finalize,
orte_errmgr_base_log,
orte_errmgr_base_abort,
orte_errmgr_appresil_abort_peers,
update_state,
NULL, /* presumably predicted_fault - TODO confirm against errmgr.h */
NULL, /* presumably suggest_map_targets - TODO confirm against errmgr.h */
NULL, /* ft_event */
orte_errmgr_base_register_migration_warning,
post_startup, /* post_startup: posts the epoch-change receive */
pre_shutdown, /* pre_shutdown: cancels the epoch-change receive */
NULL, /* mark_processes_as_dead */
orte_errmgr_base_set_fault_callback, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
* API Definitions
************************/
/*
 * Module initialisation hook. This module keeps no state of its own,
 * so there is nothing to set up.
 *
 * Returns ORTE_SUCCESS unconditionally.
 */
static int init(void)
{
    return ORTE_SUCCESS;
}
/*
 * Module teardown hook. Nothing was acquired in init(), so nothing
 * is released here.
 *
 * Returns ORTE_SUCCESS unconditionally.
 */
static int finalize(void)
{
    return ORTE_SUCCESS;
}
/*
 * Handle a job/process state report inside an application process.
 *
 * The report is always logged at verbosity 1. The only state acted
 * upon is ORTE_PROC_STATE_COMM_FAILED: unless the failed connection is
 * our own, the route to the peer is deleted and the routed framework
 * is asked whether that peer was a lifeline.
 *
 * job, jobstate   job the report refers to and its state (logged only)
 * proc            process the report refers to; may be NULL for logging.
 *                 NOTE(review): the COMM_FAILED path hands proc to the
 *                 name-compare and routed calls without a NULL check -
 *                 confirm callers never pass NULL with that state.
 * state           reported process state
 * pid             not used by this module
 * exit_code       logged only
 *
 * Returns ORTE_SUCCESS, or ORTE_ERR_UNRECOVERABLE when route_lost()
 * reports anything other than ORTE_SUCCESS.
 */
static int update_state(orte_jobid_t job,
                        orte_job_state_t jobstate,
                        orte_process_name_t *proc,
                        orte_proc_state_t state,
                        pid_t pid,
                        orte_exit_code_t exit_code)
{
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil: job %s reported state %s"
                         " for proc %s state %s exit_code %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(job),
                         orte_job_state_to_str(jobstate),
                         (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state), exit_code));

    /* If ORTE is already shutting down, or the report is not about a
     * failed connection, there is nothing for us to do. */
    if (orte_finalizing || ORTE_PROC_STATE_COMM_FAILED != state) {
        return ORTE_SUCCESS;
    }

    /* A failure reported on our own connection is ignored. */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                    ORTE_PROC_MY_NAME, proc)) {
        return ORTE_SUCCESS;
    }

    /* Drop the route to the failed peer ... */
    orte_routed.delete_route(proc);

    /* ... and check whether it was a lifeline. */
    if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
        return ORTE_ERR_UNRECOVERABLE;
    }

    return ORTE_SUCCESS;
}
/*
 * Post-startup hook: post a persistent non-blocking receive from our
 * local daemon on ORTE_RML_TAG_EPOCH_CHANGE, with epoch_change_recv as
 * the callback.
 *
 * Returns whatever orte_rml.recv_buffer_nb() returns.
 */
static int post_startup(void)
{
    return orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
                                   ORTE_RML_TAG_EPOCH_CHANGE,
                                   ORTE_RML_PERSISTENT,
                                   epoch_change_recv,
                                   NULL);
}
/*
 * Pre-shutdown hook: cancel the receive from our local daemon on
 * ORTE_RML_TAG_EPOCH_CHANGE that post_startup() posted.
 *
 * Returns whatever orte_rml.recv_cancel() returns.
 */
static int pre_shutdown(void)
{
    return orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
                                ORTE_RML_TAG_EPOCH_CHANGE);
}
/*
 * RML receive callback registered by post_startup() for
 * ORTE_RML_TAG_EPOCH_CHANGE.
 *
 * It does no processing itself: sender, buffer and tag are passed to
 * ORTE_MESSAGE_EVENT together with epoch_change as the handler.
 * The status and cbdata arguments are not used.
 *
 * NOTE(review): ORTE_MESSAGE_EVENT is defined outside this file;
 * presumably it queues the message so that epoch_change runs later
 * from the event loop - confirm against orte_wait.h.
 */
void epoch_change_recv(int status,
orte_process_name_t *sender,
opal_buffer_t *buffer,
orte_rml_tag_t tag,
void *cbdata) {
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
}
/*
 * Event handler for an epoch-change notification.
 *
 * The message carries a count followed by that many process names.
 * For every name the epoch is incremented and recorded through
 * orte_util_set_epoch(); the names are then collected in a pointer
 * array and handed to the registered fault callback (fault_cbfunc),
 * if one is set and at least one process was reported.
 *
 * fd, event   not used
 * data        the orte_message_event_t created for this message
 *
 * All resources acquired here (the name array, the pointer array and
 * the message event itself) are released on every exit path through
 * the single cleanup label.
 */
void epoch_change(int fd,
                  short event,
                  void *data)
{
    orte_message_event_t *mev = (orte_message_event_t *) data;
    opal_buffer_t *buffer = mev->buffer;
    orte_process_name_t *proc = NULL;
    opal_pointer_array_t *procs = NULL;
    int n, ret, num_dead, i;

    /* Ignore notifications once shutdown has begun. */
    if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "%s errmgr:appresil Received epoch change notification",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* First field: how many process names follow. */
    n = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    procs = OBJ_NEW(opal_pointer_array_t);
    if (NULL == procs) {
        opal_output(0, "%s errmgr:appresil Unable to allocate the dead process list",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* Only allocate when there is something to store; a non-positive
     * count must never reach malloc() as a size. */
    if (0 < num_dead) {
        proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
        if (NULL == proc) {
            opal_output(0, "%s errmgr:appresil Unable to allocate memory for %d process names",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_dead);
            goto cleanup;
        }
    }

    for (i = 0; i < num_dead; i++) {
        /* unpack() uses n as an in/out count, so reset it every time */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        /* The process died in the reported epoch: move it to the next one. */
        proc[i].epoch++;
        orte_util_set_epoch(&proc[i], proc[i].epoch);

        opal_pointer_array_add(procs, &proc[i]);

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Epoch for %s updated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc[i])));
    }

    if (NULL != fault_cbfunc && 0 < num_dead) {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        (*fault_cbfunc)(procs);
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "%s errmgr:appresil Calling fault callback failed!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    }

cleanup:
    /* procs only holds pointers into proc, so neither owns the other */
    free(proc);
    if (NULL != procs) {
        OBJ_RELEASE(procs);
    }
    /* The message event was created for this handler alone; by ORTE
     * convention the handler releases it once the buffer has been used. */
    OBJ_RELEASE(mev);
}
/*
 * Ask the HNP to abort a set of peer processes.
 *
 * Builds a daemon command message consisting of
 * ORTE_DAEMON_ABORT_PROCS_CALLED, the number of processes and then
 * each process name, and sends it to the HNP on ORTE_RML_TAG_DAEMON.
 *
 * procs       array of num_procs process names to abort
 * num_procs   number of entries in procs
 *
 * Returns ORTE_SUCCESS when the message was sent, otherwise the error
 * code of the pack or send call that failed (after logging it).
 */
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int rc;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* command flag */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    /* number of processes */
    rc = opal_dss.pack(&msg, &(num_procs), 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    /* the names themselves, one at a time */
    for (idx = 0; idx < num_procs; ++idx) {
        rc = opal_dss.pack(&msg, &(procs[idx]), 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto done;
        }
    }

    /* hand the request to the HNP; a negative return is an error */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    /* a non-negative send result is reported to the caller as success */
    rc = ORTE_SUCCESS;

done:
    OBJ_DESTRUCT(&msg);
    return rc;
}

35
orte/mca/errmgr/appresil/errmgr_appresil.h Обычный файл
Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Public declarations of the "appresil" errmgr component: the
* component descriptor and the module function table it provides.
*/
#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H
#define MCA_ERRMGR_APPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/* Component descriptor, defined in errmgr_appresil_component.c */
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component;
/* Module function table, defined in errmgr_appresil.c */
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */

Просмотреть файл

@ -0,0 +1,89 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_appresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_appresil_component_version_string =
"ORTE ERRMGR appresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_appresil_open(void);
static int errmgr_appresil_close(void);
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_appresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"appresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
errmgr_appresil_open,
errmgr_appresil_close,
errmgr_appresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level */
0,
/* opal_output handler */
-1,
/* Default priority */
0
};
/*
 * Component open function. The component registers no parameters and
 * allocates nothing, so opening always succeeds.
 */
static int errmgr_appresil_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component close function. Nothing was acquired in open, so there is
 * nothing to release; always succeeds.
 */
static int errmgr_appresil_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query: offer the appresil module to application processes
 * only.
 *
 * module     out: set to &orte_errmgr_appresil_module for an application
 *            process, NULL otherwise
 * priority   out: 0 for an application process, -1 otherwise
 *
 * Returns ORTE_SUCCESS when the module is offered and ORTE_ERROR when
 * the calling process is not an application process.
 */
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority)
{
    /* Any other process type gets no module from this component. */
    if (!(ORTE_PROC_IS_APP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_appresil_module;
    return ORTE_SUCCESS;
}

Просмотреть файл

@ -1,11 +1,8 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -40,14 +37,11 @@
#include "orte/mca/routed/routed.h"
#include "orte/mca/debugger/base/base.h"
#include "orte/mca/notifier/notifier.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/ess/ess.h"
#include "orte/util/error_strings.h"
#include "orte/util/name_fns.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_locks.h"
@ -56,7 +50,6 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/**********************
@ -83,16 +76,11 @@ static orte_errmgr_base_module_t global_module = {
/* FT Event hook */
orte_errmgr_hnp_global_ft_event,
orte_errmgr_base_register_migration_warning,
/* Post-startup */
orte_errmgr_hnp_global_post_startup,
/* Pre-shutdown */
orte_errmgr_hnp_global_pre_shutdown,
/* Mark as dead */
orte_errmgr_hnp_global_mark_processes_as_dead,
/* Set the callback */
orte_errmgr_base_set_fault_callback,
/* Receive failure notification */
orte_errmgr_hnp_global_failure_notification
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
};
@ -104,11 +92,10 @@ static void failed_start(orte_job_t *jdata);
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
orte_proc_state_t state, orte_exit_code_t exit_code);
static void check_job_complete(orte_job_t *jdata);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
orte_proc_state_t state, orte_exit_code_t exit_code);
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
/************************
* API Definitions
@ -168,7 +155,7 @@ int orte_errmgr_hnp_global_module_init(void)
goto cleanup;
}
cleanup:
cleanup:
return exit_status;
}
@ -206,7 +193,7 @@ int orte_errmgr_hnp_global_module_finalize(void)
goto cleanup;
}
cleanup:
cleanup:
return exit_status;
}
@ -275,7 +262,7 @@ int orte_errmgr_hnp_global_update_state(orte_jobid_t job,
}
}
cleanup:
cleanup:
return exit_status;
}
@ -306,7 +293,7 @@ int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list,
goto cleanup;
}
cleanup:
cleanup:
return exit_status;
#else
return ORTE_ERR_NOT_IMPLEMENTED;
@ -342,7 +329,7 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
}
}
cleanup:
cleanup:
return exit_status;
#else
return ORTE_ERR_NOT_IMPLEMENTED;
@ -374,7 +361,7 @@ int orte_errmgr_hnp_global_ft_event(int state)
goto cleanup;
}
cleanup:
cleanup:
return exit_status;
}
@ -404,6 +391,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
orte_odls_child_t *child;
int rc;
orte_app_context_t *app;
orte_proc_t *pdat;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s errmgr:hnp: job %s reported state %s"
@ -536,7 +524,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
exit_code);
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -548,7 +536,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_COMM_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -560,7 +548,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
break;
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
/* order all local procs for this job to be killed */
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
check_job_complete(jdata); /* set the local proc states */
/* the job object for this job will have been NULL'd
* in the array if the job was solely local. If it isn't
@ -629,11 +617,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
/* guess not - let it fall thru to abort */
}
}
if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) {
exit_code = 0;
}
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
check_job_complete(jdata); /* need to set the job state */
/* the job object for this job will have been NULL'd
@ -677,7 +660,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (jdata->enable_recovery) {
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
/* is this a local proc */
if (NULL != (child = proc_is_local(proc))) {
/* local proc - see if it has reached its restart limit */
@ -735,11 +718,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
/* update daemon job */
orte_errmgr_hnp_record_dead_process(proc);
/* We'll check if the job was complete when we get the
* message back from the HNP notifying us of the dead
* process
*/
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
/* check for complete */
check_job_complete(jdata);
break;
}
/* if abort is in progress, see if this one failed to tell
@ -752,11 +733,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
/* remove from dependent routes, if it is one */
orte_routed.route_lost(proc);
/* update daemon job */
orte_errmgr_hnp_record_dead_process(proc);
/* We'll check if the job was complete when we get the
* message back from the HNP notifying us of the dead
* process
*/
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check for complete */
check_job_complete(jdata);
break;
}
@ -772,21 +751,31 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
} else {
if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) {
/* The process is already dead so don't keep trying to do
* this stuff. */
return ORTE_SUCCESS;
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid), "Unknown");
} else {
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
ORTE_VPID_PRINT(proc->vpid),
(NULL == pdat->node) ? "Unknown" :
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
}
/* We'll check if the job was complete when we get the
* message back from the HNP notifying us of the dead
* process */
/* remove this proc from the daemon job */
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
/* check if all is complete so we can terminate */
check_job_complete(jdata);
}
}
break;
@ -796,9 +785,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
if( orte_enable_recovery ) {
/* relocate its processes */
} else {
orte_errmgr_hnp_record_dead_process(proc);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* kill all jobs */
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
return ORTE_ERR_UNRECOVERABLE;
@ -817,170 +806,6 @@ int orte_errmgr_hnp_base_global_ft_event(int state)
return ORTE_SUCCESS;
}
int orte_errmgr_hnp_global_post_startup(void) {
return ORTE_SUCCESS;
}
int orte_errmgr_hnp_global_pre_shutdown(void) {
return ORTE_SUCCESS;
}
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
orte_std_cntr_t n;
int ret = ORTE_SUCCESS, num_failed;
opal_pointer_array_t *dead_names;
int32_t i;
orte_process_name_t *name_item;
orte_epoch_t epoch;
orte_job_t *jdat;
orte_proc_t *pdat, *pdat2;
opal_buffer_t *answer;
orte_daemon_cmd_flag_t command;
if (orte_debug_daemons_flag) {
opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(sender));
}
n = 1;
/* Get the number of failed procs */
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
return ret;
}
dead_names = OBJ_NEW(opal_pointer_array_t);
for (i = 0; i < num_failed; i++) {
name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
/* Unpack the buffer to get the dead process' name. */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
return ret;
}
/* Check to see if the message is telling us about an old epoch.
* If so ignore the message.
*/
epoch = orte_util_lookup_epoch(name_item);
if (name_item->epoch < epoch) {
if (orte_debug_daemons_flag) {
opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name_item),
ORTE_EPOCH_PRINT(name_item->epoch),
ORTE_EPOCH_PRINT(epoch));
}
free(name_item);
continue;
} else {
if (orte_debug_daemons_flag) {
opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(name_item),
ORTE_EPOCH_PRINT(name_item->epoch),
ORTE_EPOCH_PRINT(epoch));
}
}
opal_pointer_array_add(dead_names, name_item);
/* Check to see if the message is telling us about an orted and
* it is from another orted. Orteds don't have the list of all
* the application processes so they don't know if there were
* any child processes on the nodes that they are reporting. */
if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) {
if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) {
continue;
} else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) {
continue;
} else if (NULL == pdat->node) {
continue;
}
if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) {
for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) {
if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) {
continue;
} else {
name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
name_item->jobid = pdat2->name.jobid;
name_item->vpid = pdat2->name.vpid;
name_item->epoch = orte_util_lookup_epoch(&(pdat2->name));
opal_pointer_array_add(dead_names, name_item);
}
}
}
}
}
/* Update the number of failed process so any duplicates don't get
* re-reported.
*/
num_failed = opal_pointer_array_get_size(dead_names);
if (num_failed > 0) {
orte_errmgr.mark_processes_as_dead(dead_names);
if (!orte_orteds_term_ordered) {
/* Send a message out to all the orteds to inform them that the
* process is dead. Long live the process (or not if it is so
* decided)!
*/
answer = OBJ_NEW(opal_buffer_t);
command = ORTE_PROCESS_FAILED_NOTIFICATION;
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
return ret;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
return ret;
}
for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
return ret;
}
}
}
if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
return ret;
}
/* Tell the applications' ORTE layers that there is a failure. */
if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) {
return ret;
}
}
for (i = 0; i < num_failed; i++) {
name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
free(name_item);
}
}
OBJ_RELEASE(dead_names);
return ret;
}
/*****************
* Local Functions
*****************/
@ -1333,7 +1158,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#if 0
case ORTE_PROC_STATE_ABORTED_BY_SIG:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
@ -1349,7 +1173,6 @@ static void check_job_complete(orte_job_t *jdata)
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
break;
#endif
case ORTE_PROC_STATE_TERM_WO_SYNC:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
@ -1372,7 +1195,6 @@ static void check_job_complete(orte_job_t *jdata)
}
break;
case ORTE_PROC_STATE_COMM_FAILED:
#if 0
if (!jdata->abort) {
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
/* point to the lowest rank to cause the problem */
@ -1382,7 +1204,6 @@ static void check_job_complete(orte_job_t *jdata)
jdata->abort = true;
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
}
#endif
break;
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
if (!jdata->abort) {
@ -1507,9 +1328,9 @@ static void check_job_complete(orte_job_t *jdata)
* This can happen if a ctrl-c hits in the "wrong" place
* while launching
*/
CHECK_DAEMONS:
CHECK_DAEMONS:
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */
if (0 == orte_routed.num_routes()) {
/* orteds are done! */
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s orteds complete - exiting",
@ -1566,7 +1387,7 @@ CHECK_DAEMONS:
jdata->map = NULL;
}
CHECK_ALIVE:
CHECK_ALIVE:
/* now check to see if all jobs are done - release this jdata
* object when we find it
*/
@ -1672,7 +1493,7 @@ CHECK_ALIVE:
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
@ -1683,7 +1504,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
@ -1694,7 +1515,6 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1731,7 +1551,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
*/
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
/* remove this proc from the daemon job */
orte_errmgr_hnp_record_dead_process(proc);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
/* check to see if any other nodes are "alive" */
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
return ORTE_ERR_FATAL;
@ -1856,229 +1676,59 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
return NULL;
}
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
orte_job_t *jdat;
orte_proc_t *pdat;
opal_buffer_t *buffer;
orte_daemon_cmd_flag_t command;
int i, rc, num_failed;
opal_pointer_array_t *dead_names;
orte_process_name_t *name_item;
orte_proc_t *proc_item;
if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) {
opal_output(0, "Can't find job object");
return ORTE_ERR_NOT_FOUND;
}
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) &&
ORTE_PROC_STATE_TERMINATED != pdat->state) {
/* Make sure that the epochs match. */
if (proc->epoch != pdat->name.epoch) {
opal_output(1, "The epoch does not match the current epoch. Throwing the request out.");
return ORTE_SUCCESS;
}
dead_names = OBJ_NEW(opal_pointer_array_t);
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
opal_pointer_array_add(dead_names, &(pdat->name));
for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) {
if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) {
continue;
}
opal_pointer_array_add(dead_names, &(proc_item->name));
}
}
if (!orte_orteds_term_ordered) {
/*
* Send a message to the other daemons so they know that a daemon has
* died.
*/
buffer = OBJ_NEW(opal_buffer_t);
command = ORTE_PROCESS_FAILED_NOTIFICATION;
num_failed = opal_pointer_array_get_size(dead_names);
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
} else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
} else {
/* Iterate of the list of dead procs and send them along with
* the rest. The HNP needs this info so it can tell the other
* ORTEDs and they can inform the appropriate applications.
*/
for (i = 0; i < num_failed; i++) {
if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(rc);
OBJ_RELEASE(buffer);
}
}
}
OBJ_RELEASE(dead_names);
orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0);
OBJ_RELEASE(buffer);
}
} else {
orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
}
}
return ORTE_SUCCESS;
}
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
int i;
orte_process_name_t *name_item;
orte_job_t *jdat;
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code)
{
orte_job_t *jdt;
orte_proc_t *pdat;
orte_node_t *node;
int i;
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"HNP %s marking procs as dead",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
/* Iterate over the list of processes */
for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) {
if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) {
opal_output(1, "NULL found in dead process list.");
continue;
}
if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"%s Job data not found.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
return ORTE_ERR_NOT_FOUND;
}
if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) &&
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) &&
ORTE_PROC_STATE_TERMINATED != pdat->state) {
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
"HNP %s marking %s as dead",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdat->name)));
/* Make sure the epochs match, if not it probably means that we
* already reported this failure. */
if (name_item->epoch != pdat->name.epoch) {
continue;
}
orte_util_set_epoch(name_item, name_item->epoch + 1);
/* Remove it from the job array */
opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL);
/* need to record that this one died */
pdat->state = state;
pdat->exit_code = exit_code;
ORTE_UPDATE_EXIT_STATUS(exit_code);
/* remove it from the job array */
opal_pointer_array_set_item(jdat->procs, vpid, NULL);
orte_process_info.num_procs--;
jdat->num_procs--;
/* Check if this is an ORTED */
if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) {
/* Mark the node as down so it won't be used in mapping anymore. */
/* mark the node as down so it won't be used in mapping
* procs to be relaunched
*/
node = pdat->node;
node->state = ORTE_NODE_STATE_DOWN;
node->daemon = NULL;
OBJ_RELEASE(pdat); /* maintain accounting */
/* mark all procs on this node as having terminated */
for (i=0; i < node->procs->size; i++) {
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
continue;
}
OBJ_RELEASE(pdat);
/* Create a new proc object that will keep track of the epoch
* information */
pdat = OBJ_NEW(orte_proc_t);
pdat->name.jobid = jdat->jobid;
pdat->name.vpid = name_item->vpid;
pdat->name.epoch = name_item->epoch + 1;
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
} else {
opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item));
/* Create a new proc object that will keep track of the epoch
* information */
pdat = OBJ_NEW(orte_proc_t);
pdat->name.jobid = jdat->jobid;
pdat->name.vpid = name_item->vpid;
pdat->name.epoch = name_item->epoch + 1;
/* Set the state as terminated so we'll know the process isn't
* actually there. */
pdat->state = ORTE_PROC_STATE_TERMINATED;
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
jdat->num_procs++;
jdat->num_terminated++;
/* get the job data object for this process */
if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
/* It is possible that the process job finishes before the daemons.
* In that case the process state is set to normal termination, and
* the job data has already been cleared. So no need to throw an
* error.
*/
if( ORTE_PROC_STATE_TERMINATED != pdat->state ) {
opal_output(0,
"%s Error: Failed to find job_data for proc %s (%s) on node %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(&pdat->name),
orte_proc_state_to_str(pdat->state),
node->name );
/* major problem */
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
}
check_job_complete(jdat);
continue;
}
if (!orte_orteds_term_ordered) {
/* Need to update the orted routing module. */
orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
if (NULL != fault_cbfunc) {
(*fault_cbfunc)(dead_procs);
pdat->state = ORTE_PROC_STATE_ABORTED;
jdt->num_terminated++;
}
}
return ORTE_SUCCESS;
}
int send_to_local_applications(opal_pointer_array_t *dead_names) {
opal_buffer_t *buf;
int ret = ORTE_SUCCESS;
orte_process_name_t *name_item;
int size, i;
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"%s Sending failure to local applications.",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
buf = OBJ_NEW(opal_buffer_t);
size = opal_pointer_array_get_size(dead_names);
if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
return ret;
}
for (i = 0; i < size; i++) {
if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
return ret;
}
}
}
if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buf);
return ret;
}
OBJ_RELEASE(buf);
return ret;
}

Просмотреть файл

@ -1,8 +1,5 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -60,6 +57,10 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
orte_vpid_t vpid,
orte_proc_state_t state,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
@ -80,11 +81,6 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnp_global_ft_event(int state);
int orte_errmgr_hnp_global_post_startup(void);
int orte_errmgr_hnp_global_pre_shutdown(void);
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc);
/* HNP Versions */
int orte_errmgr_hnp_base_global_init(void);

Просмотреть файл

@ -1,10 +1,7 @@
/*
* Copyright (c) 2009-2011 The Trustees of Indiana University.
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -394,7 +391,6 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *node = NULL;
bool found = false;
int num_removed = 0, num_to_remove;
orte_ns_cmp_bitmask_t mask;
if( NULL == current_global_jobdata ) {
return ORTE_SUCCESS;
@ -414,8 +410,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
item = opal_list_get_next(item) ) {
wp_item = (errmgr_autor_wp_item_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
if( wp_item->name.vpid == proc->name.vpid &&
wp_item->name.jobid == proc->name.jobid ) {
found = true;
break;
}
@ -522,7 +518,6 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
wp_item->name.jobid = proc->jobid;
wp_item->name.vpid = proc->vpid;
wp_item->name.epoch = proc->epoch;
wp_item->state = state;
opal_list_append(procs_pending_recovery, &(wp_item->super));
@ -617,7 +612,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
/*
* Record the dead daemon
*/
orte_errmgr_hnp_record_dead_process(proc);
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
return;
}
@ -626,7 +621,6 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_MIN;
wp->state = 0;
}
@ -635,7 +629,6 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
{
wp->name.jobid = ORTE_JOBID_INVALID;
wp->name.vpid = ORTE_VPID_INVALID;
wp->name.epoch = ORTE_EPOCH_INVALID;
wp->state = 0;
}

Просмотреть файл

@ -2,9 +2,6 @@
* Copyright (c) 2009-2010 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -750,7 +747,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -807,7 +803,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}
@ -855,7 +850,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
close_iof_stdin = true;
iof_name.jobid = proc->name.jobid;
iof_name.vpid = proc->name.vpid;
iof_name.epoch = proc->name.epoch;
}
}
}

12
orte/mca/errmgr/hnpresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

40
orte/mca/errmgr/hnpresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,40 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
sources = \
errmgr_hnpresil.h \
errmgr_hnpresil_component.c \
errmgr_hnpresil.c \
errmgr_hnpresil_autor.c \
errmgr_hnpresil_crmig.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_hnpresil_DSO
component_noinst =
component_install = mca_errmgr_hnpresil.la
else
component_noinst = libmca_errmgr_hnpresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_hnpresil_la_SOURCES = $(sources)
mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_hnpresil_la_SOURCES =$(sources)
libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version

2112
orte/mca/errmgr/hnpresil/errmgr_hnpresil.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

137
orte/mca/errmgr/hnpresil/errmgr_hnpresil.h Обычный файл
Просмотреть файл

@ -0,0 +1,137 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
*/
#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H
#define MCA_ERRMGR_HNPRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
/**
 * Component-level state for the "hnpresil" errmgr component.
 *
 * A single instance (mca_errmgr_hnpresil_component) is defined in the
 * component source file. The option fields below are filled in from MCA
 * parameters by the component open function.
 */
struct orte_errmgr_hnpresil_component_t {
orte_errmgr_base_component_t super; /** Base Errmgr component */
/* NOTE(review): not written by the component open/close code; presumably
 * toggled by the module implementation - confirm against errmgr_hnpresil.c.
 */
bool ignore_current_update;
/* The component open function clears this flag.
 * NOTE(review): confirm it is also cleared in builds without
 * OPAL_ENABLE_FT_CR.
 */
bool term_in_progress;
#if OPAL_ENABLE_FT_CR
/* State of the Recovery: both flags are set to false when the component
 * is opened.
 */
bool crmig_in_progress;
bool autor_in_progress;
/* CRMig Options */
/* "crmig_enable" MCA parameter: enable process migration (default 0/off) */
bool crmig_enabled;
/* "crmig_timing" MCA parameter: enable the process migration timer (default 0) */
bool crmig_timing_enabled;
/* AutoR Options */
/* "autor_enable" MCA parameter: enable automatic recovery (default 0/off) */
bool autor_enabled;
/* "autor_timing" MCA parameter: enable the automatic recovery timer (default 0) */
bool autor_timing_enabled;
/* "autor_recovery_delay" MCA parameter: seconds to wait before starting to
 * recover the job after a failure (default 1)
 */
int autor_recovery_delay;
/* "autor_skip_oldnode" MCA parameter: skip the old node of a failed proc
 * even if it is still available (default 1/enabled)
 */
bool autor_skip_oldnode;
#endif
};
typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t;
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component;
int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority);
void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
/***************************
* Module functions: Global
***************************/
int orte_errmgr_hnpresil_global_module_init(void);
int orte_errmgr_hnpresil_global_module_finalize(void);
int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_global_ft_event(int state);
int orte_errmgr_hnpresil_global_post_startup(void);
int orte_errmgr_hnpresil_global_pre_shutdown(void);
int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc);
/* hnpresil Versions */
int orte_errmgr_hnpresil_base_global_init(void);
int orte_errmgr_hnpresil_base_global_finalize(void);
int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_base_global_ft_event(int state);
#if OPAL_ENABLE_FT_CR
/* CRMig Versions */
int orte_errmgr_hnpresil_crmig_global_module_init(void);
int orte_errmgr_hnpresil_crmig_global_module_finalize(void);
int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list,
opal_list_t *node_list,
opal_list_t *suggested_map);
int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_crmig_global_ft_event(int state);
/* AutoR Versions */
int orte_errmgr_hnpresil_autor_global_module_init(void);
int orte_errmgr_hnpresil_autor_global_module_finalize(void);
int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc,
orte_node_t *oldnode,
opal_list_t *node_list);
int orte_errmgr_hnpresil_autor_global_ft_event(int state);
#endif
END_C_DECLS
#endif /* MCA_ERRMGR_HNPRESIL_EXPORT_H */

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,201 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnpresil.h"
/*
* Public string for version number
*/
const char *orte_errmgr_hnpresil_component_version_string =
"ORTE ERRMGR hnpresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int orte_errmgr_hnpresil_open(void);
static int orte_errmgr_hnpresil_close(void);
/*
 * Instantiate the public struct with all of our public information
 * and pointer to our public functions in it.
 *
 * Only the embedded base component ("super") is initialized explicitly.
 * The remaining members of orte_errmgr_hnpresil_component_t are not listed
 * and are therefore zero-initialized (all flags false) by the C rules for
 * objects with static storage duration.
 */
orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
 * meta information about the hnpresil component
 */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnpresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
orte_errmgr_hnpresil_open,
orte_errmgr_hnpresil_close,
orte_errmgr_hnpresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level (default; the "verbose" MCA parameter registered in
 * the open function can override it)
 */
0,
/* opal_output handler: -1 until the open function assigns a stream */
-1,
/* Default priority (the "priority" MCA parameter registered in the open
 * function can override it)
 */
0
}
};
/**
 * Component open function for the "hnpresil" errmgr component.
 *
 * Registers the component's MCA parameters (priority, verbosity and, when
 * built with OPAL_ENABLE_FT_CR, the CRMig / AutoR options), selects the
 * output stream used for diagnostics, prints the resulting configuration
 * at high verbosity, and resets the "in progress" state flags.
 *
 * @return ORTE_SUCCESS (this function has no failure path).
 */
static int orte_errmgr_hnpresil_open(void)
{
#if OPAL_ENABLE_FT_CR
    /* Scratch target for integer MCA parameters that are stored as bools */
    int val;
#endif

    /*
     * Selection priority and verbosity: the defaults come from the static
     * component initializer and may be overridden through MCA parameters.
     */
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR hnpresil component",
                           false, false,
                           mca_errmgr_hnpresil_component.super.priority,
                           &mca_errmgr_hnpresil_component.super.priority);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR hnpresil component",
                           false, false,
                           mca_errmgr_hnpresil_component.super.verbose,
                           &mca_errmgr_hnpresil_component.super.verbose);

    /* If there is a custom verbose level for this component then use it,
     * otherwise take our parent's level and output channel
     */
    if ( 0 != mca_errmgr_hnpresil_component.super.verbose) {
        mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle,
                                  mca_errmgr_hnpresil_component.super.verbose);
    } else {
        mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output;
    }

#if OPAL_ENABLE_FT_CR
    /****************************
     * CRMig (C/R Process Migration) MCA Options
     ****************************/
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "crmig_timing",
                           "Enable Process Migration timer",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "crmig_enable",
                           "Enable Process Migration (Default: 0/off)",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val);

    /****************************
     * AutoR (Automatic Recovery) MCA Options
     ****************************/
    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_timing",
                           "Enable Automatic Recovery timer",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_enable",
                           "Enable Automatic Recovery (Default: 0/off)",
                           false, false,
                           0, &val);
    mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_recovery_delay",
                           "Number of seconds to wait before starting to recover the job after a failure"
                           " [Default: 1 sec]",
                           false, false,
                           1, &val);
    mca_errmgr_hnpresil_component.autor_recovery_delay = val;

    mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
                           "autor_skip_oldnode",
                           "Skip the old node from failed proc, even if it is still available"
                           " [Default: Enabled]",
                           false, false,
                           1, &val);
    mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
#endif /* OPAL_ENABLE_FT_CR */

    /*
     * Debug Output
     */
    opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open()");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: priority = %d",
                        mca_errmgr_hnpresil_component.super.priority);
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: verbosity = %d",
                        mca_errmgr_hnpresil_component.super.verbose);
#if OPAL_ENABLE_FT_CR
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: --- CR Migration Options ---");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: Process Migration = %s",
                        (mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: timing = %s",
                        (mca_errmgr_hnpresil_component.crmig_timing_enabled ? "Enabled" : "Disabled"));

    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: --- Auto. Recovery Options ---");
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: Auto. Recover = %s",
                        (mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: timing = %s",
                        (mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
                        "errmgr:hnpresil: open: recover_delay = %d",
                        mca_errmgr_hnpresil_component.autor_recovery_delay);
#endif /* OPAL_ENABLE_FT_CR */

    /*
     * Reset the run-time state. term_in_progress is declared
     * unconditionally in the component struct, so it must be reset in
     * every build, not only when C/R fault tolerance is compiled in.
     */
    mca_errmgr_hnpresil_component.term_in_progress = false;
#if OPAL_ENABLE_FT_CR
    mca_errmgr_hnpresil_component.crmig_in_progress = false;
    mca_errmgr_hnpresil_component.autor_in_progress = false;
#endif /* OPAL_ENABLE_FT_CR */

    return ORTE_SUCCESS;
}
static int orte_errmgr_hnpresil_close(void)
{
opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
"errmgr:hnp: close()");
return ORTE_SUCCESS;
}

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,71 @@
-*- text -*-
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for the ORTE Errmgr HNP module.
#
[errmgr-hnp:unknown-job-error]
An error has occurred in an unknown job. This generally should not happen
except due to an internal ORTE error.
Job state: %s
This information should probably be reported to the OMPI developers.
#
[errmgr-hnp:daemon-died]
The system has lost communication with the following daemon:
Daemon: %s
Node: %s
The reason for the lost communication channel is unknown. Possible
reasons include failure of the daemon itself, failure of the
connecting fabric/switch, and loss of the host node. Please
check with your system administrator to try and determine the
source of the problem.
Your job is being terminated as a result.
#
[errmgr-hnp:cannot-relocate]
The system is unable to relocate the specified process:
Process: %s
because the application for that process could not be found. This
appears to be a system error. Please report it to the ORTE
developers.
[autor_recovering_job]
Notice: The processes listed below failed unexpectedly.
Using the last checkpoint to recover the job.
Please standby.
%s
[autor_recovery_complete]
Notice: The job has been successfully recovered from the
last checkpoint.
[autor_failed_to_recover_proc]
Error: The process below has failed. There is no checkpoint available for
this job, so we are terminating the application since automatic
recovery cannot occur.
Internal Name: %s
MCW Rank: %d
[crmig_migrating_job]
Notice: A migration of this job has been requested.
The processes below will be migrated.
Please standby.
%s
[crmig_migrated_job]
Notice: The processes have been successfully migrated to/from the specified
machines.
[crmig_no_migrating_procs]
Warning: Could not find any processes to migrate on the nodes specified.
You provided the following:
Nodes: %s
Procs: %s

Просмотреть файл

@ -3,9 +3,6 @@
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2004-2011 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -32,11 +29,9 @@
#include "orte/util/proc_info.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/odls/base/base.h"
#include "orte/mca/plm/plm_types.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/sensor/sensor.h"
@ -58,9 +53,8 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
static void update_local_children(orte_odls_job_t *jobdat,
orte_job_state_t jobstate,
orte_proc_state_t state);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
static int record_dead_process(orte_process_name_t *proc);
static int send_to_local_applications(opal_pointer_array_t *dead_names);
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
/*
* Module functions: Global
@ -85,11 +79,7 @@ static int suggest_map_targets(orte_proc_t *proc,
static int ft_event(int state);
static int post_startup(void);
static int pre_shutdown(void);
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
/******************
* ORTED module
@ -105,11 +95,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
suggest_map_targets,
ft_event,
orte_errmgr_base_register_migration_warning,
post_startup,
pre_shutdown,
mark_processes_as_dead,
orte_errmgr_base_set_fault_callback, /* Set callback function */
failure_notification
NULL, /* post_startup */
NULL, /* pre_shutdown */
NULL, /* mark_processes_as_dead */
NULL, /* set_fault_callback */
NULL /* failure_notification */
};
/************************
@ -140,7 +130,6 @@ static int update_state(orte_jobid_t job,
int rc=ORTE_SUCCESS;
orte_vpid_t null=ORTE_VPID_INVALID;
orte_app_context_t *app;
orte_ns_cmp_bitmask_t mask;
/*
* if orte is trying to shutdown, just let it
@ -149,14 +138,6 @@ static int update_state(orte_jobid_t job,
return ORTE_SUCCESS;
}
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
"errmgr:orted:update_state() %s) "
"------- %s state updated for process %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
((NULL == proc) ? "App. Process" :
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
/* if this is a heartbeat failure, let the HNP handle it */
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
@ -221,10 +202,10 @@ static int update_state(orte_jobid_t job,
/* update all procs in job */
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
/* order all local procs for this job to be killed */
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
case ORTE_JOB_STATE_COMM_FAILED:
/* kill all local procs */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* tell the caller we can't recover */
return ORTE_ERR_UNRECOVERABLE;
break;
@ -261,16 +242,15 @@ static int update_state(orte_jobid_t job,
* lifeline
*/
if (ORTE_PROC_STATE_COMM_FAILED == state) {
mask = ORTE_NS_CMP_ALL;
/* if it is our own connection, ignore it */
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
return ORTE_SUCCESS;
}
/* see if this was a lifeline */
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
/* kill our children */
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
/* terminate - our routed children will see
* us leave and automatically die
*/
@ -281,14 +261,10 @@ static int update_state(orte_jobid_t job,
/* was it a daemon that failed? */
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
/* if all my routes are gone, then terminate ourselves */
if (0 == orte_routed.num_routes() &&
0 == opal_list_get_size(&orte_local_children)) {
if (0 == orte_routed.num_routes()) {
orte_quit();
}
}
record_dead_process(proc);
/* if not, then indicate we can continue */
return ORTE_SUCCESS;
}
@ -330,15 +306,15 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
/* Decrement the number of local procs */
jobdat->num_local_procs--;
/* kill this proc */
killprocs(proc->jobid, proc->vpid, proc->epoch);
killprocs(proc->jobid, proc->vpid);
}
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
@ -375,8 +351,8 @@ static int update_state(orte_jobid_t job,
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
/* see if this child has reached its local restart limit */
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
@ -404,7 +380,7 @@ static int update_state(orte_jobid_t job,
}
}
REPORT_ABORT:
REPORT_ABORT:
/* if the job hasn't completed and the state is abnormally
* terminated, then we need to alert the HNP right away
*/
@ -427,8 +403,8 @@ REPORT_ABORT:
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
child->exit_code = exit_code;
@ -465,14 +441,14 @@ REPORT_ABORT:
return rc;
}
REPORT_STATE:
REPORT_STATE:
/* find this proc in the local children so we can update its state */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
child = (orte_odls_child_t*)item;
mask = ORTE_NS_CMP_ALL;
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
if (child->name->jobid == proc->jobid &&
child->name->vpid == proc->vpid) {
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
child->state = state;
if (0 < pid) {
@ -509,7 +485,7 @@ REPORT_ABORT:
ORTE_ERROR_LOG(rc);
goto FINAL_CLEANUP;
}
/* pack all the local child vpids and epochs */
/* pack all the local child vpids */
for (item = opal_list_get_first(&orte_local_children);
item != opal_list_get_end(&orte_local_children);
item = opal_list_get_next(item)) {
@ -578,7 +554,7 @@ REPORT_ABORT:
ORTE_ERROR_LOG(rc);
}
FINAL_CLEANUP:
FINAL_CLEANUP:
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
"%s errmgr:orted reporting all procs in %s terminated",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@ -613,7 +589,6 @@ FINAL_CLEANUP:
rc = ORTE_SUCCESS;
}
OBJ_DESTRUCT(&alert);
/* indicate that the job is complete */
return rc;
}
@ -639,131 +614,6 @@ int ft_event(int state)
return ORTE_SUCCESS;
}
/*
 * Post-startup step of this error manager module.
 * There is nothing to do here, so it always reports success.
 */
int post_startup(void)
{
    return ORTE_SUCCESS;
}
/*
 * Pre-shutdown step of this error manager module.
 * There is nothing to do here, so it always reports success.
 */
int pre_shutdown(void)
{
    return ORTE_SUCCESS;
}
/*
 * Walk the given array of process names and, for every usable entry,
 * mark the process as terminated, bump its epoch, drop it from the list
 * of local children (if present) and delete its route. Afterwards the
 * routing tree is updated and the registered fault callback (if any) is
 * invoked with the same array.
 *
 * @param dead_procs  pointer array of orte_process_name_t* entries;
 *                    NULL slots are reported and skipped
 * @return            always ORTE_SUCCESS
 */
int mark_processes_as_dead(opal_pointer_array_t *dead_procs)
{
    int idx;
    orte_process_name_t *dead;
    opal_list_item_t *cur;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "ORTED %s marking procs as dead",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (idx = 0; idx < opal_pointer_array_get_size(dead_procs); idx++) {
        dead = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, idx);
        if (NULL == dead) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "ORTED %s marking %s as dead",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(dead)));

        /* The reported epoch is older than the one we have on record:
         * nothing to do for this entry (presumably already handled). */
        if (dead->epoch < orte_util_lookup_epoch(dead)) {
            continue;
        }

        /* Record the termination, then move the process to its next epoch */
        orte_util_set_proc_state(dead, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(dead, dead->epoch + 1);

        /* If the process is one of our local children, unlink it.
         * Only jobid and vpid are compared; the epoch is ignored.
         * NOTE(review): the unlinked child object is not released here -
         * confirm who owns it. */
        cur = opal_list_get_first(&orte_local_children);
        while (cur != opal_list_get_end(&orte_local_children)) {
            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                                                            ((orte_odls_child_t *) cur)->name,
                                                            dead)) {
                opal_list_remove_item(&orte_local_children, cur);
                break;
            }
            cur = opal_list_get_next(cur);
        }

        /* Drop the route to the dead process from the routing layer */
        orte_routed.delete_route(dead);
    }

    /* Let the routing module rebuild its tree for our job */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
/*
 * Handle a failure notification message.
 *
 * Unpacks the number of failed processes and then one process name per
 * failure from the buffer. Names whose epoch is older than the locally
 * recorded epoch are dropped. The remaining names are handed to
 * orte_errmgr.mark_processes_as_dead() and then forwarded with
 * send_to_local_applications().
 *
 * All names allocated here, and the array holding them, are released
 * before returning, on both success and error paths.
 * NOTE(review): this assumes neither callee keeps a reference to the
 * array or to the names after it returns - confirm.
 *
 * @param sender  name of the process that sent the notification
 *                (only used for debug output)
 * @param buffer  message payload: failure count followed by the names
 * @return        ORTE_SUCCESS, or the error returned by opal_dss.unpack()
 *                or send_to_local_applications()
 */
int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer)
{
    opal_pointer_array_t *dead_names;
    orte_process_name_t *name_item;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS;
    int num_failed = 0;
    int32_t i;

    dead_names = OBJ_NEW(opal_pointer_array_t);

    /* Get the number of failed procs.
     * NOTE(review): the count is an int but is unpacked as ORTE_VPID;
     * kept as-is to stay wire-compatible with the packing side. */
    n = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < num_failed; i++) {
        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        /* NOTE(review): the malloc() result is not checked, as in the original. */
        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            /* not yet stored in dead_names, so free it here */
            free(name_item);
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(name_item),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check.
         * Compare the epoch carried by the unpacked name (not an
         * uninitialized local) against the epoch we have on record.
         */
        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
            /* dropped entries are not tracked in dead_names: free now */
            free(name_item);
            continue;
        }

        opal_pointer_array_add(dead_names, name_item);
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    orte_errmgr.mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure. */
    ret = send_to_local_applications(dead_names);

cleanup:
    /* Free every name we stored; empty slots yield NULL and free(NULL)
     * is a no-op. Then release the array itself. */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        free(opal_pointer_array_get_item(dead_names, i));
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
/*****************
* Local Functions
*****************/
@ -833,8 +683,8 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
return ORTE_SUCCESS;
}
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
{
int rc;
opal_list_item_t *item, *next;
orte_odls_child_t *child;
@ -879,7 +729,7 @@ static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
}
return ORTE_SUCCESS;
}
}
static bool all_children_registered(orte_jobid_t job)
{
@ -944,7 +794,6 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
ORTE_ERROR_LOG(rc);
return rc;
}
/* Pack the child's epoch. */
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
ORTE_ERROR_LOG(rc);
return rc;
@ -1011,7 +860,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
}
}
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
{
opal_pointer_array_t cmd;
orte_proc_t proc;
@ -1022,7 +871,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
orte_sensor.stop(job);
}
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
ORTE_ERROR_LOG(rc);
}
@ -1033,7 +882,6 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_CONSTRUCT(&proc, orte_proc_t);
proc.name.jobid = job;
proc.name.vpid = vpid;
proc.name.epoch = epoch;
opal_pointer_array_add(&cmd, &proc);
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
ORTE_ERROR_LOG(rc);
@ -1041,85 +889,3 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
OBJ_DESTRUCT(&cmd);
OBJ_DESTRUCT(&proc);
}
/*
 * Record the failure of a single process: mark it as dead locally and
 * notify the HNP with an ORTE_PROCESS_FAILED_NOTIFICATION message that
 * carries a failure count of one followed by the process name.
 *
 * Nothing is done when orte_odls_base_default_check_finished() reports
 * true for the process.
 *
 * The notification is only sent when it was packed completely, and a
 * failed send is logged and reported to the caller.
 *
 * @param proc  name of the failed process (not owned by this function)
 * @return      ORTE_SUCCESS, or the error from packing or sending
 */
static int record_dead_process(orte_process_name_t *proc)
{
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command = ORTE_PROCESS_FAILED_NOTIFICATION;
    int num_failed = 1;
    int rc = ORTE_SUCCESS;

    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    dead_name = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_add(dead_name, proc);

    /* Mark the process as dead */
    mark_processes_as_dead(dead_name);

    /* Send a message to the HNP */
    buffer = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
    } else if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
        /* a negative return from send_buffer is treated as an error code */
        ORTE_ERROR_LOG(rc);
    } else {
        /* a non-negative return from send_buffer is not an error code */
        rc = ORTE_SUCCESS;
    }

    OBJ_RELEASE(buffer);
    OBJ_RELEASE(dead_name);

    return rc;
}
/*
 * Deliver the list of dead process names to all local application
 * processes on the ORTE_RML_TAG_EPOCH_CHANGE tag.
 *
 * The message consists of the number of names followed by the names.
 * The count is the number of non-NULL entries in the array, not the
 * array's size: the array may contain empty slots, and only non-NULL
 * entries are packed, so using the size would announce more names than
 * the message actually contains.
 *
 * @param dead_names  pointer array of orte_process_name_t* entries;
 *                    NULL slots are skipped
 * @return            ORTE_SUCCESS, or the error from packing or delivery
 */
int send_to_local_applications(opal_pointer_array_t *dead_names)
{
    opal_buffer_t *buf;
    orte_process_name_t *name_item;
    int ret;
    int size, count, i;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Count the entries that will really be packed */
    size = opal_pointer_array_get_size(dead_names);
    count = 0;
    for (i = 0; i < size; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            count++;
        }
    }

    buf = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &count, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < size; i++) {
        name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i);
        if (NULL == name_item) {
            continue;
        }
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
    }

cleanup:
    /* the buffer is released on every path */
    OBJ_RELEASE(buf);

    return ret;
}

12
orte/mca/errmgr/ortedresil/.windows Обычный файл
Просмотреть файл

@ -0,0 +1,12 @@
#
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# Specific to this module
mca_link_libraries=libopen-rte

38
orte/mca/errmgr/ortedresil/Makefile.am Обычный файл
Просмотреть файл

@ -0,0 +1,38 @@
#
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
EXTRA_DIST = .windows
dist_pkgdata_DATA = help-orte-errmgr-orted.txt
sources = \
errmgr_ortedresil.h \
errmgr_ortedresil_component.c \
errmgr_ortedresil.c
# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).
if MCA_BUILD_orte_errmgr_ortedresil_DSO
component_noinst =
component_install = mca_errmgr_ortedresil.la
else
component_noinst = libmca_errmgr_ortedresil.la
component_install =
endif
mcacomponentdir = $(pkglibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_errmgr_ortedresil_la_SOURCES = $(sources)
mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version
noinst_LTLIBRARIES = $(component_noinst)
libmca_errmgr_ortedresil_la_SOURCES =$(sources)
libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version

1126
orte/mca/errmgr/ortedresil/errmgr_ortedresil.c Обычный файл

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,35 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Public declarations for the ortedresil errmgr component.
*/
#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H
#include "orte_config.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* Local Component structures
*/
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component;
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module;
END_C_DECLS
#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */

Просмотреть файл

@ -0,0 +1,84 @@
/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_ortedresil.h"
/*
* Public version string for this component; the text is the component
* description followed by ORTE_VERSION.
*/
const char *orte_errmgr_ortedresil_component_version_string =
"ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION;
/*
* Local functionality
*/
static int errmgr_ortedresil_open(void);
static int errmgr_ortedresil_close(void);
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority);
/*
* Instantiate the public struct with all of our public information
* and pointer to our public functions in it
*/
orte_errmgr_base_component_t mca_errmgr_ortedresil_component =
{
/* Handle the general mca_component_t struct containing
* meta information about the component itself
*/
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"ortedresil",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close and query functions */
errmgr_ortedresil_open,
errmgr_ortedresil_close,
errmgr_ortedresil_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/*
 * Component open function: nothing to set up, always succeeds.
 */
static int
errmgr_ortedresil_open(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component close function: nothing to tear down, always succeeds.
 */
static int
errmgr_ortedresil_close(void)
{
    return ORTE_SUCCESS;
}
/*
 * Component query function.
 *
 * When the calling process is a daemon (ORTE_PROC_IS_DAEMON), offers
 * the ortedresil module with priority 0. Otherwise no module is
 * offered: *module is set to NULL, *priority to -1, and ORTE_ERROR is
 * returned.
 *
 * @param module    out: the module offered, or NULL
 * @param priority  out: the priority of the offered module, or -1
 * @return          ORTE_SUCCESS when a module is offered, else ORTE_ERROR
 */
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority)
{
    /* Not a daemon: decline */
    if (!(ORTE_PROC_IS_DAEMON)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *) &orte_errmgr_ortedresil_module;

    return ORTE_SUCCESS;
}

Просмотреть файл

@ -0,0 +1,14 @@
-*- text -*-
#
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
#
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English general help file for ORTE RecoS IGNORE framework.
#

Просмотреть файл

@ -240,11 +240,13 @@ int orte_ess_base_app_setup(void)
}
/* Execute the post-startup errmgr code */
if (NULL != orte_errmgr.post_startup) {
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
}
/* if we are an ORTE app - and not an MPI app - then
* we need to barrier here. MPI_Init has its own barrier,
@ -278,7 +280,9 @@ error:
int orte_ess_base_app_finalize(void)
{
if (NULL != orte_errmgr.pre_shutdown) {
orte_errmgr.pre_shutdown();
}
orte_notifier_base_close();

Просмотреть файл

@ -505,11 +505,13 @@ int orte_ess_base_orted_setup(char **hosts)
orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
/* Execute the post-startup errmgr code */
if (NULL != orte_errmgr.post_startup) {
if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr.post_startup";
goto error;
}
}
return ORTE_SUCCESS;
@ -523,7 +525,9 @@ int orte_ess_base_orted_setup(char **hosts)
int orte_ess_base_orted_finalize(void)
{
if (NULL != orte_errmgr.pre_shutdown) {
orte_errmgr.pre_shutdown();
}
/* stop the local sensors */
orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);