Move the errmgr code back. This shouldn't cause the svn problems that I
apparently caused last time. Sorry about that. This one will just be a big changelog. This commit was SVN r25016.
Этот коммит содержится в:
родитель
09274cd047
Коммит
67feeb6aca
@ -1,8 +1,12 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -22,11 +26,15 @@
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
@ -48,9 +56,22 @@ static int update_state(orte_jobid_t job,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data);
|
||||
|
||||
/******************
|
||||
* APP module
|
||||
******************/
|
||||
@ -65,11 +86,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL, /* post_startup */
|
||||
NULL, /* pre_shutdown */
|
||||
NULL, /* mark_processes_as_dead */
|
||||
NULL, /* set_fault_callback */
|
||||
NULL /* failure_notification */
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
NULL,
|
||||
orte_errmgr_base_set_fault_callback,
|
||||
NULL
|
||||
};
|
||||
|
||||
/************************
|
||||
@ -92,6 +113,8 @@ static int update_state(orte_jobid_t job,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
@ -109,9 +132,9 @@ static int update_state(orte_jobid_t job,
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->vpid &&
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
@ -125,6 +148,95 @@ static int update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int post_startup(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE,
|
||||
ORTE_RML_PERSISTENT,
|
||||
epoch_change_recv,
|
||||
NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int pre_shutdown(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata) {
|
||||
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
|
||||
}
|
||||
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data) {
|
||||
orte_message_event_t *mev = (orte_message_event_t *) data;
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
orte_process_name_t *proc;
|
||||
int n = 1, ret, num_dead, i;
|
||||
opal_pointer_array_t *procs;
|
||||
|
||||
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Received epoch change notification",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
procs = OBJ_NEW(opal_pointer_array_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
|
||||
for (i = 0; i < num_dead; i++) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
proc[i].epoch++;
|
||||
orte_util_set_epoch(&proc[i], proc[i].epoch);
|
||||
|
||||
opal_pointer_array_add(procs, &proc[i]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Epoch for %s updated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc[i])));
|
||||
}
|
||||
|
||||
if (NULL != fault_cbfunc && 0 < num_dead) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
(*fault_cbfunc)(procs);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:app Calling fault callback failed!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
free(proc);
|
||||
OBJ_RELEASE(procs);
|
||||
}
|
||||
|
||||
static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
|
@ -13,8 +13,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_APP_EXPORT_H
|
||||
#define MCA_ERRMGR_APP_EXPORT_H
|
||||
#ifndef MCA_ERRMGR_app_EXPORT_H
|
||||
#define MCA_ERRMGR_app_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -32,4 +32,4 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_app_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_APP_EXPORT_H */
|
||||
#endif /* MCA_ERRMGR_app_EXPORT_H */
|
||||
|
@ -59,7 +59,7 @@ orte_errmgr_base_component_t mca_errmgr_app_component =
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
10
|
||||
0
|
||||
};
|
||||
|
||||
static int errmgr_app_open(void)
|
||||
@ -78,7 +78,7 @@ static int errmgr_app_component_query(mca_base_module_t **module, int *priority)
|
||||
/* keep our priority low so that other modules are higher
|
||||
* and will run before us
|
||||
*/
|
||||
*priority = 10;
|
||||
*priority = 0;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_app_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,12 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte
|
@ -1,36 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
sources = \
|
||||
errmgr_appresil.h \
|
||||
errmgr_appresil_component.c \
|
||||
errmgr_appresil.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_errmgr_appresil_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_appresil.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_appresil.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_appresil_la_SOURCES = $(sources)
|
||||
mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_appresil_la_SOURCES =$(sources)
|
||||
libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version
|
@ -1,285 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2004-2006 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
#ifdef HAVE_STRING_H
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#include "opal/util/output.h"
|
||||
#include "opal/dss/dss.h"
|
||||
#include "opal/mca/event/event.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_wait.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/rml/rml_types.h"
|
||||
#include "orte/mca/odls/odls_types.h"
|
||||
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_appresil.h"
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
*/
|
||||
static int init(void);
|
||||
static int finalize(void);
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs,
|
||||
orte_std_cntr_t num_procs);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata);
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data);
|
||||
|
||||
/******************
|
||||
* APPRESIL module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_appresil_module = {
|
||||
init,
|
||||
finalize,
|
||||
orte_errmgr_base_log,
|
||||
orte_errmgr_base_abort,
|
||||
orte_errmgr_appresil_abort_peers,
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
NULL,
|
||||
orte_errmgr_base_set_fault_callback,
|
||||
NULL
|
||||
};
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
************************/
|
||||
static int init(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int finalize(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:appresil: job %s reported state %s"
|
||||
" for proc %s state %s exit_code %d",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(job),
|
||||
orte_job_state_to_str(jobstate),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||
orte_proc_state_to_str(state), exit_code));
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
*/
|
||||
if (orte_finalizing) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
/* if it is our own connection, ignore it */
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* see is this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
}
|
||||
}
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int post_startup(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE,
|
||||
ORTE_RML_PERSISTENT,
|
||||
epoch_change_recv,
|
||||
NULL);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int pre_shutdown(void) {
|
||||
int ret = ORTE_SUCCESS;
|
||||
|
||||
ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON,
|
||||
ORTE_RML_TAG_EPOCH_CHANGE);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void epoch_change_recv(int status,
|
||||
orte_process_name_t *sender,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void *cbdata) {
|
||||
|
||||
ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change);
|
||||
}
|
||||
|
||||
void epoch_change(int fd,
|
||||
short event,
|
||||
void *data) {
|
||||
orte_message_event_t *mev = (orte_message_event_t *) data;
|
||||
opal_buffer_t *buffer = mev->buffer;
|
||||
orte_process_name_t *proc;
|
||||
int n = 1, ret, num_dead, i;
|
||||
opal_pointer_array_t *procs;
|
||||
|
||||
if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) {
|
||||
return;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:appresil Received epoch change notification",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
procs = OBJ_NEW(opal_pointer_array_t);
|
||||
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead);
|
||||
for (i = 0; i < num_dead; i++) {
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
return;
|
||||
}
|
||||
proc[i].epoch++;
|
||||
orte_util_set_epoch(&proc[i], proc[i].epoch);
|
||||
|
||||
opal_pointer_array_add(procs, &proc[i]);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:appresil Epoch for %s updated",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&proc[i])));
|
||||
}
|
||||
|
||||
if (NULL != fault_cbfunc && 0 < num_dead) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:appresil Calling fault callback",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
(*fault_cbfunc)(procs);
|
||||
} else {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:appresil Calling fault callback failed!",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
}
|
||||
|
||||
free(proc);
|
||||
OBJ_RELEASE(procs);
|
||||
}
|
||||
|
||||
/*
 * Ask the HNP to abort a set of peer processes.
 *
 * Builds a message containing the ORTE_DAEMON_ABORT_PROCS_CALLED command,
 * the number of processes and then each process name, and sends it to
 * ORTE_PROC_MY_HNP on ORTE_RML_TAG_DAEMON.
 *
 * @param procs      array of process names to abort
 * @param num_procs  number of entries in `procs`
 * @return ORTE_SUCCESS, or the error code of the first pack/send that failed
 */
static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_PROCS_CALLED;
    opal_buffer_t msg;
    orte_std_cntr_t idx;
    int rc;
    int result = ORTE_SUCCESS;

    OBJ_CONSTRUCT(&msg, opal_buffer_t);

    /* command flag first */
    rc = opal_dss.pack(&msg, &cmd, 1, ORTE_DAEMON_CMD);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
        goto cleanup;
    }

    /* then the number of processes */
    rc = opal_dss.pack(&msg, &num_procs, 1, ORTE_STD_CNTR);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
        goto cleanup;
    }

    /* then the names themselves, one at a time */
    for (idx = 0; idx < num_procs; ++idx) {
        rc = opal_dss.pack(&msg, &procs[idx], 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            result = rc;
            goto cleanup;
        }
    }

    /* hand the request to the HNP; only a negative return is an error */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &msg, ORTE_RML_TAG_DAEMON, 0);
    if (0 > rc) {
        ORTE_ERROR_LOG(rc);
        result = rc;
        goto cleanup;
    }

 cleanup:
    OBJ_DESTRUCT(&msg);

    return result;
}
|
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H
|
||||
#define MCA_ERRMGR_APPRESIL_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */
|
@ -1,89 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_appresil.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_appresil_component_version_string =
|
||||
"ORTE ERRMGR appresil MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_appresil_open(void);
|
||||
static int errmgr_appresil_close(void);
|
||||
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_appresil_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itapp
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"appresil",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
errmgr_appresil_open,
|
||||
errmgr_appresil_close,
|
||||
errmgr_appresil_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
0
|
||||
};
|
||||
|
||||
static int errmgr_appresil_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int errmgr_appresil_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component selection query.
 *
 * When ORTE_PROC_IS_APP holds, offers orte_errmgr_appresil_module at
 * priority 0 and returns ORTE_SUCCESS.  Otherwise sets *priority to -1 and
 * *module to NULL and returns ORTE_ERROR.
 *
 * @param module    out: the module to use, or NULL
 * @param priority  out: selection priority
 */
static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority)
{
    if (!(ORTE_PROC_IS_APP)) {
        /* this component only serves application processes */
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* keep our priority low so that other modules are higher
     * and will run before us
     */
    *priority = 0;
    *module = (mca_base_module_t *)&orte_errmgr_appresil_module;
    return ORTE_SUCCESS;
}
|
@ -1,8 +1,11 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -37,11 +40,14 @@
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/debugger/base/base.h"
|
||||
#include "orte/mca/notifier/notifier.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
|
||||
#include "orte/util/error_strings.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/runtime/orte_locks.h"
|
||||
@ -50,6 +56,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
|
||||
#include "errmgr_hnp.h"
|
||||
|
||||
/**********************
|
||||
@ -76,11 +83,16 @@ static orte_errmgr_base_module_t global_module = {
|
||||
/* FT Event hook */
|
||||
orte_errmgr_hnp_global_ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL, /* post_startup */
|
||||
NULL, /* pre_shutdown */
|
||||
NULL, /* mark_processes_as_dead */
|
||||
NULL, /* set_fault_callback */
|
||||
NULL /* failure_notification */
|
||||
/* Post-startup */
|
||||
orte_errmgr_hnp_global_post_startup,
|
||||
/* Pre-shutdown */
|
||||
orte_errmgr_hnp_global_pre_shutdown,
|
||||
/* Mark as dead */
|
||||
orte_errmgr_hnp_global_mark_processes_as_dead,
|
||||
/* Set the callback */
|
||||
orte_errmgr_base_set_fault_callback,
|
||||
/* Receive failure notification */
|
||||
orte_errmgr_hnp_global_failure_notification
|
||||
};
|
||||
|
||||
|
||||
@ -92,10 +104,11 @@ static void failed_start(orte_job_t *jdata);
|
||||
static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
static void check_job_complete(orte_job_t *jdata);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
|
||||
static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
orte_proc_state_t state, orte_exit_code_t exit_code);
|
||||
static orte_odls_child_t* proc_is_local(orte_process_name_t *proc);
|
||||
static int send_to_local_applications(opal_pointer_array_t *dead_names);
|
||||
|
||||
/************************
|
||||
* API Definitions
|
||||
@ -391,7 +404,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
orte_odls_child_t *child;
|
||||
int rc;
|
||||
orte_app_context_t *app;
|
||||
orte_proc_t *pdat;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp: job %s reported state %s"
|
||||
@ -524,7 +536,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED,
|
||||
exit_code);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
@ -536,7 +548,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
break;
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
@ -548,7 +560,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
break;
|
||||
case ORTE_JOB_STATE_HEARTBEAT_FAILED:
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD);
|
||||
killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
check_job_complete(jdata); /* set the local proc states */
|
||||
/* the job object for this job will have been NULL'd
|
||||
* in the array if the job was solely local. If it isn't
|
||||
@ -617,6 +629,11 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
/* guess not - let it fall thru to abort */
|
||||
}
|
||||
}
|
||||
|
||||
if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) {
|
||||
exit_code = 0;
|
||||
}
|
||||
|
||||
orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code);
|
||||
check_job_complete(jdata); /* need to set the job state */
|
||||
/* the job object for this job will have been NULL'd
|
||||
@ -660,7 +677,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
if (jdata->enable_recovery) {
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
killprocs(proc->jobid, proc->vpid, proc->epoch);
|
||||
/* is this a local proc */
|
||||
if (NULL != (child = proc_is_local(proc))) {
|
||||
/* local proc - see if it has reached its restart limit */
|
||||
@ -718,8 +735,11 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
/* check for complete */
|
||||
orte_errmgr_hnp_record_dead_process(proc);
|
||||
/* We'll check if the job was complete when we get the
|
||||
* message back from the HNP notifying us of the dead
|
||||
* process
|
||||
*/
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
@ -733,12 +753,17 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* update daemon job */
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* check for complete */
|
||||
orte_errmgr_hnp_record_dead_process(proc);
|
||||
/* We'll check if the job was complete when we get the
|
||||
* message back from the HNP notifying us of the dead
|
||||
* process
|
||||
*/
|
||||
check_job_complete(jdata);
|
||||
break;
|
||||
}
|
||||
|
||||
/* remove from dependent routes, if it is one */
|
||||
orte_routed.route_lost(proc);
|
||||
/* delete the route */
|
||||
orte_routed.delete_route(proc);
|
||||
/* purge the oob */
|
||||
@ -751,30 +776,21 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc));
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* kill all jobs */
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
/* check if all is complete so we can terminate */
|
||||
check_job_complete(jdata);
|
||||
}
|
||||
} else {
|
||||
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
|
||||
ORTE_VPID_PRINT(proc->vpid), "Unknown");
|
||||
} else {
|
||||
orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true,
|
||||
ORTE_VPID_PRINT(proc->vpid),
|
||||
(NULL == pdat->node) ? "Unknown" :
|
||||
((NULL == pdat->node->name) ? "Unknown" : pdat->node->name));
|
||||
if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) {
|
||||
/* The process is already dead so don't keep trying to do
|
||||
* this stuff. */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* remove this proc from the daemon job */
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
/* kill all jobs */
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
/* check if all is complete so we can terminate */
|
||||
/* We'll check if the job was complete when we get the
|
||||
* message back from the HNP notifying us of the dead
|
||||
* process */
|
||||
check_job_complete(jdata);
|
||||
}
|
||||
}
|
||||
@ -785,9 +801,9 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
if( orte_enable_recovery ) {
|
||||
/* relocate its processes */
|
||||
} else {
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_process(proc);
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* kill all jobs */
|
||||
hnp_abort(ORTE_JOBID_WILDCARD, exit_code);
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
@ -806,6 +822,182 @@ int orte_errmgr_hnp_base_global_ft_event(int state)
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_hnp_global_post_startup(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
int orte_errmgr_hnp_global_pre_shutdown(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Process a failure report delivered to the HNP.
 *
 * The buffer holds a count followed by that many process names.  Reports
 * that refer to an epoch older than the locally known one are dropped as
 * duplicates.  When the report comes from another daemon and names a
 * daemon, every not-yet-terminated process on that daemon's node is added
 * to the list as well, because the reporting daemon cannot know about
 * them.  The resulting list is marked dead locally and, unless daemon
 * termination has already been ordered, broadcast to all daemons and
 * forwarded to the local applications.
 *
 * @param sender  name of the process that sent the report
 * @param buffer  packed report (count + names); not released here
 *
 * @return ORTE_SUCCESS, or the first error returned while unpacking,
 *         packing, broadcasting or delivering the notification.
 *
 * All names collected in this function are heap allocated and are freed
 * on every exit path together with the temporary pointer array.
 */
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed;
    opal_pointer_array_t *dead_names;
    int32_t i, j;
    orte_process_name_t *name_item, *child_name;
    orte_epoch_t epoch;
    orte_job_t *jdat;
    orte_proc_t *pdat, *pdat2;
    opal_buffer_t *answer;
    orte_daemon_cmd_flag_t command;

    /* If processes have started terminating, don't worry about reported
     * failures. The ORTEDs don't know the difference. */
    if (mca_errmgr_hnp_component.term_in_progress) {
        return ret;
    }

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(sender));
    }

    n = 1;
    /* Get the number of failed procs */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }

    dead_names = OBJ_NEW(opal_pointer_array_t);

    for (i = 0; i < num_failed; i++) {
        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        /* Unpack the buffer to get the dead process' name. */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            /* not yet stored in dead_names, so it must be freed here */
            free(name_item);
            goto cleanup;
        }

        /* Check to see if the message is telling us about an old epoch.
         * If so ignore the message.
         */
        epoch = orte_util_lookup_epoch(name_item);
        if (name_item->epoch < epoch) {
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(name_item),
                            ORTE_EPOCH_PRINT(name_item->epoch),
                            ORTE_EPOCH_PRINT(epoch));
            }
            free(name_item);
            continue;
        } else {
            if (orte_debug_daemons_flag) {
                opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(name_item),
                            ORTE_EPOCH_PRINT(name_item->epoch),
                            ORTE_EPOCH_PRINT(epoch));
            }
        }

        /* from here on the array owns name_item */
        opal_pointer_array_add(dead_names, name_item);

        /* Check to see if the message is telling us about an orted and
         * it is from another orted. Orteds don't have the list of all
         * the application processes so they don't know if there were
         * any child processes on the nodes that they are reporting. */
        if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) {
            if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) {
                continue;
            } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) {
                continue;
            } else if (NULL == pdat->node) {
                continue;
            }

            if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) {
                /* Use a dedicated index: sharing the outer counter here
                 * would corrupt the iteration over the reported names. */
                for (j = 0; j < opal_pointer_array_get_size(pdat->node->procs); j++) {
                    if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, j))) {
                        continue;
                    }

                    /* ignore this process if it has already terminated */
                    if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) {
                        continue;
                    }

                    /* the proc must have been alive, so notify everyone that it died */
                    child_name = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
                    if (NULL == child_name) {
                        ret = ORTE_ERR_OUT_OF_RESOURCE;
                        ORTE_ERROR_LOG(ret);
                        goto cleanup;
                    }

                    child_name->jobid = pdat2->name.jobid;
                    child_name->vpid = pdat2->name.vpid;
                    child_name->epoch = orte_util_lookup_epoch(&(pdat2->name));

                    opal_pointer_array_add(dead_names, child_name);
                }
            }
        }
    }

    /* Update the number of failed processes so any duplicates don't get
     * re-reported.
     * NOTE(review): this is the array size as reported by
     * opal_pointer_array_get_size(); confirm that it equals the number of
     * stored names, since NULL slots are skipped when packing below.
     */
    num_failed = opal_pointer_array_get_size(dead_names);

    if (num_failed > 0) {
        orte_errmgr.mark_processes_as_dead(dead_names);

        if (!orte_orteds_term_ordered) {
            /* Send a message out to all the orteds to inform them that the
             * process is dead. Long live the process (or not if it is so
             * decided)!
             */
            answer = OBJ_NEW(opal_buffer_t);
            command = ORTE_PROCESS_FAILED_NOTIFICATION;

            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto cleanup;
            }

            if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto cleanup;
            }

            for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
                if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(answer);
                        goto cleanup;
                    }
                }
            }

            /* NOTE(review): the buffer is only released when xcast fails,
             * as before; confirm whether it should also be released after
             * a successful xcast. */
            if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(answer);
                goto cleanup;
            }

            /* Tell the applications' ORTE layers that there is a failure. */
            if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) {
                goto cleanup;
            }
        }
    }

cleanup:
    /* Free every name we allocated (empty slots yield NULL, which free()
     * accepts), then the array itself. */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        free(opal_pointer_array_get_item(dead_names, i));
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
@ -1158,6 +1350,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
#if 0
|
||||
case ORTE_PROC_STATE_ABORTED_BY_SIG:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s aborted by signal",
|
||||
@ -1173,6 +1366,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
break;
|
||||
#endif
|
||||
case ORTE_PROC_STATE_TERM_WO_SYNC:
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
"%s errmgr:hnp:check_job_completed proc %s terminated without sync",
|
||||
@ -1195,6 +1389,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
}
|
||||
break;
|
||||
case ORTE_PROC_STATE_COMM_FAILED:
|
||||
#if 1
|
||||
if (!jdata->abort) {
|
||||
jdata->state = ORTE_JOB_STATE_COMM_FAILED;
|
||||
/* point to the lowest rank to cause the problem */
|
||||
@ -1204,6 +1399,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
jdata->abort = true;
|
||||
ORTE_UPDATE_EXIT_STATUS(proc->exit_code);
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED:
|
||||
if (!jdata->abort) {
|
||||
@ -1330,6 +1526,9 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
*/
|
||||
CHECK_DAEMONS:
|
||||
if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
#if 0
|
||||
if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */
|
||||
#endif
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
/* orteds are done! */
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
@ -1493,7 +1692,7 @@ static void check_job_complete(orte_job_t *jdata)
|
||||
}
|
||||
}
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
|
||||
{
|
||||
opal_pointer_array_t cmd;
|
||||
orte_proc_t proc;
|
||||
@ -1504,7 +1703,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
orte_sensor.stop(job);
|
||||
}
|
||||
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
@ -1515,6 +1714,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
OBJ_CONSTRUCT(&proc, orte_proc_t);
|
||||
proc.name.jobid = job;
|
||||
proc.name.vpid = vpid;
|
||||
proc.name.epoch = epoch;
|
||||
opal_pointer_array_add(&cmd, &proc);
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -1533,6 +1733,11 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
char *app_name;
|
||||
int rc, i, n;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
|
||||
"%s CHECKING ON RELOCATE FOR APP %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(proc)));
|
||||
|
||||
/* get the proc_t object for this process */
|
||||
pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
|
||||
if (NULL == pdata) {
|
||||
@ -1551,7 +1756,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc,
|
||||
*/
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
|
||||
/* remove this proc from the daemon job */
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code);
|
||||
orte_errmgr_hnp_record_dead_process(proc);
|
||||
/* check to see if any other nodes are "alive" */
|
||||
if (!orte_hnp_is_allocated && jdata->num_procs == 1) {
|
||||
return ORTE_ERR_FATAL;
|
||||
@ -1676,59 +1881,244 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code)
|
||||
{
|
||||
orte_job_t *jdt;
|
||||
/*
 * Callback handed to orte_rml.send_buffer_nb() by
 * orte_errmgr_hnp_record_dead_process().
 *
 * Its only job is to drop the reference on the buffer that was sent;
 * status, peer, tag and cbdata are not examined.
 */
static void cbfunc(int status,
|
||||
orte_process_name_t *peer,
|
||||
opal_buffer_t *buffer,
|
||||
orte_rml_tag_t tag,
|
||||
void* cbdata) {
|
||||
OBJ_RELEASE(buffer);
|
||||
}
|
||||
|
||||
/*
 * Record that a process has died and propagate that fact.
 *
 * The process is looked up in its job; nothing is done when it is not
 * found there, when its state does not exceed ORTE_PROC_STATE_TERMINATED,
 * or when the reported epoch differs from the recorded one.  If the
 * process belongs to the daemon job, the daemon itself and every process
 * recorded on its node are collected.  The collected names are then either
 * packed into an ORTE_PROCESS_FAILED_NOTIFICATION message and sent to the
 * HNP (normal operation), or marked dead directly (termination already in
 * progress).
 *
 * @param proc  name (including epoch) of the process that died
 *
 * @return ORTE_ERR_NOT_FOUND when the job object cannot be found,
 *         ORTE_SUCCESS otherwise.  Packing/sending problems are logged
 *         but, as before, are not reported to the caller.
 */
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) {
    orte_job_t *jdat;
    orte_proc_t *pdat;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command;
    int i, rc, num_failed;
    opal_pointer_array_t *dead_names;
    orte_process_name_t *name_item;
    orte_proc_t *proc_item;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s RECORDING DEAD PROCESS %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) {
        opal_output(0, "Can't find job object");
        return ORTE_ERR_NOT_FOUND;
    }

    pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid);
    if (NULL == pdat || pdat->state <= ORTE_PROC_STATE_TERMINATED) {
        /* nothing to record for this process */
        return ORTE_SUCCESS;
    }

    /* Make sure that the epochs match. */
    if (proc->epoch != pdat->name.epoch) {
        opal_output(1, "The epoch does not match the current epoch. Throwing the request out.");
        return ORTE_SUCCESS;
    }

    /* The array only borrows pointers to names stored inside orte_proc_t
     * objects, so releasing the array is all the cleanup it needs. */
    dead_names = OBJ_NEW(opal_pointer_array_t);

    if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
        opal_pointer_array_add(dead_names, &(pdat->name));

        /* a daemon without a recorded node has no known children */
        if (NULL != pdat->node) {
            for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) {
                if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) {
                    continue;
                }

                opal_pointer_array_add(dead_names, &(proc_item->name));
            }
        }
    }

    if (mca_errmgr_hnp_component.term_in_progress) {
        orte_errmgr_hnp_global_mark_processes_as_dead(dead_names);
        OBJ_RELEASE(dead_names);
        return ORTE_SUCCESS;
    }

    /*
     * Send a message to the other daemons so they know that a daemon has
     * died.
     */
    buffer = OBJ_NEW(opal_buffer_t);
    command = ORTE_PROCESS_FAILED_NOTIFICATION;

    num_failed = opal_pointer_array_get_size(dead_names);

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto pack_failed;
    }

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        goto pack_failed;
    }

    /* Iterate over the list of dead procs and send them along with
     * the rest. The HNP needs this info so it can tell the other
     * ORTEDs and they can inform the appropriate applications.
     */
    for (i = 0; i < num_failed; i++) {
        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
            if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(rc);
                /* stop here: the buffer must not be packed further or
                 * sent once it has been given up */
                goto pack_failed;
            }
        }
    }

    OBJ_RELEASE(dead_names);

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output,
                         "%s SENDING DEAD PROCESS MESSAGE TO HNP",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* On success cbfunc releases the buffer once the send completes; if
     * the send cannot even be posted the callback never runs, so release
     * the buffer here. */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0, cbfunc, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buffer);
    }

    return ORTE_SUCCESS;

pack_failed:
    OBJ_RELEASE(buffer);
    OBJ_RELEASE(dead_names);
    return ORTE_SUCCESS;
}
|
||||
|
||||
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) {
|
||||
int i;
|
||||
orte_process_name_t *name_item;
|
||||
orte_job_t *jdat;
|
||||
orte_proc_t *pdat;
|
||||
orte_node_t *node;
|
||||
int i;
|
||||
|
||||
if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) &&
|
||||
ORTE_PROC_STATE_TERMINATED != pdat->state) {
|
||||
/* need to record that this one died */
|
||||
pdat->state = state;
|
||||
pdat->exit_code = exit_code;
|
||||
ORTE_UPDATE_EXIT_STATUS(exit_code);
|
||||
/* remove it from the job array */
|
||||
opal_pointer_array_set_item(jdat->procs, vpid, NULL);
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"HNP %s marking procs as dead",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
|
||||
/* Iterate over the list of processes */
|
||||
for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) {
|
||||
if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) {
|
||||
opal_output(1, "NULL found in dead process list.");
|
||||
continue;
|
||||
}
|
||||
|
||||
if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s Job data not found.",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
|
||||
return ORTE_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) &&
|
||||
pdat->state < ORTE_PROC_STATE_TERMINATED) {
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"HNP %s marking %s as dead",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pdat->name)));
|
||||
|
||||
/* Make sure the epochs match, if not it probably means that we
|
||||
* already reported this failure. */
|
||||
if (name_item->epoch != pdat->name.epoch) {
|
||||
continue;
|
||||
}
|
||||
|
||||
orte_util_set_epoch(name_item, name_item->epoch + 1);
|
||||
|
||||
/* Remove it from the job array */
|
||||
opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL);
|
||||
orte_process_info.num_procs--;
|
||||
jdat->num_procs--;
|
||||
/* mark the node as down so it won't be used in mapping
|
||||
* procs to be relaunched
|
||||
*/
|
||||
|
||||
/* Check if this is an ORTED */
|
||||
if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) {
|
||||
/* Mark the node as down so it won't be used in mapping anymore. */
|
||||
node = pdat->node;
|
||||
node->state = ORTE_NODE_STATE_DOWN;
|
||||
node->daemon = NULL;
|
||||
OBJ_RELEASE(pdat); /* maintain accounting */
|
||||
/* mark all procs on this node as having terminated */
|
||||
for (i=0; i < node->procs->size; i++) {
|
||||
if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
|
||||
continue;
|
||||
}
|
||||
/* get the job data object for this process */
|
||||
if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) {
|
||||
/* It is possible that the process job finishes before the daemons.
|
||||
* In that case the process state is set to normal termination, and
|
||||
* the job data has already been cleared. So no need to throw an
|
||||
* error.
|
||||
*/
|
||||
if( ORTE_PROC_STATE_TERMINATED != pdat->state ) {
|
||||
opal_output(0,
|
||||
"%s Error: Failed to find job_data for proc %s (%s) on node %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_NAME_PRINT(&pdat->name),
|
||||
orte_proc_state_to_str(pdat->state),
|
||||
node->name );
|
||||
/* major problem */
|
||||
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
|
||||
|
||||
OBJ_RELEASE(pdat);
|
||||
|
||||
/* Create a new proc object that will keep track of the epoch
|
||||
* information */
|
||||
pdat = OBJ_NEW(orte_proc_t);
|
||||
pdat->name.jobid = jdat->jobid;
|
||||
pdat->name.vpid = name_item->vpid;
|
||||
pdat->name.epoch = name_item->epoch + 1;
|
||||
|
||||
/* Set the state as terminated so we'll know the process isn't
|
||||
* actually there. */
|
||||
pdat->state = ORTE_PROC_STATE_TERMINATED;
|
||||
|
||||
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
|
||||
jdat->num_procs++;
|
||||
jdat->num_terminated++;
|
||||
} else {
|
||||
opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item));
|
||||
/* Create a new proc object that will keep track of the epoch
|
||||
* information */
|
||||
pdat = OBJ_NEW(orte_proc_t);
|
||||
pdat->name.jobid = jdat->jobid;
|
||||
pdat->name.vpid = name_item->vpid;
|
||||
pdat->name.epoch = name_item->epoch + 1;
|
||||
|
||||
/* Set the state as terminated so we'll know the process isn't
|
||||
* actually there. */
|
||||
pdat->state = ORTE_PROC_STATE_TERMINATED;
|
||||
|
||||
opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat);
|
||||
jdat->num_procs++;
|
||||
jdat->num_terminated++;
|
||||
}
|
||||
continue;
|
||||
|
||||
check_job_complete(jdat);
|
||||
}
|
||||
pdat->state = ORTE_PROC_STATE_ABORTED;
|
||||
jdt->num_terminated++;
|
||||
|
||||
if (!orte_orteds_term_ordered) {
|
||||
/* Need to update the orted routing module. */
|
||||
orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
|
||||
|
||||
if (NULL != fault_cbfunc) {
|
||||
(*fault_cbfunc)(dead_procs);
|
||||
}
|
||||
}
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Forward a list of dead process names to the locally launched
 * applications.
 *
 * A fresh buffer is filled with the size of the list followed by every
 * non-NULL name in it, then delivered to all local jobs on the
 * ORTE_RML_TAG_EPOCH_CHANGE tag.  The buffer is released before returning,
 * whatever the outcome.
 *
 * @param dead_names  pointer array of orte_process_name_t entries
 *
 * @return ORTE_SUCCESS, or the error reported by the failing pack or
 *         deliver call (which is also logged).
 */
int send_to_local_applications(opal_pointer_array_t *dead_names) {
    opal_buffer_t *msg;
    orte_process_name_t *dead;
    int rc = ORTE_SUCCESS;
    int count, idx;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    msg = OBJ_NEW(opal_buffer_t);
    count = opal_pointer_array_get_size(dead_names);

    /* header: number of entries */
    rc = opal_dss.pack(msg, &count, 1, ORTE_VPID);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        goto done;
    }

    /* payload: one name per occupied slot */
    for (idx = 0; idx < count; idx++) {
        dead = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, idx);
        if (NULL == dead) {
            continue;
        }

        rc = opal_dss.pack(msg, dead, 1, ORTE_NAME);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto done;
        }
    }

    rc = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, msg, ORTE_RML_TAG_EPOCH_CHANGE);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }

done:
    OBJ_RELEASE(msg);
    return rc;
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -13,8 +16,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_HNP_EXPORT_H
|
||||
#define MCA_ERRMGR_HNP_EXPORT_H
|
||||
#ifndef MCA_ERRMGR_hnp_EXPORT_H
|
||||
#define MCA_ERRMGR_hnp_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -57,10 +60,6 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat,
|
||||
orte_vpid_t vpid,
|
||||
orte_proc_state_t state,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/***************************
|
||||
* Module functions: Global
|
||||
@ -81,8 +80,13 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnp_global_ft_event(int state);
|
||||
int orte_errmgr_hnp_global_post_startup(void);
|
||||
int orte_errmgr_hnp_global_pre_shutdown(void);
|
||||
int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc);
|
||||
|
||||
/* HNP Versions */
|
||||
/* hnp Versions */
|
||||
int orte_errmgr_hnp_base_global_init(void);
|
||||
int orte_errmgr_hnp_base_global_finalize(void);
|
||||
int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job,
|
||||
@ -130,4 +134,4 @@ int orte_errmgr_hnp_autor_global_ft_event(int state);
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_HNP_EXPORT_H */
|
||||
#endif /* MCA_ERRMGR_hnp_EXPORT_H */
|
||||
|
@ -1,7 +1,10 @@
|
||||
/*
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* Copyright (c) 2009-2011 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -391,6 +394,7 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *node = NULL;
|
||||
bool found = false;
|
||||
int num_removed = 0, num_to_remove;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
if( NULL == current_global_jobdata ) {
|
||||
return ORTE_SUCCESS;
|
||||
@ -410,8 +414,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
item = opal_list_get_next(item) ) {
|
||||
wp_item = (errmgr_autor_wp_item_t*)item;
|
||||
|
||||
if( wp_item->name.vpid == proc->name.vpid &&
|
||||
wp_item->name.jobid == proc->name.jobid ) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
@ -518,6 +522,7 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata,
|
||||
wp_item = OBJ_NEW(errmgr_autor_wp_item_t);
|
||||
wp_item->name.jobid = proc->jobid;
|
||||
wp_item->name.vpid = proc->vpid;
|
||||
wp_item->name.epoch = proc->epoch;
|
||||
wp_item->state = state;
|
||||
|
||||
opal_list_append(procs_pending_recovery, &(wp_item->super));
|
||||
@ -612,7 +617,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata,
|
||||
/*
|
||||
* Record the dead daemon
|
||||
*/
|
||||
orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0);
|
||||
orte_errmgr_hnp_record_dead_process(proc);
|
||||
|
||||
return;
|
||||
}
|
||||
@ -621,6 +626,7 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp)
|
||||
{
|
||||
wp->name.jobid = ORTE_JOBID_INVALID;
|
||||
wp->name.vpid = ORTE_VPID_INVALID;
|
||||
wp->name.epoch = ORTE_EPOCH_MIN;
|
||||
|
||||
wp->state = 0;
|
||||
}
|
||||
@ -629,6 +635,7 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp)
|
||||
{
|
||||
wp->name.jobid = ORTE_JOBID_INVALID;
|
||||
wp->name.vpid = ORTE_VPID_INVALID;
|
||||
wp->name.epoch = ORTE_EPOCH_INVALID;
|
||||
|
||||
wp->state = 0;
|
||||
}
|
||||
|
@ -20,7 +20,7 @@
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_hnp_component_version_string =
|
||||
"ORTE ERRMGR Hnp MCA component version " ORTE_VERSION;
|
||||
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
@ -61,7 +61,7 @@ orte_errmgr_hnp_component_t mca_errmgr_hnp_component = {
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
50
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -2,6 +2,9 @@
|
||||
* Copyright (c) 2009-2010 The Trustees of Indiana University.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -747,6 +750,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -803,6 +807,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -850,6 +855,7 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_
|
||||
close_iof_stdin = true;
|
||||
iof_name.jobid = proc->name.jobid;
|
||||
iof_name.vpid = proc->name.vpid;
|
||||
iof_name.epoch = proc->name.epoch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,12 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte
|
@ -1,40 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-hnp.txt
|
||||
|
||||
sources = \
|
||||
errmgr_hnpresil.h \
|
||||
errmgr_hnpresil_component.c \
|
||||
errmgr_hnpresil.c \
|
||||
errmgr_hnpresil_autor.c \
|
||||
errmgr_hnpresil_crmig.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_errmgr_hnpresil_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_hnpresil.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_hnpresil.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_hnpresil_la_SOURCES = $(sources)
|
||||
mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_hnpresil_la_SOURCES =$(sources)
|
||||
libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,137 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H
|
||||
#define MCA_ERRMGR_HNPRESIL_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
struct orte_errmgr_hnpresil_component_t {
|
||||
orte_errmgr_base_component_t super; /** Base Errmgr component */
|
||||
|
||||
bool ignore_current_update;
|
||||
bool term_in_progress;
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/* State of the Recovery */
|
||||
bool crmig_in_progress;
|
||||
bool autor_in_progress;
|
||||
|
||||
/* CRMig Options */
|
||||
bool crmig_enabled;
|
||||
bool crmig_timing_enabled;
|
||||
|
||||
/* AutoR Options */
|
||||
bool autor_enabled;
|
||||
bool autor_timing_enabled;
|
||||
int autor_recovery_delay;
|
||||
bool autor_skip_oldnode;
|
||||
#endif
|
||||
};
|
||||
typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t;
|
||||
OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component;
|
||||
|
||||
int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
/***************************
|
||||
* Module functions: Global
|
||||
***************************/
|
||||
int orte_errmgr_hnpresil_global_module_init(void);
|
||||
int orte_errmgr_hnpresil_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnpresil_global_ft_event(int state);
|
||||
int orte_errmgr_hnpresil_global_post_startup(void);
|
||||
int orte_errmgr_hnpresil_global_pre_shutdown(void);
|
||||
int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc);
|
||||
|
||||
/* hnpresil Versions */
|
||||
int orte_errmgr_hnpresil_base_global_init(void);
|
||||
int orte_errmgr_hnpresil_base_global_finalize(void);
|
||||
int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnpresil_base_global_ft_event(int state);
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/* CRMig Versions */
|
||||
int orte_errmgr_hnpresil_crmig_global_module_init(void);
|
||||
int orte_errmgr_hnpresil_crmig_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list,
|
||||
opal_list_t *node_list,
|
||||
opal_list_t *suggested_map);
|
||||
int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnpresil_crmig_global_ft_event(int state);
|
||||
|
||||
/* AutoR Versions */
|
||||
int orte_errmgr_hnpresil_autor_global_module_init(void);
|
||||
int orte_errmgr_hnpresil_autor_global_module_finalize(void);
|
||||
|
||||
int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc,
|
||||
orte_node_t *oldnode,
|
||||
opal_list_t *node_list);
|
||||
int orte_errmgr_hnpresil_autor_global_ft_event(int state);
|
||||
#endif
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_HNPRESIL_EXPORT_H */
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,201 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "orte/mca/errmgr/base/errmgr_private.h"
|
||||
#include "errmgr_hnpresil.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_hnpresil_component_version_string =
|
||||
"ORTE ERRMGR hnpresil MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int orte_errmgr_hnpresil_open(void);
|
||||
static int orte_errmgr_hnpresil_close(void);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = {
|
||||
/* First do the base component stuff */
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component hnp
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"hnpresil",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
||||
/* Component open and close functions */
|
||||
orte_errmgr_hnpresil_open,
|
||||
orte_errmgr_hnpresil_close,
|
||||
orte_errmgr_hnpresil_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
/* Verbosity level */
|
||||
0,
|
||||
/* opal_output handler */
|
||||
-1,
|
||||
/* Default priority */
|
||||
0
|
||||
}
|
||||
};
|
||||
|
||||
static int orte_errmgr_hnpresil_open(void)
|
||||
{
|
||||
int val;
|
||||
|
||||
/*
|
||||
* This should be the last component to ever get used since
|
||||
* it doesn't do anything.
|
||||
*/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"priority",
|
||||
"Priority of the ERRMGR hnp component",
|
||||
false, false,
|
||||
mca_errmgr_hnpresil_component.super.priority,
|
||||
&mca_errmgr_hnpresil_component.super.priority);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"verbose",
|
||||
"Verbose level for the ERRMGR hnp component",
|
||||
false, false,
|
||||
mca_errmgr_hnpresil_component.super.verbose,
|
||||
&mca_errmgr_hnpresil_component.super.verbose);
|
||||
/* If there is a custom verbose level for this component then use it
|
||||
* otherwise take our parents level and output channel
|
||||
*/
|
||||
if ( 0 != mca_errmgr_hnpresil_component.super.verbose) {
|
||||
mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL);
|
||||
opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle,
|
||||
mca_errmgr_hnpresil_component.super.verbose);
|
||||
} else {
|
||||
mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
/****************************
|
||||
* CRMig (C/R Process Migration) MCA Options
|
||||
****************************/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"crmig_timing",
|
||||
"Enable Process Migration timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"crmig_enable",
|
||||
"Enable Process Migration (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
/****************************
|
||||
* AutoR (Automatic Recovery) MCA Options
|
||||
****************************/
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"autor_timing",
|
||||
"Enable Automatic Recovery timer",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"autor_enable",
|
||||
"Enable Automatic Recovery (Default: 0/off)",
|
||||
false, false,
|
||||
0, &val);
|
||||
mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"autor_recovery_delay",
|
||||
"Number of seconds to wait before starting to recover the job after a failure"
|
||||
" [Default: 1 sec]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_hnpresil_component.autor_recovery_delay = val;
|
||||
|
||||
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version,
|
||||
"autor_skip_oldnode",
|
||||
"Skip the old node from failed proc, even if it is still available"
|
||||
" [Default: Enabled]",
|
||||
false, false,
|
||||
1, &val);
|
||||
mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);
|
||||
#else
|
||||
val = 0; /* Silence compiler warning */
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
/*
|
||||
* Debug Output
|
||||
*/
|
||||
opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open()");
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: priority = %d",
|
||||
mca_errmgr_hnpresil_component.super.priority);
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: verbosity = %d",
|
||||
mca_errmgr_hnpresil_component.super.verbose);
|
||||
#if OPAL_ENABLE_FT_CR
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: --- CR Migration Options ---");
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: Process Migration = %s",
|
||||
(mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: timing = %s",
|
||||
(mca_errmgr_hnpresil_component.crmig_timing_enabled ? "Enabled" : "Disabled"));
|
||||
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: --- Auto. Recovery Options ---");
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: Auto. Recover = %s",
|
||||
(mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: timing = %s",
|
||||
(mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled"));
|
||||
opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: open: recover_delay = %d",
|
||||
mca_errmgr_hnpresil_component.autor_recovery_delay);
|
||||
|
||||
mca_errmgr_hnpresil_component.crmig_in_progress = false;
|
||||
mca_errmgr_hnpresil_component.autor_in_progress = false;
|
||||
mca_errmgr_hnpresil_component.term_in_progress = false;
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int orte_errmgr_hnpresil_close(void)
|
||||
{
|
||||
opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle,
|
||||
"errmgr:hnp: close()");
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,71 +0,0 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for ORTE Errmgr HNP module.
|
||||
#
|
||||
[errmgr-hnp:unknown-job-error]
|
||||
An error has occurred in an unknown job. This generally should not happen
|
||||
except due to an internal ORTE error.
|
||||
|
||||
Job state: %s
|
||||
|
||||
This information should probably be reported to the OMPI developers.
|
||||
#
|
||||
[errmgr-hnp:daemon-died]
|
||||
The system has lost communication with the following daemon:
|
||||
|
||||
Daemon: %s
|
||||
Node: %s
|
||||
|
||||
The reason for the lost communication channel is unknown. Possible
|
||||
reasons include failure of the daemon itself, failure of the
|
||||
connecting fabric/switch, and loss of the host node. Please
|
||||
check with your system administrator to try and determine the
|
||||
source of the problem.
|
||||
|
||||
Your job is being terminated as a result.
|
||||
#
|
||||
[errmgr-hnp:cannot-relocate]
|
||||
The system is unable to relocate the specified process:
|
||||
|
||||
Process: %s
|
||||
|
||||
because the application for that process could not be found. This
|
||||
appears to be a system error. Please report it to the ORTE
|
||||
developers.
|
||||
|
||||
[autor_recovering_job]
|
||||
Notice: The processes listed below failed unexpectedly.
|
||||
Using the last checkpoint to recover the job.
|
||||
Please stand by.
|
||||
%s
|
||||
[autor_recovery_complete]
|
||||
Notice: The job has been successfully recovered from the
|
||||
last checkpoint.
|
||||
[autor_failed_to_recover_proc]
|
||||
Error: The process below has failed. There is no checkpoint available for
|
||||
this job, so we are terminating the application since automatic
|
||||
recovery cannot occur.
|
||||
Internal Name: %s
|
||||
MCW Rank: %d
|
||||
|
||||
[crmig_migrating_job]
|
||||
Notice: A migration of this job has been requested.
|
||||
The processes below will be migrated.
|
||||
Please stand by.
|
||||
%s
|
||||
[crmig_migrated_job]
|
||||
Notice: The processes have been successfully migrated to/from the specified
|
||||
machines.
|
||||
[crmig_no_migrating_procs]
|
||||
Warning: Could not find any processes to migrate on the nodes specified.
|
||||
You provided the following:
|
||||
Nodes: %s
|
||||
Procs: %s
|
@ -3,6 +3,9 @@
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
|
||||
* Copyright (c) 2004-2011 The University of Tennessee and The University
|
||||
* of Tennessee Research Foundation. All rights
|
||||
* reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -29,9 +32,11 @@
|
||||
#include "orte/util/proc_info.h"
|
||||
#include "orte/util/session_dir.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/util/nidmap.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
#include "orte/mca/rml/rml.h"
|
||||
#include "orte/mca/odls/odls.h"
|
||||
#include "orte/mca/odls/base/base.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/mca/routed/routed.h"
|
||||
#include "orte/mca/sensor/sensor.h"
|
||||
@ -53,8 +58,9 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code);
|
||||
static void update_local_children(orte_odls_job_t *jobdat,
|
||||
orte_job_state_t jobstate,
|
||||
orte_proc_state_t state);
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid);
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch);
|
||||
static int record_dead_process(orte_process_name_t *proc);
|
||||
static int send_to_local_applications(opal_pointer_array_t *dead_names);
|
||||
|
||||
/*
|
||||
* Module functions: Global
|
||||
@ -79,10 +85,14 @@ static int suggest_map_targets(orte_proc_t *proc,
|
||||
|
||||
static int ft_event(int state);
|
||||
|
||||
static int post_startup(void);
|
||||
static int pre_shutdown(void);
|
||||
|
||||
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs);
|
||||
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer);
|
||||
|
||||
/******************
|
||||
* ORTED module
|
||||
* orted module
|
||||
******************/
|
||||
orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
init,
|
||||
@ -95,11 +105,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
suggest_map_targets,
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning,
|
||||
NULL, /* post_startup */
|
||||
NULL, /* pre_shutdown */
|
||||
NULL, /* mark_processes_as_dead */
|
||||
NULL, /* set_fault_callback */
|
||||
NULL /* failure_notification */
|
||||
post_startup,
|
||||
pre_shutdown,
|
||||
mark_processes_as_dead,
|
||||
orte_errmgr_base_set_fault_callback, /* Set callback function */
|
||||
failure_notification
|
||||
};
|
||||
|
||||
/************************
|
||||
@ -130,6 +140,7 @@ static int update_state(orte_jobid_t job,
|
||||
int rc=ORTE_SUCCESS;
|
||||
orte_vpid_t null=ORTE_VPID_INVALID;
|
||||
orte_app_context_t *app;
|
||||
orte_ns_cmp_bitmask_t mask;
|
||||
|
||||
/*
|
||||
* if orte is trying to shutdown, just let it
|
||||
@ -138,6 +149,14 @@ static int update_state(orte_jobid_t job,
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||
"errmgr:orted:update_state() %s) "
|
||||
"------- %s state updated for process %s",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
((NULL == proc) ? "App. Process" :
|
||||
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
|
||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
|
||||
|
||||
/* if this is a heartbeat failure, let the HNP handle it */
|
||||
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
|
||||
ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
|
||||
@ -202,10 +221,10 @@ static int update_state(orte_jobid_t job,
|
||||
/* update all procs in job */
|
||||
update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
|
||||
/* order all local procs for this job to be killed */
|
||||
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD);
|
||||
killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
case ORTE_JOB_STATE_COMM_FAILED:
|
||||
/* kill all local procs */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* tell the caller we can't recover */
|
||||
return ORTE_ERR_UNRECOVERABLE;
|
||||
break;
|
||||
@ -242,15 +261,16 @@ static int update_state(orte_jobid_t job,
|
||||
* lifeline
|
||||
*/
|
||||
if (ORTE_PROC_STATE_COMM_FAILED == state) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
|
||||
/* if it is our own connection, ignore it */
|
||||
if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
|
||||
ORTE_PROC_MY_NAME->vpid == proc->vpid) {
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/* see if this was a lifeline */
|
||||
if (ORTE_SUCCESS != orte_routed.route_lost(proc)) {
|
||||
/* kill our children */
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
|
||||
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD);
|
||||
/* terminate - our routed children will see
|
||||
* us leave and automatically die
|
||||
*/
|
||||
@ -261,10 +281,14 @@ static int update_state(orte_jobid_t job,
|
||||
/* was it a daemon that failed? */
|
||||
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||
/* if all my routes are gone, then terminate ourselves */
|
||||
if (0 == orte_routed.num_routes()) {
|
||||
if (0 == orte_routed.num_routes() &&
|
||||
0 == opal_list_get_size(&orte_local_children)) {
|
||||
orte_quit();
|
||||
}
|
||||
}
|
||||
|
||||
record_dead_process(proc);
|
||||
|
||||
/* if not, then indicate we can continue */
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
@ -306,15 +330,15 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
/* Decrement the number of local procs */
|
||||
jobdat->num_local_procs--;
|
||||
/* kill this proc */
|
||||
killprocs(proc->jobid, proc->vpid);
|
||||
killprocs(proc->jobid, proc->vpid, proc->epoch);
|
||||
}
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
|
||||
if( jobdat->enable_recovery && child->restarts < app->max_restarts ) {
|
||||
@ -351,8 +375,8 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
/* see if this child has reached its local restart limit */
|
||||
app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx);
|
||||
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||
@ -403,8 +427,8 @@ static int update_state(orte_jobid_t job,
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
child->exit_code = exit_code;
|
||||
@ -447,8 +471,8 @@ REPORT_STATE:
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
child = (orte_odls_child_t*)item;
|
||||
if (child->name->jobid == proc->jobid &&
|
||||
child->name->vpid == proc->vpid) {
|
||||
mask = ORTE_NS_CMP_ALL;
|
||||
if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) {
|
||||
if (ORTE_PROC_STATE_UNTERMINATED > child->state) {
|
||||
child->state = state;
|
||||
if (0 < pid) {
|
||||
@ -485,7 +509,7 @@ REPORT_STATE:
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto FINAL_CLEANUP;
|
||||
}
|
||||
/* pack all the local child vpids */
|
||||
/* pack all the local child vpids and epochs */
|
||||
for (item = opal_list_get_first(&orte_local_children);
|
||||
item != opal_list_get_end(&orte_local_children);
|
||||
item = opal_list_get_next(item)) {
|
||||
@ -589,6 +613,7 @@ REPORT_STATE:
|
||||
rc = ORTE_SUCCESS;
|
||||
}
|
||||
OBJ_DESTRUCT(&alert);
|
||||
|
||||
/* indicate that the job is complete */
|
||||
return rc;
|
||||
}
|
||||
@ -609,11 +634,137 @@ static int suggest_map_targets(orte_proc_t *proc,
|
||||
return ORTE_ERR_NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
int ft_event(int state)
|
||||
static int ft_event(int state)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int post_startup(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
static int pre_shutdown(void) {
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Record every process named in dead_procs as terminated.
 *
 * For each non-NULL name whose epoch is not older than the locally
 * recorded epoch, this sets the process state to TERMINATED, bumps its
 * epoch by one, drops the matching entry (jobid + vpid) from
 * orte_local_children if there is one, and deletes its route.  Once all
 * names are handled the routing tree is refreshed and, when a fault
 * callback is registered, it is invoked with the full dead_procs array.
 *
 * @param dead_procs  pointer array of orte_process_name_t* entries
 * @return ORTE_SUCCESS (always)
 */
static int mark_processes_as_dead(opal_pointer_array_t *dead_procs)
{
    int idx;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                         "ORTED %s marking procs as dead",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    for (idx = 0; idx < opal_pointer_array_get_size(dead_procs); ++idx) {
        orte_process_name_t *dead =
            (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, idx);
        opal_list_item_t *cur;

        if (NULL == dead) {
            opal_output(0, "NULL found in dead process list.");
            continue;
        }

        /* a newer epoch is already on record: this report is stale */
        if (orte_util_lookup_epoch(dead) > dead->epoch) {
            continue;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
                             "ORTED %s marking %s as dead",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(dead)));

        /* mark it terminated and move on to the next epoch */
        orte_util_set_proc_state(dead, ORTE_PROC_STATE_TERMINATED);
        orte_util_set_epoch(dead, dead->epoch + 1);

        /* if the dead process is one of our local children, forget it */
        cur = opal_list_get_first(&orte_local_children);
        while (cur != opal_list_get_end(&orte_local_children)) {
            orte_odls_child_t *kid = (orte_odls_child_t *) cur;

            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
                                                            kid->name, dead)) {
                opal_list_remove_item(&orte_local_children, cur);
                OBJ_RELEASE(cur);
                break;
            }
            cur = opal_list_get_next(cur);
        }

        /* the routing layer must not keep a route to it */
        orte_routed.delete_route(dead);
    }

    /* let the routing module rebuild its tree for our job */
    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);

    /* hand the list to the registered fault callback, if any */
    if (NULL != fault_cbfunc) {
        (*fault_cbfunc)(dead_procs);
    }

    return ORTE_SUCCESS;
}
|
||||
|
||||
/*
 * Handle a failure notification message sent by `sender`.
 *
 * The buffer is expected to hold a count (packed as ORTE_VPID) followed by
 * that many process names (ORTE_NAME).  Names whose epoch is older than the
 * locally recorded epoch are ignored.  The remaining names are passed to
 * mark_processes_as_dead() and then forwarded to the local applications via
 * send_to_local_applications().
 *
 * All memory allocated here (the name copies and the pointer array) is
 * released before returning, on success and on every error path.
 *
 * @param sender  name of the process that sent the notification (used for
 *                debug output only)
 * @param buffer  message payload to unpack
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE on allocation failure, or
 *         the error code reported by unpacking / forwarding
 */
static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
    opal_pointer_array_t *dead_names;
    orte_std_cntr_t n;
    int ret = ORTE_SUCCESS, num_failed;
    int32_t i;
    orte_process_name_t *name_item, proc;

    dead_names = OBJ_NEW(opal_pointer_array_t);
    if (NULL == dead_names) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    n = 1;
    /* Get the number of failed procs */
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < num_failed; i++) {
        /* Unpack the buffer to get the dead process' name.  Unpack onto the
         * stack first so that nothing has to be freed when the entry is
         * rejected or the unpack fails.
         */
        n = 1;
        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc, &n, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }

        if (orte_debug_daemons_flag) {
            opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&proc),
                        ORTE_NAME_PRINT(sender));
        }

        /* There shouldn't be an issue of receiving this message multiple
         * times but it doesn't hurt to double check.  Compare the epoch of
         * the name we just unpacked against the locally recorded epoch.
         */
        if (proc.epoch < orte_util_lookup_epoch(&proc)) {
            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(&proc), orte_util_lookup_epoch(&proc));
            continue;
        }

        /* Keep a heap copy for the list handed to the other routines */
        name_item = (orte_process_name_t *) malloc(sizeof(*name_item));
        if (NULL == name_item) {
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
        *name_item = proc;

        if (0 > opal_pointer_array_add(dead_names, name_item)) {
            free(name_item);
            ret = ORTE_ERR_OUT_OF_RESOURCE;
            ORTE_ERROR_LOG(ret);
            goto cleanup;
        }
    }

    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
    mark_processes_as_dead(dead_names);

    /* Tell the applications' ORTE layers that there is a failure.
     * (errors are logged by the callee)
     */
    ret = send_to_local_applications(dead_names);

cleanup:
    /* Free every name we stored; empty slots yield NULL, which free() ignores */
    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
        free(opal_pointer_array_get_item(dead_names, i));
    }
    OBJ_RELEASE(dead_names);

    return ret;
}
|
||||
|
||||
/*****************
|
||||
* Local Functions
|
||||
*****************/
|
||||
@ -794,6 +945,7 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
}
|
||||
/* Pack the child's epoch. */
|
||||
if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
return rc;
|
||||
@ -836,7 +988,7 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
|
||||
}
|
||||
}
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
|
||||
"%s errmgr:orted: job %s reported incomplete start",
|
||||
"%s errmgr:hnp: job %s reported incomplete start",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
ORTE_JOBID_PRINT(jobdat->jobid)));
|
||||
return;
|
||||
@ -860,7 +1012,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs
|
||||
}
|
||||
}
|
||||
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
|
||||
{
|
||||
opal_pointer_array_t cmd;
|
||||
orte_proc_t proc;
|
||||
@ -871,7 +1023,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
orte_sensor.stop(job);
|
||||
}
|
||||
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) {
|
||||
if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
@ -882,6 +1034,7 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
OBJ_CONSTRUCT(&proc, orte_proc_t);
|
||||
proc.name.jobid = job;
|
||||
proc.name.vpid = vpid;
|
||||
proc.name.epoch = epoch;
|
||||
opal_pointer_array_add(&cmd, &proc);
|
||||
if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
@ -889,3 +1042,85 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid)
|
||||
OBJ_DESTRUCT(&cmd);
|
||||
OBJ_DESTRUCT(&proc);
|
||||
}
|
||||
|
||||
/*
 * Record a single failed process locally and report it to the HNP.
 *
 * Nothing is done when orte_odls_base_default_check_finished() returns
 * true for the process.  Otherwise the process is passed to
 * mark_processes_as_dead() and a ORTE_PROCESS_FAILED_NOTIFICATION command
 * (count of 1 followed by the process name) is sent to the HNP on
 * ORTE_RML_TAG_DAEMON.
 *
 * The message is only sent when it was packed completely; a partially
 * packed buffer is discarded instead of being put on the wire.
 *
 * @param proc  name of the failed process (not owned by this function)
 * @return ORTE_SUCCESS, or the error code from packing or sending
 */
static int record_dead_process(orte_process_name_t *proc) {
    opal_pointer_array_t *dead_name;
    opal_buffer_t *buffer;
    orte_daemon_cmd_flag_t command;
    int rc = ORTE_SUCCESS;
    int num_failed;

    if (orte_odls_base_default_check_finished(proc)) {
        return rc;
    }

    dead_name = OBJ_NEW(opal_pointer_array_t);

    opal_pointer_array_add(dead_name, proc);

    /* Mark the process as dead */
    mark_processes_as_dead(dead_name);

    /* Send a message to the HNP */
    buffer = OBJ_NEW(opal_buffer_t);
    command = ORTE_PROCESS_FAILED_NOTIFICATION;

    num_failed = 1;

    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* A negative result from the blocking send is an error code; anything
     * else means the message went out.
     */
    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
        ORTE_ERROR_LOG(rc);
    } else {
        rc = ORTE_SUCCESS;
    }

cleanup:
    OBJ_RELEASE(buffer);
    OBJ_RELEASE(dead_name);

    return rc;
}
|
||||
|
||||
/*
 * Deliver the list of failed processes to all local application procs.
 *
 * Builds a message holding the number of names (packed as ORTE_VPID)
 * followed by each name (ORTE_NAME) and hands it to
 * orte_odls.deliver_message() for every job on ORTE_RML_TAG_EPOCH_CHANGE.
 *
 * The count placed in the message is the number of non-NULL entries that
 * are actually packed, not the pointer array's slot count, so the count
 * always matches the names that follow it.
 *
 * @param dead_names  pointer array of orte_process_name_t* entries; NULL
 *                    slots are skipped
 * @return ORTE_SUCCESS, or the error code from packing or delivery
 */
static int send_to_local_applications(opal_pointer_array_t *dead_names) {
    opal_buffer_t *buf;
    int ret;
    orte_process_name_t *name_item;
    int size, i;
    int num_names = 0;

    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
                         "%s Sending failure to local applications.",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* Count the entries that will really be packed */
    size = opal_pointer_array_get_size(dead_names);
    for (i = 0; i < size; i++) {
        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
            num_names++;
        }
    }

    buf = OBJ_NEW(opal_buffer_t);

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &num_names, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }

    for (i = 0; i < size; i++) {
        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
                ORTE_ERROR_LOG(ret);
                goto cleanup;
            }
        }
    }

    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
        ORTE_ERROR_LOG(ret);
    }

cleanup:
    OBJ_RELEASE(buf);

    return ret;
}
|
||||
|
||||
|
@ -13,8 +13,8 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_ORTED_EXPORT_H
|
||||
#define MCA_ERRMGR_ORTED_EXPORT_H
|
||||
#ifndef MCA_ERRMGR_orted_EXPORT_H
|
||||
#define MCA_ERRMGR_orted_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
@ -32,4 +32,4 @@ ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orted_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_ORTED_EXPORT_H */
|
||||
#endif /* MCA_ERRMGR_orted_EXPORT_H */
|
||||
|
@ -72,7 +72,7 @@ static int errmgr_orted_component_query(mca_base_module_t **module, int *priorit
|
||||
/* keep our priority low so that other modules are higher
|
||||
* and will run before us
|
||||
*/
|
||||
*priority = 10;
|
||||
*priority = 0;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_orted_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
@ -1,12 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart,
|
||||
# University of Stuttgart. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
# Specific to this module
|
||||
mca_link_libraries=libopen-rte
|
@ -1,38 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
|
||||
EXTRA_DIST = .windows
|
||||
|
||||
dist_pkgdata_DATA = help-orte-errmgr-orted.txt
|
||||
|
||||
sources = \
|
||||
errmgr_ortedresil.h \
|
||||
errmgr_ortedresil_component.c \
|
||||
errmgr_ortedresil.c
|
||||
|
||||
# Make the output library in this directory, and name it either
|
||||
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
|
||||
# (for static builds).
|
||||
|
||||
if MCA_BUILD_orte_errmgr_ortedresil_DSO
|
||||
component_noinst =
|
||||
component_install = mca_errmgr_ortedresil.la
|
||||
else
|
||||
component_noinst = libmca_errmgr_ortedresil.la
|
||||
component_install =
|
||||
endif
|
||||
|
||||
mcacomponentdir = $(pkglibdir)
|
||||
mcacomponent_LTLIBRARIES = $(component_install)
|
||||
mca_errmgr_ortedresil_la_SOURCES = $(sources)
|
||||
mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version
|
||||
|
||||
noinst_LTLIBRARIES = $(component_noinst)
|
||||
libmca_errmgr_ortedresil_la_SOURCES =$(sources)
|
||||
libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@ -1,35 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H
|
||||
#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H
|
||||
|
||||
#include "orte_config.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
|
||||
BEGIN_C_DECLS
|
||||
|
||||
/*
|
||||
* Local Component structures
|
||||
*/
|
||||
|
||||
ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component;
|
||||
|
||||
ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module;
|
||||
|
||||
END_C_DECLS
|
||||
|
||||
#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */
|
@ -1,84 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
*
|
||||
* $HEADER$
|
||||
*/
|
||||
|
||||
#include "orte_config.h"
|
||||
#include "opal/util/output.h"
|
||||
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/errmgr/base/base.h"
|
||||
#include "errmgr_ortedresil.h"
|
||||
|
||||
/*
|
||||
* Public string for version number
|
||||
*/
|
||||
const char *orte_errmgr_ortedresil_component_version_string =
|
||||
"ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION;
|
||||
|
||||
/*
|
||||
* Local functionality
|
||||
*/
|
||||
static int errmgr_ortedresil_open(void);
|
||||
static int errmgr_ortedresil_close(void);
|
||||
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority);
|
||||
|
||||
/*
|
||||
* Instantiate the public struct with all of our public information
|
||||
* and pointer to our public functions in it
|
||||
*/
|
||||
orte_errmgr_base_component_t mca_errmgr_ortedresil_component =
|
||||
{
|
||||
/* Handle the general mca_component_t struct containing
|
||||
* meta information about the component itself
|
||||
*/
|
||||
{
|
||||
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
||||
/* Component name and version */
|
||||
"ortedresil",
|
||||
ORTE_MAJOR_VERSION,
|
||||
ORTE_MINOR_VERSION,
|
||||
ORTE_RELEASE_VERSION,
|
||||
|
|
||||
/* Component open, close and query functions */
|
||||
errmgr_ortedresil_open,
|
||||
errmgr_ortedresil_close,
|
||||
errmgr_ortedresil_component_query
|
||||
},
|
||||
{
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
}
|
||||
};
|
||||
|
||||
/* Component open function: performs no setup and always returns ORTE_SUCCESS. */
static int errmgr_ortedresil_open(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/* Component close function: nothing to release; always returns ORTE_SUCCESS. */
static int errmgr_ortedresil_close(void)
|
||||
{
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Component query function.
 *
 * When ORTE_PROC_IS_DAEMON is true, hands back the ortedresil module
 * through *module with *priority set to 0 and returns ORTE_SUCCESS.
 * Otherwise sets *priority to -1 and *module to NULL and returns
 * ORTE_ERROR.
 */
static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority)
|
||||
{
|
||||
if (ORTE_PROC_IS_DAEMON) {
|
||||
/* keep our priority low so that other modules are higher
|
||||
* and will run before us
|
||||
*/
|
||||
*priority = 0;
|
||||
*module = (mca_base_module_t *)&orte_errmgr_ortedresil_module;
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
|
||||
/* not a daemon: offer no module */
*priority = -1;
|
||||
*module = NULL;
|
||||
return ORTE_ERROR;
|
||||
}
|
||||
|
@ -1,14 +0,0 @@
|
||||
-*- text -*-
|
||||
#
|
||||
# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
|
||||
# University Research and Technology
|
||||
# Corporation. All rights reserved.
|
||||
#
|
||||
# $COPYRIGHT$
|
||||
#
|
||||
# Additional copyrights may follow
|
||||
#
|
||||
# $HEADER$
|
||||
#
|
||||
# This is the US/English general help file for the ORTE RecoS IGNORE framework.
|
||||
#
|
Загрузка…
x
Ссылка в новой задаче
Block a user