From 5fde3e0e00a69864646b854d56f95dfb3756ee8d Mon Sep 17 00:00:00 2001 From: Wesley Bland Date: Thu, 28 Jul 2011 21:24:34 +0000 Subject: [PATCH] Move the resilient orte errmgr code into a separate errmgr for now while it's still unstable. Reverted errmgr modules back to the original errmgr (with the updates since the resilient code was brought into the trunk). This commit was SVN r24958. --- ompi/runtime/ompi_mpi_init.c | 4 +- orte/mca/errmgr/app/errmgr_app.c | 132 +- orte/mca/errmgr/appresil/.windows | 12 + orte/mca/errmgr/appresil/Makefile.am | 36 + orte/mca/errmgr/appresil/errmgr_appresil.c | 285 +++ orte/mca/errmgr/appresil/errmgr_appresil.h | 35 + .../appresil/errmgr_appresil_component.c | 89 + orte/mca/errmgr/hnp/errmgr_hnp.c | 852 ++----- orte/mca/errmgr/hnp/errmgr_hnp.h | 12 +- orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 15 +- orte/mca/errmgr/hnp/errmgr_hnp_crmig.c | 6 - orte/mca/errmgr/hnpresil/.windows | 12 + orte/mca/errmgr/hnpresil/Makefile.am | 40 + orte/mca/errmgr/hnpresil/errmgr_hnpresil.c | 2112 +++++++++++++++++ orte/mca/errmgr/hnpresil/errmgr_hnpresil.h | 137 ++ .../errmgr/hnpresil/errmgr_hnpresil_autor.c | 1033 ++++++++ .../hnpresil/errmgr_hnpresil_component.c | 201 ++ .../errmgr/hnpresil/errmgr_hnpresil_crmig.c | 1517 ++++++++++++ .../errmgr/hnpresil/help-orte-errmgr-hnp.txt | 71 + orte/mca/errmgr/orted/errmgr_orted.c | 452 +--- orte/mca/errmgr/ortedresil/.windows | 12 + orte/mca/errmgr/ortedresil/Makefile.am | 38 + .../mca/errmgr/ortedresil/errmgr_ortedresil.c | 1126 +++++++++ .../mca/errmgr/ortedresil/errmgr_ortedresil.h | 35 + .../ortedresil/errmgr_ortedresil_component.c | 84 + .../ortedresil/help-orte-errmgr-orted.txt | 14 + orte/mca/ess/base/ess_base_std_app.c | 14 +- orte/mca/ess/base/ess_base_std_orted.c | 14 +- 28 files changed, 7288 insertions(+), 1102 deletions(-) create mode 100644 orte/mca/errmgr/appresil/.windows create mode 100644 orte/mca/errmgr/appresil/Makefile.am create mode 100644 orte/mca/errmgr/appresil/errmgr_appresil.c create 
mode 100644 orte/mca/errmgr/appresil/errmgr_appresil.h create mode 100644 orte/mca/errmgr/appresil/errmgr_appresil_component.c create mode 100644 orte/mca/errmgr/hnpresil/.windows create mode 100644 orte/mca/errmgr/hnpresil/Makefile.am create mode 100644 orte/mca/errmgr/hnpresil/errmgr_hnpresil.c create mode 100644 orte/mca/errmgr/hnpresil/errmgr_hnpresil.h create mode 100644 orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c create mode 100644 orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c create mode 100644 orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c create mode 100644 orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt create mode 100644 orte/mca/errmgr/ortedresil/.windows create mode 100644 orte/mca/errmgr/ortedresil/Makefile.am create mode 100644 orte/mca/errmgr/ortedresil/errmgr_ortedresil.c create mode 100644 orte/mca/errmgr/ortedresil/errmgr_ortedresil.h create mode 100644 orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c create mode 100644 orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 706a34cfef..3f751a98ee 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -371,7 +371,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) } /* Register errhandler callback with orte errmgr */ - orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback); + if (NULL != orte_errmgr.set_fault_callback) { + orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback); + } /* Figure out the final MPI thread levels. If we were not compiled for support for MPI threads, then don't allow diff --git a/orte/mca/errmgr/app/errmgr_app.c b/orte/mca/errmgr/app/errmgr_app.c index 459112fc75..97332cc655 100644 --- a/orte/mca/errmgr/app/errmgr_app.c +++ b/orte/mca/errmgr/app/errmgr_app.c @@ -1,13 +1,9 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. 
* All rights reserved. - * * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,15 +22,11 @@ #endif #include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" #include "orte/mca/routed/routed.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" @@ -56,22 +48,9 @@ static int update_state(orte_jobid_t job, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code); - static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); -static int post_startup(void); -static int pre_shutdown(void); - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata); -void epoch_change(int fd, - short event, - void *data); - /****************** * HNP module ******************/ @@ -86,11 +65,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = { NULL, NULL, orte_errmgr_base_register_migration_warning, - post_startup, - pre_shutdown, - NULL, - orte_errmgr_base_set_fault_callback, - NULL + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; /************************ @@ -113,8 +92,6 @@ static int update_state(orte_jobid_t job, pid_t pid, orte_exit_code_t exit_code) { - orte_ns_cmp_bitmask_t mask; - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:app: job %s reported state %s" " for proc %s state %s exit_code %d", @@ -132,9 
+109,9 @@ static int update_state(orte_jobid_t job, } if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + if (ORTE_PROC_MY_NAME->jobid == proc->jobid && + ORTE_PROC_MY_NAME->vpid == proc->vpid) { return ORTE_SUCCESS; } @@ -148,95 +125,6 @@ static int update_state(orte_jobid_t job, return ORTE_SUCCESS; } -static int post_startup(void) { - int ret = ORTE_SUCCESS; - - ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, - ORTE_RML_TAG_EPOCH_CHANGE, - ORTE_RML_PERSISTENT, - epoch_change_recv, - NULL); - - return ret; -} - -static int pre_shutdown(void) { - int ret = ORTE_SUCCESS; - - ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON, - ORTE_RML_TAG_EPOCH_CHANGE); - - return ret; -} - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata) { - - ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); -} - -void epoch_change(int fd, - short event, - void *data) { - orte_message_event_t *mev = (orte_message_event_t *) data; - opal_buffer_t *buffer = mev->buffer; - orte_process_name_t *proc; - int n = 1, ret, num_dead, i; - opal_pointer_array_t *procs; - - if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) { - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Received epoch change notification", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - procs = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); - for (i = 0; i < num_dead; i++) { - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - 
opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - proc[i].epoch++; - orte_util_set_epoch(&proc[i], proc[i].epoch); - - opal_pointer_array_add(procs, &proc[i]); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Epoch for %s updated", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc[i]))); - } - - if (NULL != fault_cbfunc && 0 < num_dead) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - (*fault_cbfunc)(procs); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - - free(proc); - OBJ_RELEASE(procs); -} - static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) { int ret, exit_status = ORTE_SUCCESS; @@ -278,7 +166,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr goto cleanup; } -cleanup: + cleanup: OBJ_DESTRUCT(&buffer); return exit_status; diff --git a/orte/mca/errmgr/appresil/.windows b/orte/mca/errmgr/appresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/appresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/appresil/Makefile.am b/orte/mca/errmgr/appresil/Makefile.am new file mode 100644 index 0000000000..4513d841cd --- /dev/null +++ b/orte/mca/errmgr/appresil/Makefile.am @@ -0,0 +1,36 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +sources = \ + errmgr_appresil.h \ + errmgr_appresil_component.c \ + errmgr_appresil.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_errmgr_appresil_DSO +component_noinst = +component_install = mca_errmgr_appresil.la +else +component_noinst = libmca_errmgr_appresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_appresil_la_SOURCES = $(sources) +mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_appresil_la_SOURCES =$(sources) +libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/appresil/errmgr_appresil.c b/orte/mca/errmgr/appresil/errmgr_appresil.c new file mode 100644 index 0000000000..72f61d9a6c --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/dss/dss.h" +#include "opal/mca/event/event.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/odls/odls_types.h" + +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "errmgr_appresil.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, + orte_std_cntr_t num_procs); + +static int post_startup(void); +static int pre_shutdown(void); + +void epoch_change_recv(int status, + orte_process_name_t *sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void *cbdata); +void epoch_change(int fd, + short event, + void *data); + +/****************** + * HNP module + ******************/ +orte_errmgr_base_module_t orte_errmgr_appresil_module = { + init, + finalize, + orte_errmgr_base_log, + orte_errmgr_base_abort, + orte_errmgr_appresil_abort_peers, + update_state, + NULL, + NULL, + NULL, + orte_errmgr_base_register_migration_warning, + post_startup, + pre_shutdown, + NULL, + orte_errmgr_base_set_fault_callback, + NULL +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + 
return ORTE_SUCCESS; +} + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_ns_cmp_bitmask_t mask; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil: job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), exit_code)); + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + if (ORTE_PROC_STATE_COMM_FAILED == state) { + mask = ORTE_NS_CMP_ALL; + /* if it is our own connection, ignore it */ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + return ORTE_SUCCESS; + } + + /* delete the route */ + orte_routed.delete_route(proc); + /* see is this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + return ORTE_ERR_UNRECOVERABLE; + } + } + return ORTE_SUCCESS; +} + +static int post_startup(void) { + int ret = ORTE_SUCCESS; + + ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, + ORTE_RML_TAG_EPOCH_CHANGE, + ORTE_RML_PERSISTENT, + epoch_change_recv, + NULL); + + return ret; +} + +static int pre_shutdown(void) { + int ret = ORTE_SUCCESS; + + ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON, + ORTE_RML_TAG_EPOCH_CHANGE); + + return ret; +} + +void epoch_change_recv(int status, + orte_process_name_t *sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void *cbdata) { + + ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); +} + +void epoch_change(int fd, + short event, + void *data) { + orte_message_event_t *mev = (orte_message_event_t *) data; + opal_buffer_t *buffer = mev->buffer; + orte_process_name_t *proc; + int n = 1, ret, num_dead, i; + opal_pointer_array_t *procs; + + if (orte_finalizing || 
orte_job_term_ordered || orte_orteds_term_ordered) { + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Received epoch change notification", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + procs = OBJ_NEW(opal_pointer_array_t); + + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + + proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); + for (i = 0; i < num_dead; i++) { + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + proc[i].epoch++; + orte_util_set_epoch(&proc[i], proc[i].epoch); + + opal_pointer_array_add(procs, &proc[i]); + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Epoch for %s updated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc[i]))); + } + + if (NULL != fault_cbfunc && 0 < num_dead) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Calling fault callback", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + (*fault_cbfunc)(procs); + } else { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Calling fault callback failed!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + } + + free(proc); + OBJ_RELEASE(procs); +} + +static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) +{ + int ret, exit_status = ORTE_SUCCESS; + opal_buffer_t buffer; + orte_std_cntr_t i; + orte_daemon_cmd_flag_t command = ORTE_DAEMON_ABORT_PROCS_CALLED; + + /* + * Pack up the list of processes and send them to the HNP + */ + OBJ_CONSTRUCT(&buffer, opal_buffer_t); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto 
cleanup; + } + + /* pack number of processes */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(num_procs), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* Pack the list of names */ + for( i = 0; i < num_procs; ++i ) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(procs[i]), 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + + /* Send to HNP for termination */ + if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_DAEMON, 0))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + +cleanup: + OBJ_DESTRUCT(&buffer); + + return exit_status; +} diff --git a/orte/mca/errmgr/appresil/errmgr_appresil.h b/orte/mca/errmgr/appresil/errmgr_appresil.h new file mode 100644 index 0000000000..d72d4056f7 --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H +#define MCA_ERRMGR_APPRESIL_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component; + +ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module; + +END_C_DECLS + +#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */ diff --git a/orte/mca/errmgr/appresil/errmgr_appresil_component.c b/orte/mca/errmgr/appresil/errmgr_appresil_component.c new file mode 100644 index 0000000000..33023d1052 --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil_component.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "errmgr_appresil.h" + +/* + * Public string for version number + */ +const char *orte_errmgr_appresil_component_version_string = + "ORTE ERRMGR appresil MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int errmgr_appresil_open(void); +static int errmgr_appresil_close(void); +static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_errmgr_base_component_t mca_errmgr_appresil_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component itself + */ + { + ORTE_ERRMGR_BASE_VERSION_3_0_0, + /* Component name and version */ + "appresil", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + errmgr_appresil_open, + errmgr_appresil_close, + errmgr_appresil_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + /* Verbosity level */ + 0, + /* opal_output handler */ + -1, + /* Default priority */ + 0 +}; + +static int errmgr_appresil_open(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_appresil_close(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_APP) { + /* keep our priority low so that other modules are higher + * and will run before us + */ + *priority = 0; + *module = (mca_base_module_t *)&orte_errmgr_appresil_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c 
index d065d952ff..381a3d4338 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -1,11 +1,8 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,14 +37,11 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/debugger/base/base.h" #include "orte/mca/notifier/notifier.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/ess/ess.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_locks.h" @@ -56,7 +50,6 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" - #include "errmgr_hnp.h" /********************** @@ -83,16 +76,11 @@ static orte_errmgr_base_module_t global_module = { /* FT Event hook */ orte_errmgr_hnp_global_ft_event, orte_errmgr_base_register_migration_warning, - /* Post-startup */ - orte_errmgr_hnp_global_post_startup, - /* Pre-shutdown */ - orte_errmgr_hnp_global_pre_shutdown, - /* Mark as dead */ - orte_errmgr_hnp_global_mark_processes_as_dead, - /* Set the callback */ - orte_errmgr_base_set_fault_callback, - /* Receive failure notification */ - orte_errmgr_hnp_global_failure_notification + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; @@ -104,11 +92,10 @@ static void failed_start(orte_job_t *jdata); static void 
update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state, orte_exit_code_t exit_code); static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code); static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); /************************ * API Definitions @@ -168,7 +155,7 @@ int orte_errmgr_hnp_global_module_init(void) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -206,7 +193,7 @@ int orte_errmgr_hnp_global_module_finalize(void) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -275,7 +262,7 @@ int orte_errmgr_hnp_global_update_state(orte_jobid_t job, } } -cleanup: + cleanup: return exit_status; } @@ -306,7 +293,7 @@ int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, goto cleanup; } -cleanup: + cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; @@ -342,7 +329,7 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, } } -cleanup: + cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; @@ -374,7 +361,7 @@ int orte_errmgr_hnp_global_ft_event(int state) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -404,7 +391,8 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, orte_odls_child_t *child; int rc; orte_app_context_t *app; - + orte_proc_t *pdat; + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s" " for proc %s state %s pid %d exit_code %d", @@ -435,7 +423,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, hnp_abort(job, exit_code); return ORTE_SUCCESS; } - + /* get the job object */ if (NULL == (jdata = 
orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -443,7 +431,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, } /* update the state */ jdata->state = jobstate; - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -536,7 +524,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, exit_code); /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't @@ -548,7 +536,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_COMM_FAILED: /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't @@ -560,7 +548,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. 
If it isn't @@ -629,11 +617,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, /* guess not - let it fall thru to abort */ } } - - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { - exit_code = 0; - } - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); /* need to set the job state */ /* the job object for this job will have been NULL'd @@ -645,168 +628,174 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, } break; - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_CALLED_ABORT: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_REGISTERED: + case ORTE_PROC_STATE_RUNNING: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_LAUNCHED: + /* record the pid for this child */ + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_TERMINATED: + case ORTE_PROC_STATE_TERM_NON_ZERO: + case ORTE_PROC_STATE_KILLED_BY_CMD: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (jdata->enable_recovery) { + killprocs(proc->jobid, proc->vpid); + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + /* kill all jobs */ + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_COMM_FAILED: + /* is this to a daemon? */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* if this is my own connection, ignore it */ + if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s My own connection - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + /* if we have ordered orteds to terminate, record it */ + if (orte_orteds_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Daemons terminating - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0); + /* check for complete */ + check_job_complete(jdata); + break; + } + /* if abort is in progress, see if this one failed to tell + * us it had terminated */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); + if (orte_abnormal_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Abort in progress - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* check for complete */ + check_job_complete(jdata); + break; } - break; - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); - case ORTE_PROC_STATE_LAUNCHED: - /* record 
the pid for this child */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (jdata->enable_recovery) { - killprocs(proc->jobid, proc->vpid, proc->epoch); - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - /* kill all jobs */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: - /* is this to a daemon? 
*/ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if this is my own connection, ignore it */ - if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; - } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - break; - } - /* if abort is in progress, see if this one failed to tell - * us it had terminated - */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - break; - } - - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); - - if( orte_enable_recovery ) { - /* relocate its processes */ - if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { - /* unable to relocate for some reason */ - opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); - /* kill all jobs */ - 
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - /* check if all is complete so we can terminate */ - check_job_complete(jdata); - } - } else { - if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) { - /* The process is already dead so don't keep trying to do - * this stuff. */ - return ORTE_SUCCESS; - } - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process */ - } - } - break; - - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only from daemons */ if( orte_enable_recovery ) { /* relocate its processes */ + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { + /* unable to relocate for some reason */ + opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); + } } else { - orte_errmgr_hnp_record_dead_process(proc); + if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, + ORTE_VPID_PRINT(proc->vpid), "Unknown"); + } else { + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, + ORTE_VPID_PRINT(proc->vpid), + (NULL == pdat->node) ? "Unknown" : + ((NULL == pdat->node->name) ? 
"Unknown" : pdat->node->name)); + } + /* remove this proc from the daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; + /* check if all is complete so we can terminate */ + check_job_complete(jdata); } - break; + } + break; - default: - break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + /* heartbeats are only from daemons */ + if( orte_enable_recovery ) { + /* relocate its processes */ + } else { + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + return ORTE_ERR_UNRECOVERABLE; + } + break; + + default: + break; } return ORTE_SUCCESS; @@ -817,177 +806,13 @@ int orte_errmgr_hnp_base_global_ft_event(int state) return ORTE_SUCCESS; } -int orte_errmgr_hnp_global_post_startup(void) { - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_pre_shutdown(void) { - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - opal_pointer_array_t *dead_names; - int32_t i; - orte_process_name_t *name_item; - orte_epoch_t epoch; - orte_job_t *jdat; - orte_proc_t *pdat, *pdat2; - opal_buffer_t *answer; - orte_daemon_cmd_flag_t command; - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender)); - } - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - 
dead_names = OBJ_NEW(opal_pointer_array_t); - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - /* Unpack the buffer to get the dead process' name. */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* Check to see if the message is telling us about an old epoch. - * If so ignore the message. - */ - epoch = orte_util_lookup_epoch(name_item); - if (name_item->epoch < epoch) { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - free(name_item); - continue; - } else { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - } - - opal_pointer_array_add(dead_names, name_item); - - /* Check to see if the message is telling us about an orted and - * it is from another orted. Orteds don't have the list of all - * the application processes so they don't know if there were - * any child processes on the nodes that they are reporting. 
*/ - if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - continue; - } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { - continue; - } else if (NULL == pdat->node) { - continue; - } - - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } else { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - name_item->jobid = pdat2->name.jobid; - name_item->vpid = pdat2->name.vpid; - name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); - - opal_pointer_array_add(dead_names, name_item); - } - } - } - } - - } - - /* Update the number of failed process so any duplicates don't get - * re-reported. - */ - num_failed = opal_pointer_array_get_size(dead_names); - - if (num_failed > 0) { - orte_errmgr.mark_processes_as_dead(dead_names); - - if (!orte_orteds_term_ordered) { - /* Send a message out to all the orteds to inform them that the - * process is dead. Long live the process (or not if it is so - * decided)! 
- */ - answer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return ret; - } - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - } - - OBJ_RELEASE(dead_names); - - return ret; -} - /***************** * Local Functions *****************/ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) { int rc; - + /* if we are already in progress, then ignore this call */ if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, @@ -996,7 +821,7 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) ORTE_JOBID_PRINT(job), exit_code)); return; } - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: abort called on job %s with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -1030,14 +855,14 @@ static void failed_start(orte_job_t *jdata) orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; - + /* 
lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == jdata->jobid) { break; @@ -1048,7 +873,7 @@ static void failed_start(orte_job_t *jdata) return; } jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { @@ -1070,7 +895,7 @@ static void failed_start(orte_job_t *jdata) } } } - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported incomplete start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -1084,14 +909,14 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; - + /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? 
*/ if (jobdat->jobid == jdata->jobid) { break; @@ -1207,7 +1032,7 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, } } } - + /*** UPDATE REMOTE CHILD ***/ for (i=0; i < jdata->procs->size; i++) { if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { @@ -1275,14 +1100,14 @@ static void check_job_complete(orte_job_t *jdata) */ continue; } - + if (0 != proc->exit_code) { non_zero++; if (0 == lowest) { lowest = proc->exit_code; } } - + switch (proc->state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -1333,7 +1158,6 @@ static void check_job_complete(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; -#if 0 case ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s aborted by signal", @@ -1349,7 +1173,6 @@ static void check_job_complete(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; -#endif case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s terminated without sync", @@ -1372,7 +1195,6 @@ static void check_job_complete(orte_job_t *jdata) } break; case ORTE_PROC_STATE_COMM_FAILED: -#if 0 if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ @@ -1382,7 +1204,6 @@ static void check_job_complete(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } -#endif break; case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: if (!jdata->abort) { @@ -1452,7 +1273,7 @@ static void check_job_complete(orte_job_t *jdata) break; } } - + if (jdata->abort) { /* the job aborted - turn off any sensors on this job */ orte_sensor.stop(jdata->jobid); @@ -1487,7 +1308,7 @@ static void check_job_complete(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); } - + /* if this job is a continuously operating one, then 
don't do * anything further - just return here */ @@ -1496,7 +1317,7 @@ static void check_job_complete(orte_job_t *jdata) ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { goto CHECK_ALIVE; } - + /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP @@ -1507,9 +1328,9 @@ static void check_job_complete(orte_job_t *jdata) * This can happen if a ctrl-c hits in the "wrong" place * while launching */ -CHECK_DAEMONS: + CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */ + if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s orteds complete - exiting", @@ -1523,7 +1344,7 @@ CHECK_DAEMONS: } return; } - + /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". 
@@ -1565,8 +1386,8 @@ CHECK_DAEMONS: OBJ_RELEASE(map); jdata->map = NULL; } - -CHECK_ALIVE: + + CHECK_ALIVE: /* now check to see if all jobs are done - release this jdata * object when we find it */ @@ -1672,29 +1493,28 @@ CHECK_ALIVE: } } -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) { opal_pointer_array_t cmd; orte_proc_t proc; int rc; - + /* stop local sensors for this job */ if (ORTE_VPID_WILDCARD == vpid) { orte_sensor.stop(job); } - - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) { + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { ORTE_ERROR_LOG(rc); } return; } - + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - proc.name.epoch = epoch; opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); @@ -1731,7 +1551,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { /* remove this proc from the daemon job */ - orte_errmgr_hnp_record_dead_process(proc); + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); /* check to see if any other nodes are "alive" */ if (!orte_hnp_is_allocated && jdata->num_procs == 1) { return ORTE_ERR_FATAL; @@ -1815,10 +1635,10 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, if (app->max_restarts < pdata->restarts) { return ORTE_ERR_RESTART_LIMIT_EXCEEDED; } - + /* reset the job params for restart */ orte_plm_base_reset_job(jdata); - + /* flag the current node as not-to-be-used */ pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; @@ -1842,7 +1662,7 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) { orte_odls_child_t *child; opal_list_item_t *item; - 
+ child = NULL; for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); @@ -1856,229 +1676,59 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) return NULL; } -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) { - orte_job_t *jdat; - orte_proc_t *pdat; - opal_buffer_t *buffer; - orte_daemon_cmd_flag_t command; - int i, rc, num_failed; - opal_pointer_array_t *dead_names; - orte_process_name_t *name_item; - orte_proc_t *proc_item; - - if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) { - opal_output(0, "Can't find job object"); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && - ORTE_PROC_STATE_TERMINATED != pdat->state) { - - /* Make sure that the epochs match. */ - if (proc->epoch != pdat->name.epoch) { - opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); - return ORTE_SUCCESS; - } - - dead_names = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - opal_pointer_array_add(dead_names, &(pdat->name)); - - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - opal_pointer_array_add(dead_names, &(proc_item->name)); - } - } - - if (!orte_orteds_term_ordered) { - /* - * Send a message to the other daemons so they know that a daemon has - * died. 
- */ - buffer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - num_failed = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else { - - /* Iterate of the list of dead procs and send them along with - * the rest. The HNP needs this info so it can tell the other - * ORTEDs and they can inform the appropriate applications. - */ - for (i = 0; i < num_failed; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } - } - } - - OBJ_RELEASE(dead_names); - - orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0); - - OBJ_RELEASE(buffer); - } - } else { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - orte_job_t *jdat; +void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat, + orte_vpid_t vpid, + orte_proc_state_t state, + orte_exit_code_t exit_code) +{ + orte_job_t *jdt; orte_proc_t *pdat; orte_node_t *node; + int i; - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* Iterate over the list of processes */ - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(1, "NULL found in dead process list."); - continue; - } - - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - OPAL_OUTPUT_VERBOSE((1, 
orte_errmgr_base.output, - "%s Job data not found.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && - ORTE_PROC_STATE_TERMINATED != pdat->state) { - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pdat->name))); - - /* Make sure the epochs match, if not it probably means that we - * already reported this failure. */ - if (name_item->epoch != pdat->name.epoch) { + if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) && + ORTE_PROC_STATE_TERMINATED != pdat->state) { + /* need to record that this one died */ + pdat->state = state; + pdat->exit_code = exit_code; + ORTE_UPDATE_EXIT_STATUS(exit_code); + /* remove it from the job array */ + opal_pointer_array_set_item(jdat->procs, vpid, NULL); + orte_process_info.num_procs--; + jdat->num_procs--; + /* mark the node as down so it won't be used in mapping + * procs to be relaunched + */ + node = pdat->node; + node->state = ORTE_NODE_STATE_DOWN; + node->daemon = NULL; + OBJ_RELEASE(pdat); /* maintain accounting */ + /* mark all procs on this node as having terminated */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } - - orte_util_set_epoch(name_item, name_item->epoch + 1); - - /* Remove it from the job array */ - opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); - orte_process_info.num_procs--; - jdat->num_procs--; - - /* Check if this is an ORTED */ - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - /* Mark the node as down so it won't be used in mapping anymore. 
*/ - node = pdat->node; - node->state = ORTE_NODE_STATE_DOWN; - node->daemon = NULL; + /* get the job data object for this process */ + if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) { + /* It is possible that the process job finishes before the daemons. + * In that case the process state is set to normal termination, and + * the job data has already been cleared. So no need to throw an + * error. + */ + if( ORTE_PROC_STATE_TERMINATED != pdat->state ) { + opal_output(0, + "%s Error: Failed to find job_data for proc %s (%s) on node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&pdat->name), + orte_proc_state_to_str(pdat->state), + node->name ); + /* major problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + continue; } - - OBJ_RELEASE(pdat); - - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; - } else { - opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; - } - - check_job_complete(jdat); - } - - if (!orte_orteds_term_ordered) { - /* Need to update the orted routing module. 
*/ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); + pdat->state = ORTE_PROC_STATE_ABORTED; + jdt->num_terminated++; } } - - return ORTE_SUCCESS; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret = ORTE_SUCCESS; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ret; } diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.h b/orte/mca/errmgr/hnp/errmgr_hnp.h index 4c296d0d8f..5c54a8f537 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.h +++ b/orte/mca/errmgr/hnp/errmgr_hnp.h @@ -1,8 +1,5 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* * $COPYRIGHT$ * @@ -60,6 +57,10 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code); +void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat, + orte_vpid_t vpid, + orte_proc_state_t state, + orte_exit_code_t exit_code); /*************************** * Module functions: Global @@ -80,11 +81,6 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); int orte_errmgr_hnp_global_ft_event(int state); -int orte_errmgr_hnp_global_post_startup(void); -int orte_errmgr_hnp_global_pre_shutdown(void); -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); -int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc); /* HNP Versions */ int orte_errmgr_hnp_base_global_init(void); diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c index 485332fdaa..e598c93a32 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c @@ -1,10 +1,7 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* * $COPYRIGHT$ * @@ -394,7 +391,6 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, orte_node_t *node = NULL; bool found = false; int num_removed = 0, num_to_remove; - orte_ns_cmp_bitmask_t mask; if( NULL == current_global_jobdata ) { return ORTE_SUCCESS; @@ -414,8 +410,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, item = opal_list_get_next(item) ) { wp_item = (errmgr_autor_wp_item_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { + if( wp_item->name.vpid == proc->name.vpid && + wp_item->name.jobid == proc->name.jobid ) { found = true; break; } @@ -522,7 +518,6 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata, wp_item = OBJ_NEW(errmgr_autor_wp_item_t); wp_item->name.jobid = proc->jobid; wp_item->name.vpid = proc->vpid; - wp_item->name.epoch = proc->epoch; wp_item->state = state; opal_list_append(procs_pending_recovery, &(wp_item->super)); @@ -617,7 +612,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, /* * Record the dead daemon */ - orte_errmgr_hnp_record_dead_process(proc); + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0); return; } @@ -626,7 +621,6 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) { wp->name.jobid = ORTE_JOBID_INVALID; wp->name.vpid = ORTE_VPID_INVALID; - wp->name.epoch = ORTE_EPOCH_MIN; wp->state = 0; } @@ -635,7 +629,6 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) { wp->name.jobid = ORTE_JOBID_INVALID; wp->name.vpid = ORTE_VPID_INVALID; - wp->name.epoch = ORTE_EPOCH_INVALID; wp->state = 0; } diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c index b6a45d51db..63d21e1322 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c @@ -2,9 +2,6 @@ * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. 
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. * * $COPYRIGHT$ * @@ -750,7 +747,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } @@ -807,7 +803,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } @@ -855,7 +850,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } diff --git a/orte/mca/errmgr/hnpresil/.windows b/orte/mca/errmgr/hnpresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/hnpresil/Makefile.am b/orte/mca/errmgr/hnpresil/Makefile.am new file mode 100644 index 0000000000..cb4f030a86 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +dist_pkgdata_DATA = help-orte-errmgr-hnp.txt + +sources = \ + errmgr_hnpresil.h \ + errmgr_hnpresil_component.c \ + errmgr_hnpresil.c \ + errmgr_hnpresil_autor.c \ + errmgr_hnpresil_crmig.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_errmgr_hnpresil_DSO +component_noinst = +component_install = mca_errmgr_hnpresil.la +else +component_noinst = libmca_errmgr_hnpresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_hnpresil_la_SOURCES = $(sources) +mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_hnpresil_la_SOURCES =$(sources) +libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c new file mode 100644 index 0000000000..2070a5533e --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c @@ -0,0 +1,2112 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_SYS_WAIT_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/util/opal_sos.h" +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/odls/odls.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/sensor/sensor.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/debugger/base/base.h" +#include "orte/mca/notifier/notifier.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/ess/ess.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +/********************** + * C/R Mgr Components + * Global: HNP + **********************/ +static orte_errmgr_base_module_t global_module = { + /** Initialization Function */ + orte_errmgr_hnpresil_global_module_init, + /** Finalization Function */ + orte_errmgr_hnpresil_global_module_finalize, + /** Error Log */ + orte_errmgr_base_log, + /** Forced Abort */ + orte_errmgr_base_abort, + /** Peer Force Abort */ + orte_errmgr_base_abort_peers, + /** Update State */ + orte_errmgr_hnpresil_global_update_state, + /* Predicted Fault */ + orte_errmgr_hnpresil_global_predicted_fault, + /* Suggest proc to node mapping */ + orte_errmgr_hnpresil_global_suggest_map_targets, + /* FT Event hook */ + orte_errmgr_hnpresil_global_ft_event, + 
orte_errmgr_base_register_migration_warning, + /* Post-startup */ + orte_errmgr_hnpresil_global_post_startup, + /* Pre-shutdown */ + orte_errmgr_hnpresil_global_pre_shutdown, + /* Mark as dead */ + orte_errmgr_hnpresil_global_mark_processes_as_dead, + /* Set the callback */ + orte_errmgr_base_set_fault_callback, + /* Receive failure notification */ + orte_errmgr_hnpresil_global_failure_notification +}; + + +/* + * Local functions + */ +static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); +static void failed_start(orte_job_t *jdata); +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code); +static void check_job_complete(orte_job_t *jdata); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code); +static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); +static int send_to_local_applications(opal_pointer_array_t *dead_names); + +/************************ + * API Definitions + ************************/ +int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp:component_query()"); + + if( ORTE_PROC_IS_HNP ) { + *priority = mca_errmgr_hnpresil_component.super.priority; + *module = (mca_base_module_t *)&global_module; + } + /* Daemons and Apps have their own components */ + else { + *module = NULL; + *priority = -1; + } + + return ORTE_SUCCESS; +} + +/******************* + * Global Functions + ********************/ +int orte_errmgr_hnpresil_global_module_init(void) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_module_init()) ) { + exit_status = ret; + goto 
cleanup; + } + } + else { + /* Still need the tool listener so we can tell it that we cannot do + * anything if they ask. + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_module_init()) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_init()) ) { + exit_status = ret; + goto cleanup; + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_module_finalize(void) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_module_finalize()) ) { + exit_status = ret; + goto cleanup; + } + } + else { + /* Still need the tool listener so we can tell it that we cannot do + * anything if they ask. 
+ */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_module_finalize()) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_finalize()) ) { + exit_status = ret; + goto cleanup; + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + int ret, exit_status = ORTE_SUCCESS; + + mca_errmgr_hnpresil_component.ignore_current_update = false; + + if (orte_finalizing || + orte_job_term_ordered || + ORTE_PROC_STATE_TERMINATED == state ) { + mca_errmgr_hnpresil_component.term_in_progress = true; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:hnp:update_state() %s) " + "------- %s state updated for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ((NULL == proc_name) ? "App. Process" : + (proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), + (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name))); + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled && + !mca_errmgr_hnpresil_component.autor_in_progress) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled && + !mca_errmgr_hnpresil_component.crmig_in_progress) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( !mca_errmgr_hnpresil_component.ignore_current_update ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ +#if OPAL_ENABLE_FT_CR + int ret, exit_status = ORTE_SUCCESS; + + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_predicted_fault(proc_list, + node_list, + suggested_map)) ) { + exit_status = ret; + goto cleanup; + } + } + /* + * If Process migration is not enabled, then return an error the tool + * which will print an appropriate message for the user. 
+ */ + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp:predicted_fault() Command line asked for a migration, but it is not enabled\n")); + orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERROR); + exit_status = ORTE_ERR_NOT_IMPLEMENTED; + goto cleanup; + } + +cleanup: + return exit_status; +#else + return ORTE_ERR_NOT_IMPLEMENTED; +#endif /* OPAL_ENABLE_FT_CR */ +} + +int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ +#if OPAL_ENABLE_FT_CR + int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED; + + if( mca_errmgr_hnpresil_component.crmig_enabled && + !mca_errmgr_hnpresil_component.autor_in_progress ) { + exit_status = ORTE_SUCCESS; + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_suggest_map_targets(proc, + oldnode, + node_list)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled && + !mca_errmgr_hnpresil_component.crmig_in_progress ) { + exit_status = ORTE_SUCCESS; + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_suggest_map_targets(proc, + oldnode, + node_list)) ) { + exit_status = ret; + goto cleanup; + } + } + +cleanup: + return exit_status; +#else + return ORTE_ERR_NOT_IMPLEMENTED; +#endif /* OPAL_ENABLE_FT_CR */ +} + +int orte_errmgr_hnpresil_global_ft_event(int state) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( !mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( !mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + 
+cleanup: + return exit_status; +} + + +/********************** + * From HNP + **********************/ +int orte_errmgr_hnpresil_base_global_init(void) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_finalize(void) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_job_t *jdata; + orte_exit_code_t sts; + orte_odls_child_t *child; + int rc; + orte_app_context_t *app; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported state %s" + " for proc %s state %s pid %d exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), pid, exit_code)); + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + if (NULL == proc) { + /* this is an update for an entire local job */ + if (ORTE_JOBID_INVALID == job) { + /* whatever happened, we don't know what job + * it happened to + */ + if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { + orte_never_launched = true; + } + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error", + true, orte_job_state_to_str(jobstate)); + hnp_abort(job, exit_code); + return ORTE_SUCCESS; + } + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + /* update the state */ + jdata->state = jobstate; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + switch (jobstate) { + case ORTE_JOB_STATE_TERMINATED: + /* support batch-operated jobs */ + 
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); + break; + + case ORTE_JOB_STATE_ABORTED: + /* support batch-operated jobs */ + update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); + break; + + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jdata); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + sts = exit_code; + if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + if (WIFSIGNALED(exit_code)) { /* died on signal */ +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); + } else { + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); + } +#else + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); +#endif /* WCOREDUMP */ + } else { + orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, + WEXITSTATUS(exit_code)); + sts = WEXITSTATUS(exit_code); + } + } + hnp_abort(jdata->jobid, sts); + } + break; + case ORTE_JOB_STATE_RUNNING: + /* update all procs in job */ + update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); + /* record that we reported */ + jdata->num_daemons_reported++; + /* report if requested */ + if (orte_report_launch_progress) { + if (0 == 
jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { + opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", + (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, + (int)jdata->num_launched, (int)jdata->num_procs); + } + } + break; + case ORTE_JOB_STATE_NEVER_LAUNCHED: + orte_never_launched = true; + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_procs_in_job(jdata, jobstate, + ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, + exit_code); + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_COMM_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + default: + break; + } + return ORTE_SUCCESS; + } + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + /* if the orteds are terminating, check job complete */ + if (orte_orteds_term_ordered) { + opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); + check_job_complete(NULL); + return ORTE_SUCCESS; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + } + +#if OPAL_ENABLE_FT_CR + /* Notify the process state to the notifier framework if it is + active and selected. 
*/ + orte_errmgr_base_proc_state_notify(state, proc); +#endif + + /* update is for a specific proc */ + switch (state) { + case ORTE_PROC_STATE_ABORTED: + case ORTE_PROC_STATE_ABORTED_BY_SIG: + case ORTE_PROC_STATE_TERM_WO_SYNC: + if( jdata->enable_recovery ) { + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + + if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { + exit_code = 0; + } + + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_CALLED_ABORT: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_REGISTERED: + case ORTE_PROC_STATE_RUNNING: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_LAUNCHED: + /* record the pid for this child */ + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_TERMINATED: + case ORTE_PROC_STATE_TERM_NON_ZERO: + case ORTE_PROC_STATE_KILLED_BY_CMD: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (jdata->enable_recovery) { + killprocs(proc->jobid, proc->vpid, proc->epoch); + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + /* kill all jobs */ + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_COMM_FAILED: + /* is this to a daemon? */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* if this is my own connection, ignore it */ + if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s My own connection - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + /* if we have ordered orteds to terminate, record it */ + if (orte_orteds_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Daemons terminating - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process + */ + check_job_complete(jdata); + break; + } + /* if abort is in progress, see if this one failed to tell + * us it had terminated + */ + if (orte_abnormal_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Abort in progress - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process + */ + check_job_complete(jdata); + break; + } + + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); + + if( orte_enable_recovery ) { + /* relocate its processes */ + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, 
exit_code))) { + /* unable to relocate for some reason */ + opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); + } + } else { + if (ORTE_SUCCESS != orte_errmgr_hnpresil_record_dead_process(proc)) { + /* The process is already dead so don't keep trying to do + * this stuff. */ + return ORTE_SUCCESS; + } + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process */ + } + } + break; + + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + /* heartbeats are only from daemons */ + if( orte_enable_recovery ) { + /* relocate its processes */ + } else { + orte_errmgr_hnpresil_record_dead_process(proc); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + return ORTE_ERR_UNRECOVERABLE; + } + break; + + default: + break; + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_post_startup(void) { + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_pre_shutdown(void) { + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { + orte_std_cntr_t n; + int ret = ORTE_SUCCESS, num_failed; + opal_pointer_array_t *dead_names; + int32_t i; + orte_process_name_t *name_item; + orte_epoch_t epoch; + orte_job_t *jdat; + orte_proc_t *pdat, *pdat2; + opal_buffer_t *answer; + orte_daemon_cmd_flag_t command; + + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP received process failed from orted 
%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender)); + } + + n = 1; + /* Get the number of failed procs */ + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + return ret; + } + + dead_names = OBJ_NEW(opal_pointer_array_t); + + for (i = 0; i < num_failed; i++) { + name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); + + /* Unpack the buffer to get the dead process' name. */ + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* Check to see if the message is telling us about an old epoch. + * If so ignore the message. + */ + epoch = orte_util_lookup_epoch(name_item); + if (name_item->epoch < epoch) { + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item), + ORTE_EPOCH_PRINT(name_item->epoch), + ORTE_EPOCH_PRINT(epoch)); + } + free(name_item); + continue; + } else { + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item), + ORTE_EPOCH_PRINT(name_item->epoch), + ORTE_EPOCH_PRINT(epoch)); + } + } + + opal_pointer_array_add(dead_names, name_item); + + /* Check to see if the message is telling us about an orted and + * it is from another orted. Orteds don't have the list of all + * the application processes so they don't know if there were + * any child processes on the nodes that they are reporting. 
*/ + if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { + if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { + continue; + } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { + continue; + } else if (NULL == pdat->node) { + continue; + } + + if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { + for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { + if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { + continue; + } + + /* ignore this process if it has already terminated */ + if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) { + continue; + } + + /* the proc must have been alive, so notify everyone that it died */ + name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); + + name_item->jobid = pdat2->name.jobid; + name_item->vpid = pdat2->name.vpid; + name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); + + opal_pointer_array_add(dead_names, name_item); + } + } + } + + } + + /* Update the number of failed process so any duplicates don't get + * re-reported. + */ + num_failed = opal_pointer_array_get_size(dead_names); + + if (num_failed > 0) { + orte_errmgr.mark_processes_as_dead(dead_names); + + if (!orte_orteds_term_ordered) { + /* Send a message out to all the orteds to inform them that the + * process is dead. Long live the process (or not if it is so + * decided)! 
+ */ + answer = OBJ_NEW(opal_buffer_t); + command = ORTE_PROCESS_FAILED_NOTIFICATION; + + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + } + } + + if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + /* Tell the applications' ORTE layers that there is a failure. */ + if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { + return ret; + } + } + + for (i = 0; i < num_failed; i++) { + name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); + free(name_item); + } + } + + OBJ_RELEASE(dead_names); + + return ret; +} + +/***************** + * Local Functions + *****************/ +static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) +{ + int rc; + + /* if we are already in progress, then ignore this call */ + if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: abort called on job %s with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + + /* if debuggers are running, clean up */ + 
orte_debugger.finalize(); + + /* set control params to indicate we are terminating */ + orte_job_term_ordered = true; + orte_abnormal_term_ordered = true; + orte_enable_recovery = false; + + /* set the exit status, just in case whomever called us failed + * to do so - it can only be done once, so we are protected + * from overwriting it + */ + ORTE_UPDATE_EXIT_STATUS(exit_code); + + /* tell the plm to terminate the orteds - they will automatically + * kill their local procs + */ + if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { + ORTE_ERROR_LOG(rc); + } +} + +static void failed_start(orte_job_t *jdata) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat; + orte_odls_child_t *child; + orte_proc_t *proc; + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == jdata->jobid) { + break; + } + } + if (NULL == jobdat) { + /* race condition - may not have been formed yet */ + return; + } + jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; + + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = (orte_odls_child_t*)item; + if (child->name->jobid == jobdat->jobid) { + if (ORTE_PROC_STATE_LAUNCHED > child->state || + ORTE_PROC_STATE_UNTERMINATED < child->state) { + /* get the master proc object */ + proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + proc->state = child->state; + proc->exit_code = child->exit_code; + /* update the counter so we can terminate */ + jdata->num_terminated++; + /* remove the child from our list */ + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + jobdat->num_local_procs--; + } + } + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported incomplete start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); +} + +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat; + orte_odls_child_t *child; + orte_proc_t *proc; + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == jdata->jobid) { + break; + } + } + /* the search loop assigns jobdat on every pass, so a miss leaves it + * pointing at the last list entry; the cursor sitting on the end + * sentinel is the reliable "not found" signal + */ + if (NULL == jobdat || item == opal_list_get_end(&orte_local_jobdata)) { + /* race condition - may not have been formed yet */ + return; + } + jobdat->state = jobstate; + jdata->state = jobstate; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = (orte_odls_child_t*)item; + if (jdata->jobid == child->name->jobid) { + child->state = state; + /* mirror the state into the master proc object when the slot is populated */ + proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + if (NULL != proc) { + proc->state = state; + /* never lower an exit code that was already recorded */ + if (proc->exit_code < exit_code) { + proc->exit_code = exit_code; + } + } + if (ORTE_PROC_STATE_UNTERMINATED < state) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + jdata->num_terminated++; + jobdat->num_local_procs--; + } else if (ORTE_PROC_STATE_RUNNING == state) { + /* compare against state - testing the bare constant made the + * REGISTERED branch below unreachable + */ + jdata->num_launched++; + } else if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } + } + } +} + +void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_proc_t *proct; + orte_odls_job_t *jobdat, *jdat; + int i; + + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jdat = (orte_odls_job_t*)item; + if (jdat->jobid == jdata->jobid) { + jobdat = jdat; + break; + } + } + if (NULL == jobdat) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + + /*** UPDATE LOCAL CHILD ***/ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = 
(orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid) { + if (child->name->vpid == proc->vpid) { + child->state = state; + if (0 < pid) { + child->pid = pid; + } + child->exit_code = exit_code; + proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + proct->state = state; + if (0 < pid) { + proct->pid = pid; + } + proct->exit_code = exit_code; + if (ORTE_PROC_STATE_UNTERMINATED < state) { + if (!jdata->enable_recovery) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + if (NULL != jobdat) { + jobdat->num_local_procs--; + } + } + jdata->num_terminated++; + } else if (ORTE_PROC_STATE_RUNNING == state) { + jdata->num_launched++; + if (jdata->num_launched == jdata->num_procs) { + jdata->state = ORTE_JOB_STATE_RUNNING; + } + } else if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } + return; + } + } + } + + /*** UPDATE REMOTE CHILD ***/ + for (i=0; i < jdata->procs->size; i++) { + if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + if (proct->name.jobid != proc->jobid || + proct->name.vpid != proc->vpid) { + continue; + } + proct->state = state; + if (0 < pid) { + proct->pid = pid; + } + proct->exit_code = exit_code; + if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } else if (ORTE_PROC_STATE_UNTERMINATED < state) { + /* update the counter so we can terminate */ + jdata->num_terminated++; + } else if (ORTE_PROC_STATE_RUNNING == state) { + jdata->num_launched++; + if (jdata->num_launched == jdata->num_procs) { + jdata->state = 
ORTE_JOB_STATE_RUNNING; + } + } + return; + } +} + +static void check_job_complete(orte_job_t *jdata) +{ + orte_proc_t *proc; + int i; + orte_std_cntr_t j; + orte_job_t *job; + orte_node_t *node; + orte_job_map_t *map; + orte_std_cntr_t index; + bool one_still_alive; + orte_vpid_t non_zero=0, lowest=0; + char *msg; + +#if 0 + /* Check if FileM is active. If so then keep processing. */ + OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); +#endif + if (NULL == jdata) { + /* just check to see if the daemons are complete */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CHECK_DAEMONS; + } + + for (i=0; i < jdata->procs->size && !jdata->abort; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + /* the proc array may no longer be left justified, so + * we need to check everything + */ + continue; + } + + if (0 != proc->exit_code) { + non_zero++; + if (0 == lowest) { + lowest = proc->exit_code; + } + } + + switch (proc->state) { + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such - just check the remaining jobs to + * see if anyone is still alive + */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated - now we need to check to see if ALL + * the other jobs have also completed and wakeup if that is true + */ + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; + } + } + goto CHECK_ALIVE; + break; + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_FAILED_TO_START: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnpresil:check_job_completed proc %s failed to start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; +#if 0 + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; +#endif + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + /* now treat a 
special case - if the proc exit'd without a required + * sync, it may have done so with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + break; + case ORTE_PROC_STATE_COMM_FAILED: +#if 0 + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } +#endif + break; + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_CALLED_ABORT: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_TERM_NON_ZERO: + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + if (orte_abort_non_zero_exit) { + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; + /* point to the lowest rank to cause the problem */ + 
jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + } + } + break; + + default: + if (ORTE_PROC_STATE_UNTERMINATED < proc->state && + jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + proc->state = ORTE_PROC_STATE_ABORTED; + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + } + break; + } + } + + if (jdata->abort) { + /* the job aborted - turn off any sensors on this job */ + orte_sensor.stop(jdata->jobid); + } + + if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && + jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + jdata->state = ORTE_JOB_STATE_TERMINATED; + + /* turn off any sensor monitors on this job */ + orte_sensor.stop(jdata->jobid); + + if (0 < non_zero) { + if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { + /* update the exit code */ + ORTE_UPDATE_EXIT_STATUS(lowest); + } + + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "While %s job %s terminated normally, %s %s. Further examination may be required.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + ORTE_VPID_PRINT(non_zero), + (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); + } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + } + + /* if this job is a continuously operating one, then don't do + * anything further - just return here + */ + if (NULL != jdata && + (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || + ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { + goto CHECK_ALIVE; + } + + /* if the job that is being checked is the HNP, then we are + * trying to terminate the orteds. In that situation, we + * do -not- check all jobs - we simply notify the HNP + * that the orteds are complete. Also check special case + * if jdata is NULL - we want + * to definitely declare the job done if the orteds + * have completed, no matter what else may be happening. + * This can happen if a ctrl-c hits in the "wrong" place + * while launching + */ +CHECK_DAEMONS: + if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + /* a NULL jdata stands for the daemon job: look it up before any + * of its fields are read, and bail out if it cannot be found + */ + if (NULL == jdata) { + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + if (NULL == jdata) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return; + } + } + if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */ + /* orteds are done! */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + jdata->state = ORTE_JOB_STATE_TERMINATED; + orte_quit(); + return; + } + return; + } + + /* Release the resources used by this job. Since some errmgrs may want + * to continue using resources allocated to the job as part of their + * fault recovery procedure, we only do this once the job is "complete". + * Note that an aborted/killed job -is- flagged as complete and will + * therefore have its resources released. 
We need to do this after + * we call the errmgr so that any attempt to restart the job will + * avoid doing so in the exact same place as the current job + */ + if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { + map = jdata->map; + for (index = 0; index < map->nodes->size; index++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { + continue; + } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s releasing procs from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); + for (i = 0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + /* skip procs from another job */ + continue; + } + node->slots_inuse--; + node->num_procs--; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s releasing proc %s from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), node->name)); + /* set the entry in the node array to NULL */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* release the proc once for the map entry */ + OBJ_RELEASE(proc); + } + } + OBJ_RELEASE(map); + jdata->map = NULL; + } + +CHECK_ALIVE: + /* now check to see if all jobs are done - release this jdata + * object when we find it + */ + one_still_alive = false; + for (j=1; j < orte_job_data->size; j++) { + if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { + /* since we are releasing jdata objects as we + * go, we can no longer assume that the job_data + * array is left justified + */ + continue; + } + /* if this is the job we are checking AND it normally terminated, + * then go ahead and release it. 
We cannot release it if it + * abnormally terminated as mpirun needs the info so it can + * report appropriately to the user + * + * NOTE: do not release the primary job (j=1) so we + * can pretty-print completion message + */ + if (NULL != jdata && job->jobid == jdata->jobid && + (jdata->state == ORTE_JOB_STATE_TERMINATED || + jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + if (1 < j) { + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + } + continue; + } + /* if the job is flagged to not be monitored, skip it */ + if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { + continue; + } + /* when checking for job termination, we must be sure to NOT check + * our own job as it - rather obviously - has NOT terminated! + */ + if (job->num_terminated < job->num_procs) { + /* we have at least one job that is not done yet - we cannot + * just return, though, as we need to ensure we cleanout the + * job data for the job that just completed + */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs)); + one_still_alive = true; + } + else { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs, + (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); + } + } + /* if a job is still alive, we just return */ + if (one_still_alive) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed at least one job is not terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + /* if we get here, then all jobs are done, so terminate */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed all jobs terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* set the exit status to 0 - this will only happen if it + * wasn't already set by an error condition + */ + ORTE_UPDATE_EXIT_STATUS(0); + /* provide a notifier message if that framework is active - ignored otherwise */ + if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { + if (NULL == job->name) { + job->name = strdup(orte_process_info.nodename); + } + if (NULL == job->instance) { + asprintf(&job->instance, "%d", orte_process_info.pid); + } + if (0 == orte_exit_status) { + asprintf(&msg, "Job %s:%s complete", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); + } else { + asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); + } + free(msg); + /* this job object will be release during finalize */ + } + + orte_jobs_complete(); + /* if I am the only daemon alive, then I can exit now */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_quit(); + } +} + +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +{ + opal_pointer_array_t cmd; + orte_proc_t proc; + int rc; + + /* stop local sensors for this job */ + if (ORTE_VPID_WILDCARD == vpid) { + orte_sensor.stop(job); + } + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) 
{ + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { + ORTE_ERROR_LOG(rc); + } + return; + } + + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); + OBJ_CONSTRUCT(&proc, orte_proc_t); + proc.name.jobid = job; + proc.name.vpid = vpid; + proc.name.epoch = epoch; + opal_pointer_array_add(&cmd, &proc); + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { + ORTE_ERROR_LOG(rc); + } + OBJ_DESTRUCT(&cmd); + OBJ_DESTRUCT(&proc); +} + +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code) +{ + orte_job_t *jdat; + orte_proc_t *pdata, *pdt, *pdt2; + orte_node_t *node, *nd; + orte_app_context_t *app; + char *app_name; + int rc, i, n; + + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s CHECKING ON RELOCATE FOR APP %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + /* get the proc_t object for this process */ + pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); + if (NULL == pdata) { + opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); + return ORTE_ERR_NOT_FOUND; + } + + /* set the state */ + pdata->state = state; + + /* retain the node id */ + node = pdata->node; + + /* if it is a daemon that died, we need to flag all of its procs + * to be relocated + */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* remove this proc from the daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* check to see if any other nodes are "alive" */ + if (!orte_hnp_is_allocated && jdata->num_procs == 1) { + return ORTE_ERR_FATAL; + } + app_name = "orted"; + /* scan the procs looking for each unique jobid on the node */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* get the job data object for this process */ + if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { + /* major problem */ + 
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + /* since the node was used in this job's map, release + * it so that accounting is maintained + */ + OBJ_RELEASE(node); + /* mark this proc as dead so it will be restarted */ + pdt->state = ORTE_PROC_STATE_ABORTED; + /* remove this proc from the node */ + OBJ_RELEASE(pdt); /* maintains accounting */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* maintain accounting on num procs alive in case this can't restart */ + jdat->num_terminated++; + /* look for all other procs on this node from the same job */ + for (n=0; n < node->procs->size; n++) { + if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { + continue; + } + if (pdt2->name.jobid == pdt->name.jobid) { + /* mark this proc as having aborted */ + pdt2->state = ORTE_PROC_STATE_ABORTED; + /* remove it from the node */ + OBJ_RELEASE(pdt2); + opal_pointer_array_set_item(node->procs, n, NULL); + /* maintain accounting on num procs alive */ + jdat->num_terminated++; + } + } + /* and remove the node from the map */ + for (n=0; n < jdat->map->nodes->size; n++) { + if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { + continue; + } + if (nd->index == node->index) { + opal_pointer_array_set_item(jdat->map->nodes, n, NULL); + OBJ_RELEASE(node); /* maintain accounting */ + break; + } + } + /* reset the job params for this job */ + orte_plm_base_reset_job(jdat); + + /* relaunch the job */ + opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); + return rc; + } + } + + return ORTE_SUCCESS; + } + + /* otherwise, we are an app - try to relocate us to another node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); + if (NULL == app) { + /* 
no way to restart this job */ + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, + ORTE_NAME_PRINT(proc)); + return ORTE_ERR_NOT_FOUND; + } + app_name = app->app; + /* track that we are attempting to restart */ + pdata->restarts++; + /* have we exceeded the number of restarts for this proc? */ + if (app->max_restarts < pdata->restarts) { + return ORTE_ERR_RESTART_LIMIT_EXCEEDED; + } + + /* reset the job params for restart */ + orte_plm_base_reset_job(jdata); + + /* flag the current node as not-to-be-used */ + pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; + + /* restart the job - the spawn function will remap and + * launch the replacement proc(s) + */ + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s RELOCATING APP %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); + return rc; + } + + return ORTE_SUCCESS; +} + +static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) +{ + orte_odls_child_t *child; + opal_list_item_t *item; + + child = NULL; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { + return child; + } + } + return NULL; +} + +static void cbfunc(int status, + orte_process_name_t *peer, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void* cbdata) { + OBJ_RELEASE(buffer); +} + +int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc) { + orte_job_t *jdat; + orte_proc_t *pdat; + opal_buffer_t *buffer; + orte_daemon_cmd_flag_t command; + int i, rc, num_failed; + opal_pointer_array_t *dead_names; + orte_process_name_t *name_item; + orte_proc_t *proc_item; + + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s RECORDING DEAD 
PROCESS %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) { + opal_output(0, "Can't find job object"); + return ORTE_ERR_NOT_FOUND; + } + + if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && + ORTE_PROC_STATE_TERMINATED < pdat->state) { + + /* Make sure that the epochs match. */ + if (proc->epoch != pdat->name.epoch) { + opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); + return ORTE_SUCCESS; + } + + dead_names = OBJ_NEW(opal_pointer_array_t); + + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + opal_pointer_array_add(dead_names, &(pdat->name)); + + for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { + if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { + continue; + } + + opal_pointer_array_add(dead_names, &(proc_item->name)); + } + } + + if (!orte_orteds_term_ordered) { + /* + * Send a message to the other daemons so they know that a daemon has + * died. + */ + buffer = OBJ_NEW(opal_buffer_t); + command = ORTE_PROCESS_FAILED_NOTIFICATION; + + num_failed = opal_pointer_array_get_size(dead_names); + + /* on either header pack failure nothing is sent, so drop both the + * buffer and the name list created above + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + OBJ_RELEASE(dead_names); + } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + OBJ_RELEASE(dead_names); + } else { + + /* Iterate over the list of dead procs and send them along with + * the rest. The HNP needs this info so it can tell the other + * ORTEDs and they can inform the appropriate applications. 
+ */ + for (i = 0; i < num_failed; i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + /* give up on this message: drop the buffer once and + * stop packing so it is neither released again nor sent + */ + OBJ_RELEASE(buffer); + buffer = NULL; + break; + } + } + } + + OBJ_RELEASE(dead_names); + + if (NULL != buffer) { + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s SENDING DEAD PROCESS MESSAGE TO HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0, cbfunc, NULL); + } + } + } else { + /* NOTE(review): dead_names is never released on this path - confirm + * whether the callee or its fault callback keeps the array + */ + orte_errmgr_hnpresil_global_mark_processes_as_dead(dead_names); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { + int i; + orte_process_name_t *name_item; + orte_job_t *jdat; + orte_proc_t *pdat; + orte_node_t *node; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "HNP %s marking procs as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* Iterate over the list of processes */ + for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { + if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { + opal_output(1, "NULL found in dead process list."); + continue; + } + + if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s Job data not found.", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_ERR_NOT_FOUND; + } + + if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && + pdat->state < ORTE_PROC_STATE_TERMINATED) { + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "HNP %s marking %s as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&pdat->name))); + + /* Make sure the epochs match, if not it probably means that we + * already reported this failure. 
*/ + if (name_item->epoch != pdat->name.epoch) { + continue; + } + + orte_util_set_epoch(name_item, name_item->epoch + 1); + + /* Remove it from the job array */ + opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); + orte_process_info.num_procs--; + jdat->num_procs--; + + /* Check if this is an ORTED */ + if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { + /* Mark the node as down so it won't be used in mapping anymore. */ + node = pdat->node; + node->state = ORTE_NODE_STATE_DOWN; + node->daemon = NULL; + } + + OBJ_RELEASE(pdat); + + /* Create a new proc object that will keep track of the epoch + * information */ + pdat = OBJ_NEW(orte_proc_t); + pdat->name.jobid = jdat->jobid; + pdat->name.vpid = name_item->vpid; + pdat->name.epoch = name_item->epoch + 1; + + /* Set the state as terminated so we'll know the process isn't + * actually there. */ + pdat->state = ORTE_PROC_STATE_TERMINATED; + + opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); + jdat->num_procs++; + jdat->num_terminated++; + } else { + opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); + /* Create a new proc object that will keep track of the epoch + * information */ + pdat = OBJ_NEW(orte_proc_t); + pdat->name.jobid = jdat->jobid; + pdat->name.vpid = name_item->vpid; + pdat->name.epoch = name_item->epoch + 1; + + /* Set the state as terminated so we'll know the process isn't + * actually there. */ + pdat->state = ORTE_PROC_STATE_TERMINATED; + + opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); + jdat->num_procs++; + jdat->num_terminated++; + } + + check_job_complete(jdat); + } + + if (!orte_orteds_term_ordered) { + /* Need to update the orted routing module. 
*/ + orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); + + if (NULL != fault_cbfunc) { + (*fault_cbfunc)(dead_procs); + } + } + + return ORTE_SUCCESS; +} + +int send_to_local_applications(opal_pointer_array_t *dead_names) { + opal_buffer_t *buf; + int ret = ORTE_SUCCESS; + orte_process_name_t *name_item; + int size, i; + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s Sending failure to local applications.", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + buf = OBJ_NEW(opal_buffer_t); + + size = opal_pointer_array_get_size(dead_names); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + for (i = 0; i < size; i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + } + } + + if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + OBJ_RELEASE(buf); + + return ret; +} diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h new file mode 100644 index 0000000000..d9ac6ddcc6 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H +#define MCA_ERRMGR_HNPRESIL_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ +struct orte_errmgr_hnpresil_component_t { + orte_errmgr_base_component_t super; /** Base Errmgr component */ + + bool ignore_current_update; + bool term_in_progress; + +#if OPAL_ENABLE_FT_CR + /* State of the Recovery */ + bool crmig_in_progress; + bool autor_in_progress; + + /* CRMig Options */ + bool crmig_enabled; + bool crmig_timing_enabled; + + /* AutoR Options */ + bool autor_enabled; + bool autor_timing_enabled; + int autor_recovery_delay; + bool autor_skip_oldnode; +#endif +}; +typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t; +OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component; + +int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority); + +void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +/*************************** + * Module functions: Global + ***************************/ +int orte_errmgr_hnpresil_global_module_init(void); +int orte_errmgr_hnpresil_global_module_finalize(void); + +int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); +int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_global_ft_event(int state); +int orte_errmgr_hnpresil_global_post_startup(void); +int 
orte_errmgr_hnpresil_global_pre_shutdown(void); +int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); +int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); +int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc); + +/* hnpresil Versions */ +int orte_errmgr_hnpresil_base_global_init(void); +int orte_errmgr_hnpresil_base_global_finalize(void); +int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_base_global_ft_event(int state); + +#if OPAL_ENABLE_FT_CR +/* CRMig Versions */ +int orte_errmgr_hnpresil_crmig_global_module_init(void); +int orte_errmgr_hnpresil_crmig_global_module_finalize(void); + +int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); +int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_crmig_global_ft_event(int state); + +/* AutoR Versions */ +int orte_errmgr_hnpresil_autor_global_module_init(void); +int orte_errmgr_hnpresil_autor_global_module_finalize(void); + +int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_autor_global_ft_event(int state); +#endif + +END_C_DECLS + +#endif /* 
MCA_ERRMGR_HNPRESIL_EXPORT_H */ diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c new file mode 100644 index 0000000000..6a51b7f239 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c @@ -0,0 +1,1033 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/show_help.h" +#include "opal/util/output.h" +#include "opal/util/opal_environ.h" +#include "opal/util/basename.h" +#include "opal/util/argv.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/crs/crs.h" +#include "opal/mca/crs/base/base.h" +#include "opal/mca/event/event.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" +#include "opal/dss/dss.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/filem/filem.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/snapc/snapc.h" +#include "orte/mca/snapc/base/base.h" +#include "orte/mca/sstore/sstore.h" +#include "orte/mca/sstore/base/base.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include 
"orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +#include MCA_timer_IMPLEMENTATION_HEADER + +#if OPAL_ENABLE_FT_CR +/************************ + * Work Pool structures + ************************/ +struct errmgr_autor_wp_item_t { + /** List super object */ + opal_list_item_t super; + + /** ORTE Process name */ + orte_process_name_t name; + + /** State that was passed with it */ + orte_proc_state_t state; +}; +typedef struct errmgr_autor_wp_item_t errmgr_autor_wp_item_t; + +OBJ_CLASS_DECLARATION(errmgr_autor_wp_item_t); + +void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp); +void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp); + +OBJ_CLASS_INSTANCE(errmgr_autor_wp_item_t, + opal_list_item_t, + errmgr_autor_wp_item_construct, + errmgr_autor_wp_item_destruct); + +/************************************ + * Locally Global vars & functions :) + ************************************/ +static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; +static orte_job_t *current_global_jobdata = NULL; + +static bool autor_mask_faults = false; + +static opal_list_t *procs_pending_recovery = NULL; +static bool autor_timer_active = false; +static opal_event_t *autor_timer_event = NULL; + +static void errmgr_autor_recover_processes(int fd, short event, void *cbdata); +static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name); + +static int display_procs(void ); +static int autor_procs_sort_compare_fn(opal_list_item_t **a, + opal_list_item_t **b); + +static int orte_errmgr_hnpresil_autor_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state); +static void errmgr_autor_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); +static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); + +static int check_if_terminated(opal_pointer_array_t *procs); +static int 
check_if_restarted(opal_pointer_array_t *procs); + +/* + * Timer stuff + */ +static void errmgr_autor_set_time(int idx); +static void errmgr_autor_display_all_timers(void); +static void errmgr_autor_clear_timers(void); + +static double errmgr_autor_get_time(void); +static void errmgr_autor_display_indv_timer_core(double diff, char *str); +static double timer_start[OPAL_CR_TIMER_MAX]; + +#define ERRMGR_AUTOR_TIMER_START 0 +#define ERRMGR_AUTOR_TIMER_SETUP 1 +#define ERRMGR_AUTOR_TIMER_TERM 2 +#define ERRMGR_AUTOR_TIMER_RESETUP 3 +#define ERRMGR_AUTOR_TIMER_RESTART 4 +#define ERRMGR_AUTOR_TIMER_FINISH 5 +#define ERRMGR_AUTOR_TIMER_MAX 6 + +#define ERRMGR_AUTOR_CLEAR_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_clear_timers(); \ + } \ + } + +#define ERRMGR_AUTOR_SET_TIMER(idx) \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_set_time(idx); \ + } \ + } + +#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_display_all_timers(); \ + } \ + } + +/************************ + * Function Definitions: Global + ************************/ +int orte_errmgr_hnpresil_autor_global_module_init(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):init()"); + + procs_pending_recovery = OBJ_NEW(opal_list_t); + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + if( NULL == autor_timer_event ) { + autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL); + } + + ERRMGR_AUTOR_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_module_finalize(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):finalize()"); + + if( NULL != procs_pending_recovery ) { + 
OBJ_RELEASE(procs_pending_recovery); + procs_pending_recovery = NULL; + } + if( NULL != autor_timer_event ) { + free(autor_timer_event); + autor_timer_event = NULL; + } + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + ERRMGR_AUTOR_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name) +{ + orte_job_t *jdata = NULL; + int i; + + /* + * If we already figured it out, then just move ahead + */ + if( NULL != current_global_jobdata ) { + if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid && + given_jdata->jobid != current_global_jobdata->jobid ) { + current_global_jobdata = given_jdata; + current_global_jobid = given_jdata->jobid; + } + return ORTE_SUCCESS; + } + + /* + * If this references the application, and not the daemons + */ + if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid ) { + current_global_jobdata = given_jdata; + current_global_jobid = given_jdata->jobid; + return ORTE_SUCCESS; + } + + /* + * Otherwise iterate through the job structure and find the first job. 
+ */ + for(i = 0; i < orte_job_data->size; ++i ) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { + continue; + } + /* Exclude outselves */ + if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { + continue; + } + current_global_jobdata = jdata; + current_global_jobid = jdata->jobid; + break; + } + + if( NULL == current_global_jobdata ) { + opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job."); + return ORTE_ERROR; + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_proc_t *loc_proc = NULL; + orte_job_t *jdata = NULL; + int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; + int32_t i; + + /* + * if orte is trying to shutdown, just let it + */ + if( mca_errmgr_hnpresil_component.term_in_progress ) { + return ORTE_SUCCESS; + } + + if( NULL != proc_name && + OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state) )); + return ORTE_SUCCESS; + } + + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name) ); + ret = ORTE_ERROR; + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * If this is a tool, ignore + */ + if( jdata->num_apps == 0 && + OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state), exit_code)); + + if( ORTE_JOB_STATE_RESTART == jobstate ) { + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + break; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || + ORTE_PROC_STATE_COMM_FAILED == state ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_process_fault(jdata, proc_name, state)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { + if( autor_mask_faults ) { + mca_errmgr_hnpresil_component.ignore_current_update = true; + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, exit_code); + } + } + + cleanup: + return ret; +} + +static int orte_errmgr_hnpresil_autor_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state) +{ + int ret; + + /* + * Recover from the process failure by relaunching. 
+ */ + if( ORTE_SUCCESS != (ret = autor_set_current_job_info(jdata, proc_name)) ) { + ORTE_ERROR_LOG(ret); + return ORTE_SUCCESS; /* JJH: Do this for now. Need to fix the flag for normal shutdown */ + /*return ret;*/ + } + + current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; + + if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { + errmgr_autor_process_fault_daemon(jdata, proc_name, state); + } else { + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, 0); + errmgr_autor_process_fault_app(jdata, proc_name, state); + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + orte_node_t *node = NULL; + bool found = false; + int num_removed = 0, num_to_remove; + orte_ns_cmp_bitmask_t mask; + + if( NULL == current_global_jobdata ) { + return ORTE_SUCCESS; + } + + /* JJH Nasty Hack */ + num_to_remove = current_global_jobdata->num_procs / 2; + num_to_remove += 1; + + /* + * Find this process in the known failures list + */ + found = false; + if( mca_errmgr_hnpresil_component.autor_skip_oldnode ) { + for(item = opal_list_get_first(procs_pending_recovery); + item != opal_list_get_end(procs_pending_recovery); + item = opal_list_get_next(item) ) { + wp_item = (errmgr_autor_wp_item_t*)item; + + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { + found = true; + break; + } + } + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): suggest_map() " + "Process remapping: %s oldnode %s, %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), + oldnode->name, + (found ? "Failed Proc." 
: "Good Proc.") )); + + /* + * If not a failed process, then return it to the oldnode + * If failed process, do not place it back on the same node + */ + num_removed = 0; + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + if( found ) { + if( num_removed >= num_to_remove ) { + break; + } + /* JJH Nasty Hack */ +#if 0 + /* Remove oldnode (if more than one node) */ + if( node == oldnode && 1 < opal_list_get_size(node_list) ) { + opal_output(0, "JJH Remove Node (%s)", node->name); + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } +#else + if( 1 < opal_list_get_size(node_list) ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } +#endif + num_removed++; + } else { + /* Stay on same node */ + if( node != oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + + +/***************** + * Local Functions + *****************/ +static void errmgr_autor_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + errmgr_autor_wp_item_t *wp_item = NULL; + struct timeval soon; + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault() " + "Process fault! proc %s (0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + state)); + + if( !orte_sstore_base_is_checkpoint_available ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault() " + "No checkpoints are available for this job! 
Cannot Automaticly Recover!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); + opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true, + ORTE_NAME_PRINT(proc), proc->vpid); + return; + } + + mca_errmgr_hnpresil_component.ignore_current_update = true; + + /* + * If we are already in the shutdown stage of the recovery, then just skip it + */ + if( autor_mask_faults ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):process_fault() " + "Currently recovering the job. Failure masked!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + + /* + * Append this process to the list to process + */ + wp_item = OBJ_NEW(errmgr_autor_wp_item_t); + wp_item->name.jobid = proc->jobid; + wp_item->name.vpid = proc->vpid; + wp_item->name.epoch = proc->epoch; + wp_item->state = state; + + opal_list_append(procs_pending_recovery, &(wp_item->super)); + + /* + * Activate the timer, if it is not already setup + */ + if( !autor_timer_active ) { + autor_timer_active = true; + + opal_event_evtimer_set(opal_event_base, autor_timer_event, errmgr_autor_recover_processes, NULL); + soon.tv_sec = mca_errmgr_hnpresil_component.autor_recovery_delay; + soon.tv_usec = 0; + opal_event_evtimer_add(autor_timer_event, &soon); + } + + return; +} + +static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + orte_proc_t *loc_proc = NULL, *child_proc = NULL; + orte_std_cntr_t i_proc; + int32_t i; + + OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault_daemon() " + "------- Daemon fault reported! 
proc %s (0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + state)); + + /* + * Set the process state in the job data structure + */ + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + + if( loc_proc->name.vpid != proc->vpid) { + continue; + } + + loc_proc->state = state; + + break; + } + + /* + * Remove the route to this process + */ + orte_routed.delete_route(proc); + + /* + * If the aborted daemon had active processes on its node, then we should + * make sure to signal that all the children are gone. + */ + if( loc_proc->node->num_procs > 0 ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s errmgr:base: stabalize_runtime() " + "------- Daemon lost with the following processes", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) { + child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc); + if( NULL == child_proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s errmgr:base: stabalize_runtime() " + "\t %s [0x%x]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child_proc->name), + child_proc->state)); + + if( child_proc->last_errmgr_state < child_proc->state ) { + child_proc->last_errmgr_state = child_proc->state; + orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED, + &(child_proc->name), ORTE_PROC_STATE_COMM_FAILED, + 0, 1); + } + } + } else { + /* This daemon had no children, so just mask the failure */ + mca_errmgr_hnpresil_component.ignore_current_update = true; + } + + /* + * Record the dead daemon + */ + orte_errmgr_hnpresil_record_dead_process(proc); + + return; +} + +void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) +{ + wp->name.jobid = ORTE_JOBID_INVALID; + wp->name.vpid = ORTE_VPID_INVALID; + wp->name.epoch = ORTE_EPOCH_MIN; + + 
wp->state = 0; +} + +void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) +{ + wp->name.jobid = ORTE_JOBID_INVALID; + wp->name.vpid = ORTE_VPID_INVALID; + wp->name.epoch = ORTE_EPOCH_INVALID; + + wp->state = 0; +} + +static int display_procs(void ) +{ + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + char *proc_str = NULL; + char *tmp_str = NULL; + + for(item = opal_list_get_first(procs_pending_recovery); + item != opal_list_get_end(procs_pending_recovery); + item = opal_list_get_next(item) ) { + wp_item = (errmgr_autor_wp_item_t*)item; + + if( NULL == proc_str ) { + asprintf(&proc_str, "\t%s Rank %d\n", + ORTE_NAME_PRINT(&(wp_item->name)), + (int)wp_item->name.vpid); + } else { + tmp_str = strdup(proc_str); + free(proc_str); + proc_str = NULL; + asprintf(&proc_str, "%s\t%s Rank %d\n", + tmp_str, + ORTE_NAME_PRINT(&(wp_item->name)), + (int)wp_item->name.vpid); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true, + proc_str); + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + if( NULL != proc_str ) { + free(proc_str); + proc_str = NULL; + } + + return ORTE_SUCCESS; +} + +static int autor_procs_sort_compare_fn(opal_list_item_t **a, + opal_list_item_t **b) +{ + errmgr_autor_wp_item_t *wp_a, *wp_b; + + wp_a = (errmgr_autor_wp_item_t*)(*a); + wp_b = (errmgr_autor_wp_item_t*)(*b); + + if( wp_a->name.vpid > wp_b->name.vpid ) { + return 1; + } + else if( wp_a->name.vpid == wp_b->name.vpid ) { + return 0; + } + else { + return -1; + } +} + +static void errmgr_autor_recover_processes(int fd, short event, void *cbdata) +{ + int ret, exit_status = ORTE_SUCCESS; + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + orte_sstore_base_global_snapshot_info_t *snapshot = NULL; + char * tmp_str = NULL; + + autor_mask_faults = true; + ERRMGR_AUTOR_CLEAR_TIMERS(); + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START); + + /* 
+ * Display the processes that are to be recovered + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Display known failed processes in the job %s -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + + opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn); + display_procs(); + + /* + * Find the latest checkpoint + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Find the latest checkpoint for the job %s -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + + snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); + if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP); + + /* + * Safely terminate the entire job + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Safely terminate the job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) { + proc->state = ORTE_PROC_STATE_MIGRATING; + } + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + orte_iof.close(&(proc->name), ORTE_IOF_STDIN); + } + } + + orte_plm.terminate_procs(current_global_jobdata->procs); + + /* + * Wait for the job to terminate all processes + */ + while(!check_if_terminated(current_global_jobdata->procs) ) { + opal_progress(); + } + + 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM); + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Done waiting for termination of job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + current_global_jobdata->num_terminated = current_global_jobdata->num_procs; + orte_plm_base_reset_job(current_global_jobdata); + + /* + * Construct the app contexts to restart + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Rebuild job %s app context -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, + proc, + &(snapshot->local_snapshots))) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tAdjusted: \"%s\" [0x%d] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP); + + /* + * Spawn the restarted job + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Respawning the job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + orte_snapc_base_has_recovered = false; + autor_mask_faults = false; /* Failures pass this point are worth noting */ + orte_plm.spawn(current_global_jobdata); + + /* + * Wait for all the processes to restart + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Waiting for restart 
-------"); + while(!check_if_restarted(current_global_jobdata->procs) ) { + opal_progress(); + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART); + + /* + * All done + */ + while( !orte_snapc_base_has_recovered ) { + opal_progress(); + } + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Finished recovering job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true); + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH); + + cleanup: + while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) { + wp_item = (errmgr_autor_wp_item_t*)item; + OBJ_RELEASE(wp_item); + } + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + ERRMGR_AUTOR_DISPLAY_ALL_TIMERS(); + + autor_timer_active = false; + autor_mask_faults = false; + + return; +} + +static int check_if_terminated(opal_pointer_array_t *procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + if( NULL == procs ){ + return true; + } + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->state < ORTE_PROC_STATE_UNTERMINATED || + proc->state == ORTE_PROC_STATE_MIGRATING ) { + is_done = false; + break; + } + } + + if( !is_done ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED)); + } + + return is_done; +} + +static int check_if_restarted(opal_pointer_array_t *procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + if( NULL == procs ){ + return true; + } + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { + proc 
= (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { + is_done = false; + break; + } + } + + if( !is_done ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); + } + + return is_done; +} + +/************************ + * Timing + ************************/ +static void errmgr_autor_set_time(int idx) +{ + if(idx < ERRMGR_AUTOR_TIMER_MAX ) { + if( timer_start[idx] <= 0.0 ) { + timer_start[idx] = errmgr_autor_get_time(); + } + } +} + +static void errmgr_autor_display_all_timers(void) +{ + double diff = 0.0; + char * label = NULL; + + opal_output(0, "Auto. Recovery Timing: ******************** Summary Begin\n"); + + /********** Structure Setup **********/ + label = strdup("Setup"); + diff = timer_start[ERRMGR_AUTOR_TIMER_SETUP] - timer_start[ERRMGR_AUTOR_TIMER_START]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Termination **********/ + label = strdup("Terminate"); + diff = timer_start[ERRMGR_AUTOR_TIMER_TERM] - timer_start[ERRMGR_AUTOR_TIMER_SETUP]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Setup new job **********/ + label = strdup("Setup Relaunch"); + diff = timer_start[ERRMGR_AUTOR_TIMER_RESETUP] - timer_start[ERRMGR_AUTOR_TIMER_TERM]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Restart **********/ + label = strdup("Restart"); + diff = timer_start[ERRMGR_AUTOR_TIMER_RESTART] - timer_start[ERRMGR_AUTOR_TIMER_RESETUP]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Finish **********/ + label = strdup("Finalize"); + diff = timer_start[ERRMGR_AUTOR_TIMER_FINISH] - timer_start[ERRMGR_AUTOR_TIMER_RESTART]; + errmgr_autor_display_indv_timer_core(diff, label); + 
free(label); + + opal_output(0, "Auto. Recovery Timing: ******************** Summary End\n"); +} + +static void errmgr_autor_clear_timers(void) +{ + int i; + for(i = 0; i < ERRMGR_AUTOR_TIMER_MAX; ++i) { + timer_start[i] = 0.0; + } +} + +static double errmgr_autor_get_time(void) +{ + double wtime; + +#if OPAL_TIMER_USEC_NATIVE + wtime = (double)opal_timer_base_get_usec() / 1000000.0; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wtime = tv.tv_sec; + wtime += (double)tv.tv_usec / 1000000.0; +#endif + + return wtime; +} + +static void errmgr_autor_display_indv_timer_core(double diff, char *str) +{ + double total = 0; + double perc = 0; + + total = timer_start[ERRMGR_AUTOR_TIMER_MAX-1] - timer_start[ERRMGR_AUTOR_TIMER_START]; + perc = (diff/total) * 100; + + opal_output(0, + "errmgr_autor: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", + str, + diff, + total, + perc); + return; +} + +#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c new file mode 100644 index 0000000000..d96654d739 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
/*
 * Instantiate the public struct with all of our public information
 * and pointer to our public functions in it.
 *
 * Only the embedded base component ("super") is initialized explicitly;
 * the remaining members of the structure are left to C's implicit zero
 * initialization.
 */
orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = {
    /* First do the base component stuff */
    {
        /* Handle the general mca_component_t struct containing
         * meta information about the hnpresil component
         */
        {
            ORTE_ERRMGR_BASE_VERSION_3_0_0,
            /* Component name and version */
            "hnpresil",
            ORTE_MAJOR_VERSION,
            ORTE_MINOR_VERSION,
            ORTE_RELEASE_VERSION,

            /* Component open, close and query functions */
            orte_errmgr_hnpresil_open,
            orte_errmgr_hnpresil_close,
            orte_errmgr_hnpresil_component_query
        },
        {
            /* The component is checkpoint ready */
            MCA_BASE_METADATA_PARAM_CHECKPOINT
        },

        /* Verbosity level (also the default of the "verbose" MCA parameter) */
        0,
        /* opal_output handler (assigned when the component is opened) */
        -1,
        /* Default priority (also the default of the "priority" MCA parameter) */
        0
    }
};
+ */ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "priority", + "Priority of the ERRMGR hnp component", + false, false, + mca_errmgr_hnpresil_component.super.priority, + &mca_errmgr_hnpresil_component.super.priority); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "verbose", + "Verbose level for the ERRMGR hnp component", + false, false, + mca_errmgr_hnpresil_component.super.verbose, + &mca_errmgr_hnpresil_component.super.verbose); + /* If there is a custom verbose level for this component than use it + * otherwise take our parents level and output channel + */ + if ( 0 != mca_errmgr_hnpresil_component.super.verbose) { + mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL); + opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle, + mca_errmgr_hnpresil_component.super.verbose); + } else { + mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output; + } + +#if OPAL_ENABLE_FT_CR + /**************************** + * CRMig (C/R Process Migration) MCA Options + ****************************/ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "crmig_timing", + "Enable Process Migration timer", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "crmig_enable", + "Enable Process Migration (Default: 0/off)", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val); + + /**************************** + * AutoR (Automatic Recovery) MCA Options + ****************************/ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_timing", + "Enable Automatic Recovery timer", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val); + + 
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_enable", + "Enable Automatic Recovery (Default: 0/off)", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_recovery_delay", + "Number of seconds to wait before starting to recover the job after a failure" + " [Default: 1 sec]", + false, false, + 1, &val); + mca_errmgr_hnpresil_component.autor_recovery_delay = val; + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_skip_oldnode", + "Skip the old node from failed proc, even if it is still available" + " [Default: Enabled]", + false, false, + 1, &val); + mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val); +#else + val = 0; /* Silence compiler warning */ +#endif /* OPAL_ENABLE_FT_CR */ + + /* + * Debug Output + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open()"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: priority = %d", + mca_errmgr_hnpresil_component.super.priority); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: verbosity = %d", + mca_errmgr_hnpresil_component.super.verbose); +#if OPAL_ENABLE_FT_CR + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: --- CR Migration Options ---"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: Process Migration = %s", + (mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: timing = %s", + (mca_errmgr_hnpresil_component.crmig_timing_enabled ? 
"Enabled" : "Disabled")); + + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: --- Auto. Recovery Options ---"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: Auto. Recover = %s", + (mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: timing = %s", + (mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: recover_delay = %d", + mca_errmgr_hnpresil_component.autor_recovery_delay); + + mca_errmgr_hnpresil_component.crmig_in_progress = false; + mca_errmgr_hnpresil_component.autor_in_progress = false; + mca_errmgr_hnpresil_component.term_in_progress = false; +#endif /* OPAL_ENABLE_FT_CR */ + + return ORTE_SUCCESS; +} + +static int orte_errmgr_hnpresil_close(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: close()"); + + return ORTE_SUCCESS; +} diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c new file mode 100644 index 0000000000..e56c451649 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c @@ -0,0 +1,1517 @@ +/* + * Copyright (c) 2009-2010 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/show_help.h" +#include "opal/util/output.h" +#include "opal/util/opal_environ.h" +#include "opal/util/basename.h" +#include "opal/util/argv.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/crs/crs.h" +#include "opal/mca/crs/base/base.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" +#include "opal/dss/dss.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/filem/filem.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/snapc/snapc.h" +#include "orte/mca/snapc/base/base.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +#include MCA_timer_IMPLEMENTATION_HEADER + +#if OPAL_ENABLE_FT_CR + +/************************************ + * Locally Global vars & functions :) + ************************************/ +static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; +static orte_job_t *current_global_jobdata = NULL; + +static bool migrating_underway = false; +static bool migrating_terminated = false; +static bool migrating_restarted = false; + +static opal_list_t *current_onto_mapping_general = NULL; +static opal_list_t *current_onto_mapping_exclusive = NULL; + +/*** Command Line Interactions */ +static int current_migration_status = 
ORTE_ERRMGR_MIGRATE_STATE_NONE; + +static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map); + +static int orte_errmgr_hnpresil_crmig_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state); +static void errmgr_crmig_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); +static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); + +static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs); +static int check_if_terminated(opal_pointer_array_t *migrating_procs); +static int check_if_restarted(opal_pointer_array_t *migrating_procs); + +static int check_and_pre_map(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum); + +static void display_request(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum); + +/* + * Timer stuff + */ +static void errmgr_crmig_set_time(int idx); +static void errmgr_crmig_display_all_timers(void); +static void errmgr_crmig_clear_timers(void); + +static double errmgr_crmig_get_time(void); +static void errmgr_crmig_display_indv_timer_core(double diff, char *str); +static double timer_start[OPAL_CR_TIMER_MAX]; + +#define ERRMGR_CRMIG_TIMER_START 0 +#define ERRMGR_CRMIG_TIMER_SETUP 1 +#define ERRMGR_CRMIG_TIMER_CKPT 2 +#define ERRMGR_CRMIG_TIMER_TERM 3 +#define ERRMGR_CRMIG_TIMER_RESETUP 4 +#define ERRMGR_CRMIG_TIMER_RESTART 5 +#define ERRMGR_CRMIG_TIMER_FINISH 6 +#define ERRMGR_CRMIG_TIMER_MAX 7 + +#define ERRMGR_CRMIG_CLEAR_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_clear_timers(); \ + } \ + } + +#define ERRMGR_CRMIG_SET_TIMER(idx) \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_set_time(idx); \ + } \ + } + +#define 
ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_display_all_timers(); \ + } \ + } + +/************************ + * Function Definitions: Global + ************************/ +int orte_errmgr_hnpresil_crmig_global_module_init(void) +{ + int ret; + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig): init()"); + + migrating_underway = false; + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + /* + * Initialize the connection to the orte-migrate tool + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + + ERRMGR_CRMIG_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_module_finalize(void) +{ + int ret; + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig): finalize()"); + + /* + * Finalize the connection to the orte-migrate tool + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + + migrating_underway = false; + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + ERRMGR_CRMIG_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ + int ret, exit_status = ORTE_SUCCESS; + orte_job_t *jdata = NULL; + int i; + + /* + * JJH: RETURN HERE + * If we are already migrating, then reject this request + */ + if( migrating_underway ) { + ; + } + + /* + * Determine the jobid for this migration + * JJH: Assumes only one job active at any one time + */ + for(i = 0; i < orte_job_data->size; ++i ) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { + continue; + } + /* Exclude outselves */ + if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) 
{ + continue; + } + current_global_jobdata = jdata; + current_global_jobid = jdata->jobid; + break; + } + if( NULL == current_global_jobdata ) { + opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job."); + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_REQUEST; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /************************* + * Kick off the migration + *************************/ + if( ORTE_SUCCESS != (ret = errmgr_crmig_global_migrate(proc_list, node_list, suggested_map)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /************************ + * Set up the Command Line listener again + *************************/ + if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); + } + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; + + cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_job_t *jdata = NULL; + int ret = ORTE_SUCCESS; + + /* + * if orte is trying to shutdown, just let it + */ + if( mca_errmgr_hnpresil_component.term_in_progress ) { + return ORTE_SUCCESS; + } + + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = 
orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); + ret = ORTE_ERROR; + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * If this is a tool, ignore + */ + if( jdata->num_apps == 0 && + OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(crmig): An external tool disconnected. Ignore...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(crmig): job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state), exit_code)); + + if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || + ORTE_PROC_STATE_COMM_FAILED == state ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_process_fault(jdata, proc_name, state)) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { + if( migrating_underway ) { + /* If we are migrating, then we need to mask this to prevent the lower level from terminating us */ + mca_errmgr_hnpresil_component.ignore_current_update = true; + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, exit_code); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + int exit_status = ORTE_SUCCESS; + opal_list_item_t *item = NULL, *m_item = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL, *current_proc_map = NULL; + orte_node_t *node = NULL; + bool found = false; + int 
num_suggested = 0; + orte_std_cntr_t i_proc; + orte_proc_t *peer_proc = NULL; + + /* + * If not migrating, then suggest nothing + */ + if( !migrating_underway ) { + return ORTE_SUCCESS; + } + + /* + * First look for an exclusive mapping for this process + */ + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + if( onto_map->proc_name.vpid == proc->name.vpid ) { + current_proc_map = onto_map; + break; + } + } + + /* + * If there is an exclusive mapping then... + */ + if( NULL != current_proc_map ) { + /* + * If we made an exclusive mapping during the check_and_pre_map() + * then honor it here. + */ + if( NULL != current_proc_map->pre_map_fixed_node ) { + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all other nodes */ + found = false; + + if( 0 == strncmp(node->name, current_proc_map->pre_map_fixed_node, + strlen(current_proc_map->pre_map_fixed_node)) ) { + found = true; + break; + } + if( !found ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name, + current_proc_map->pre_map_fixed_node, node->name)); + } + } + + /* All done with mapping */ + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * If 'off_current_node' then exclude current node + */ + if( current_proc_map->off_current_node ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name)); + for( item = 
opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude the old node */ + if( node == oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + break; + } + } + } + + /* + * If 'map_proc_name' then map to the node where this process resides + * Note: Only do this if there was no 'other' node suggested. If there + * was an 'other' node suggested then we need to honor that before + * we honor the peer suggestion. + */ + if( ORTE_VPID_INVALID != current_proc_map->map_proc_name.vpid && + current_proc_map->proc_name.vpid != current_proc_map->map_proc_name.vpid && + NULL == current_proc_map->map_node_name ) { + /* + * Find the node containting the target process + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + peer_proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == peer_proc ) { + continue; + } + if( peer_proc->name.vpid == current_proc_map->map_proc_name.vpid ) { + current_proc_map->map_node_name = strdup(peer_proc->node->name); + break; + } + } + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------", + ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name), + oldnode->name, current_proc_map->map_node_name)); + } + + /* + * If 'map_node_name' then use this node exclusively + */ + if( NULL != current_proc_map->map_node_name ) { + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all nodes not in the include list */ + found = false; + + if( 0 == strncmp(node->name, current_proc_map->map_node_name, strlen(current_proc_map->map_node_name)) ) { + found = true; + } + if( !found ) { + 
opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name, + current_proc_map->map_node_name, node->name)); + } + } + + /* All done with mapping */ + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * Otherwise then map as if there was no exclusive mapping + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + } + /* + * If no exclusive mapping (or exclusive did not yield any results) then... + */ + else { + /* + * Remove the old node from the list, if there are more than 1 nodes available + */ + if(1 < opal_list_get_size(node_list) ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name)); + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude the old node */ + if( node == oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + break; + } + } + } + } + + /* + * If we do not have any general suggestions, then just return + */ + if( opal_list_get_size(current_onto_mapping_general) <= 0 ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * Otherwise look through the general suggestions as an include list + 
*/ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + + num_suggested = 0; + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all nodes not in the include list */ + found = false; + + for(m_item = opal_list_get_first(current_onto_mapping_general); + m_item != opal_list_get_end(current_onto_mapping_general); + m_item = opal_list_get_next(m_item) ) { + onto_map = (orte_errmgr_predicted_map_t*) m_item; + + if( 0 == strncmp(node->name, onto_map->map_node_name, strlen(onto_map->map_node_name)) ) { + found = true; + break; + } + } + if( !found ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } + + ++num_suggested; + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------", + num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name)); + } + + cleanup: + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------", + (int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + + return exit_status; +} + +int orte_errmgr_hnpresil_crmig_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + + +/************************ + * Function Definitions: Static + ************************/ +static int orte_errmgr_hnpresil_crmig_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state) +{ + /* + * JJH: Todo + * The expected logic here is: + * if( a daemon with children fails ) { + * abort migration. 
+ * } + * if( a daemon without children fails ) { + * continue. No processes lost + * } + * if( an application process fails ) { + * abort migration. Might be a bad checkpoint, or a process that we were + * not migrating that died. + * } + * else { + * continue; + * } + */ + if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { + errmgr_crmig_process_fault_daemon(jdata, proc_name, state); + } else { + errmgr_crmig_process_fault_app(jdata, proc_name, state); + } + + return ORTE_SUCCESS; +} + +static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps) +{ + int ret, exit_status = ORTE_SUCCESS; + orte_std_cntr_t i_node; + orte_std_cntr_t i_proc; + orte_node_t *node = NULL; + orte_proc_t *proc = NULL; + bool found = false; + orte_snapc_base_quiesce_t *cur_datum = NULL; + bool close_iof_stdin = false; + orte_process_name_t iof_name = {ORTE_JOBID_INVALID, 0}; + char * err_str_procs = NULL; + char * err_str_nodes = NULL; + char * tmp_str = NULL; + orte_errmgr_predicted_proc_t *off_proc = NULL; + orte_errmgr_predicted_node_t *off_node = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL; + opal_list_item_t *item = NULL; + + ERRMGR_CRMIG_CLEAR_TIMERS(); + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START); + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------", + (int)opal_list_get_size(off_procs), + (int)opal_list_get_size(off_nodes), + (int)opal_list_get_size(onto_maps))); + + /* + * Modeled after orte_plm_base_reset_job + */ + cur_datum = OBJ_NEW(orte_snapc_base_quiesce_t); + cur_datum->migrating = true; + migrating_underway = true; + mca_errmgr_hnpresil_component.crmig_in_progress = true; + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUNNING; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Check 
to make sure that the 'off' and 'onto' nodes exist + * - if 'onto' nodes do not, then add them (JJH XXX) + * - if 'off' nodes do not, then return an error (JJH XXX) + * JJH TODO... + */ + + /* + * Copy over the onto_nodes so we can suggest them later + */ + if( NULL != current_onto_mapping_general ) { + OBJ_RELEASE(current_onto_mapping_general); + current_onto_mapping_general = NULL; + } + if( NULL != current_onto_mapping_exclusive ) { + OBJ_RELEASE(current_onto_mapping_exclusive); + current_onto_mapping_exclusive = NULL; + } + current_onto_mapping_general = OBJ_NEW(opal_list_t); + current_onto_mapping_exclusive = OBJ_NEW(opal_list_t); + if( NULL != onto_maps ) { + while( NULL != (item = opal_list_remove_first(onto_maps)) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + /* Determine if process exclude mapping, or general */ + if( onto_map->proc_name.vpid == ORTE_VPID_INVALID ) { + opal_list_append(current_onto_mapping_general, item); + } else { + opal_list_append(current_onto_mapping_exclusive, item); + } + } + } + + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + /* + * Find the node currently containing this process + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == onto_map->proc_name.vpid) { + found = true; + break; + } + } + + /* + * Check to see if this process hsould be skipped + */ + if( !onto_map->off_current_node && + (ORTE_VPID_INVALID == onto_map->map_proc_name.vpid || + onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) && + (NULL == onto_map->map_node_name || + 0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) { + 
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------", + ORTE_NAME_PRINT(&proc->name))); + + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------", + ORTE_NAME_PRINT(&proc->name))); + /* + * Set the process to restarting + */ + proc->state = ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + + migrating_terminated = false; + migrating_restarted = false; + + /* + * Create a list of processes to migrate, if 'off_nodes' specified + */ + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + + /* + * Find the node in the job structure + * - Make sure that 'odin00' doesn't match all 'odin00*' + */ + found = false; + for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { + node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); + if( NULL == node ) { + continue; + } + + if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { + found = true; + break; + } + } + if( !found ) { + ; /* Warn about invalid node */ + } else { + /* + * Add all processes from this node + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); + if( NULL == proc ) { + continue; + } + + /* + * Set the process to restarting + */ + proc->state = 
ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + } + + /* + * Create a list of processes to migrate, if 'off_procs' specified + */ + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + + /* + * Find the process in the job structure + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == off_proc->proc_name.vpid) { + found = true; + break; + } + } + /* + * Make sure the process is not listed multiple times + */ + if( found ) { + found = check_if_duplicate_proc(proc, &(cur_datum->migrating_procs)); + if( !found ) { + /* + * Set the process to restarting + */ + proc->state = ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + } + + /* + * If we did not find any processes to migrate, then throw a warning, and skip it. 
+ */ + if( 0 >= cur_datum->num_migrating ) { + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + if( NULL != err_str_nodes ) { + asprintf(&tmp_str, "%s, %s", err_str_nodes, off_node->node_name); + free(err_str_nodes); + err_str_nodes = strdup(tmp_str); + free(tmp_str); + tmp_str = NULL; + } else { + asprintf(&err_str_nodes, "%s", off_node->node_name); + } + } + + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + if( NULL != err_str_procs ) { + asprintf(&tmp_str, "%s, %d", err_str_procs, (int)off_proc->proc_name.vpid); + free(err_str_procs); + err_str_procs = strdup(tmp_str); + free(tmp_str); + tmp_str = NULL; + } else { + asprintf(&err_str_procs, "%d", off_proc->proc_name.vpid); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true, + err_str_nodes, + err_str_procs); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + goto cleanup; + } + + /* + * Final pass on the migration list to pre-map processes and remove + * processes that should not be migrated. + */ + if( ORTE_SUCCESS != (ret = check_and_pre_map(off_procs, off_nodes, cur_datum)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Display the request before processing it. 
+ */ + display_request(off_procs, off_nodes, cur_datum); + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_SETUP); + + /* + * Checkpoint the job + * - Hold all non-migrating processes + * - Abort the marked processes + * - + */ + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) { + opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint."); + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_CKPT); + + /* + * Terminate the migrating processes + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + orte_plm.terminate_procs(&cur_datum->migrating_procs); + + /* + * Clear the IOF stdin target if necessary + */ + if( close_iof_stdin ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid), + ORTE_NAME_PRINT(&iof_name) )); + + orte_iof.close(&iof_name, ORTE_IOF_STDIN); + } + + /* + * Wait for the processes to finish terminating + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Waiting for termination -------"); + + while( !migrating_terminated ) { + opal_progress(); + check_if_terminated(&(cur_datum->migrating_procs)); + } + + 
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_TERM); + + /* + * Start remapping the processes + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Reset the job parameters for restart + * This will set the state of the job to 'restart' + */ + orte_plm_base_reset_job(current_global_jobdata); + + /* + * Adjust the application context information + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); + if( NULL == proc ) { + continue; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, + proc, + &(cur_datum->ss_snapshot->local_snapshots))) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tAdjusted: \"%s\" [0x%d] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESETUP); + + /* + * Restart the job + * - spawn function will remap and launch the replacement proc(s) + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + orte_plm.spawn(current_global_jobdata); + + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Waiting for restart -------"); + + migrating_restarted = false; + 
while( !migrating_restarted ) { + opal_progress(); + check_if_restarted(&(cur_datum->migrating_procs)); + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESTART); + + /* + * Finish the checkpoint + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) { + opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint."); + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * All done + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + OBJ_RELEASE(cur_datum); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_FINISH); + ERRMGR_CRMIG_DISPLAY_ALL_TIMERS(); + + cleanup: + migrating_underway = false; + migrating_terminated = false; + migrating_restarted = false; + mca_errmgr_hnpresil_component.crmig_in_progress = false; + + if( NULL != err_str_procs ) { + free(err_str_procs); + err_str_procs = NULL; + } + + if( NULL != err_str_nodes ) { + free(err_str_nodes); + err_str_nodes = NULL; + } + + return exit_status; +} + +static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *loc_proc = NULL; + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + loc_proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == loc_proc ) { + continue; + } + if( loc_proc->name.vpid == proc->name.vpid ) { + 
return true; + } + } + + return false; +} + +static int check_if_terminated(opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( !(ORTE_PROC_STATE_KILLED_BY_CMD & proc->state) ) { + is_done = false; + break; + } + } + + if( is_done ) { + migrating_terminated = true; + } + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD)); + } + + return ORTE_SUCCESS; +} + +static int check_if_restarted(opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == proc ) { + continue; + } + + /* proc->state != ORTE_PROC_STATE_LAUNCHED */ + if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { + is_done = false; + break; + } + } + + if( is_done ) { + migrating_restarted = true; + } + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); + } + + return ORTE_SUCCESS; +} + +static void errmgr_crmig_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_app() " + "------- Application fault reported! proc %s (0x%x) " + "- %s", + ORTE_NAME_PRINT(proc), + state, + (migrating_underway ? 
"Migrating" : "Not Migrating") )); + + return; +} + +static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_daemon() " + "------- Daemon fault reported! proc %s (0x%x) " + "- %s", + ORTE_NAME_PRINT(proc), + state, + (migrating_underway ? "Migrating" : "Not Migrating") )); + + /* + * Failed communication can be ignored for the most part. + * Make sure to remove the route + * JJH: Check to make sure this is not a new daemon loss. + */ + if( ORTE_PROC_STATE_COMM_FAILED == state ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_daemon() " + "------- Daemon fault reported! proc %s (0x%x) " + "- Communication failure, keep going", + ORTE_NAME_PRINT(proc), + state )); + } + + return; +} + +static int check_and_pre_map(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum) +{ + /* + * Check the 'off_procs' list for processes that should not be migrated + */ + + /* + * Check the 'current_onto_mapping_exclusive' for processes that are moving + * 'near/with' other processes that are also moving. Be sure to watch out + * for circular deadlock. + */ + + /* + * Use the 'pre_map_fixed_node' structure to fix this process' mapping. 
+ */ + + return ORTE_SUCCESS; +} + +static void display_request(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum) +{ + orte_std_cntr_t i_node; + orte_std_cntr_t i_proc; + orte_node_t *node = NULL; + orte_proc_t *proc = NULL; + bool found = false; + char * status_str = NULL; + char * tmp_str = NULL; + orte_errmgr_predicted_proc_t *off_proc = NULL; + orte_errmgr_predicted_node_t *off_node = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL; + opal_list_item_t *item = NULL; + + /* + * Display all requested processes to migrate + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n", + (int) opal_list_get_size(off_procs) )); + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + + /* + * Find the process in the job structure + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == off_proc->proc_name.vpid) { + found = true; + break; + } + } + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t%s (Rank %3d) on node %s\n", + ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name)); + } + + /* + * Display Off Nodes + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n", + (int)opal_list_get_size(off_nodes) )); + + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + + for(i_node = 0; i_node < 
opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { + node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); + if( NULL == node ) { + continue; + } + + found = false; + if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { + found = true; + break; + } + } + if( found ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\" \t%d\n", + node->name, node->num_procs)); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); + if( NULL == proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\t\"%s\" [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state)); + } + } + } + + /* + * Suggested onto nodes + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n", + (int)opal_list_get_size(current_onto_mapping_general) )); + for(item = opal_list_get_first(current_onto_mapping_general); + item != opal_list_get_end(current_onto_mapping_general); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\"\n", + onto_map->map_node_name)); + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n", + (int)opal_list_get_size(current_onto_mapping_exclusive) )); + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + 
"\t%d\t(%c)\t\"%s\"\n", + onto_map->proc_name.vpid, + (onto_map->off_current_node ? 'T' : 'F'), + onto_map->map_node_name)); + } + + /* + * Display all processes scheduled to migrate + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n", + cur_datum->num_migrating)); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); + if( NULL == proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\" [0x%x] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + + if( NULL == status_str ) { + asprintf(&status_str, "\t%s Rank %d on Node %s\n", + ORTE_NAME_PRINT(&proc->name), + (int)proc->name.vpid, + proc->node->name); + } else { + tmp_str = strdup(status_str); + free(status_str); + status_str = NULL; + asprintf(&status_str, "%s\t%s Rank %d on Node %s\n", + tmp_str, + ORTE_NAME_PRINT(&proc->name), + (int)proc->name.vpid, + proc->node->name); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true, + status_str); + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + if( NULL != status_str ) { + free(status_str); + status_str = NULL; + } + + return; +} + +/************************ + * Timing + ************************/ +static void errmgr_crmig_set_time(int idx) +{ + if(idx < ERRMGR_CRMIG_TIMER_MAX ) { + if( timer_start[idx] <= 0.0 ) { + timer_start[idx] = errmgr_crmig_get_time(); + } + } +} + +static void errmgr_crmig_display_all_timers(void) +{ + double diff = 0.0; + char * label = NULL; + + opal_output(0, "Process Migration Timing: ******************** Summary Begin\n"); + + /********** Structure Setup **********/ + label = strdup("Setup"); + diff = timer_start[ERRMGR_CRMIG_TIMER_SETUP] - 
timer_start[ERRMGR_CRMIG_TIMER_START]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Checkpoint **********/ + label = strdup("Checkpoint"); + diff = timer_start[ERRMGR_CRMIG_TIMER_CKPT] - timer_start[ERRMGR_CRMIG_TIMER_SETUP]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Termination **********/ + label = strdup("Terminate"); + diff = timer_start[ERRMGR_CRMIG_TIMER_TERM] - timer_start[ERRMGR_CRMIG_TIMER_CKPT]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Setup new job **********/ + label = strdup("Setup Relaunch"); + diff = timer_start[ERRMGR_CRMIG_TIMER_RESETUP] - timer_start[ERRMGR_CRMIG_TIMER_TERM]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Restart **********/ + label = strdup("Restart"); + diff = timer_start[ERRMGR_CRMIG_TIMER_RESTART] - timer_start[ERRMGR_CRMIG_TIMER_RESETUP]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Finish **********/ + label = strdup("Finalize"); + diff = timer_start[ERRMGR_CRMIG_TIMER_FINISH] - timer_start[ERRMGR_CRMIG_TIMER_RESTART]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + opal_output(0, "Process Migration Timing: ******************** Summary End\n"); +} + +static void errmgr_crmig_clear_timers(void) +{ + int i; + for(i = 0; i < ERRMGR_CRMIG_TIMER_MAX; ++i) { + timer_start[i] = 0.0; + } +} + +static double errmgr_crmig_get_time(void) +{ + double wtime; + +#if OPAL_TIMER_USEC_NATIVE + wtime = (double)opal_timer_base_get_usec() / 1000000.0; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wtime = tv.tv_sec; + wtime += (double)tv.tv_usec / 1000000.0; +#endif + + return wtime; +} + +static void errmgr_crmig_display_indv_timer_core(double diff, char *str) +{ + double total = 0; + double perc = 0; + + total = timer_start[ERRMGR_CRMIG_TIMER_MAX-1] - timer_start[ERRMGR_CRMIG_TIMER_START]; + perc = (diff/total) * 
100; + + opal_output(0, + "errmgr_crmig: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", + str, + diff, + total, + perc); + return; +} + +#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt b/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt new file mode 100644 index 0000000000..836e46f4b0 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt @@ -0,0 +1,71 @@ + -*- text -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for ORTE Errmgr HNP module. +# +[errmgr-hnp:unknown-job-error] +An error has occurred in an unknown job. This generally should not happen +except due to an internal ORTE error. + +Job state: %s + +This information should probably be reported to the OMPI developers. +# +[errmgr-hnp:daemon-died] +The system has lost communication with the following daemon: + +Daemon: %s +Node: %s + +The reason for the lost communication channel is unknown. Possible +reasons include failure of the daemon itself, failure of the +connecting fabric/switch, and loss of the host node. Please +check with your system administrator to try and determine the +source of the problem. + +Your job is being terminated as a result. +# +[errmgr-hnp:cannot-relocate] +The system is unable to relocate the specified process: + +Process: %s + +because the application for that process could not be found. This +appears to be a system error. Please report it to the ORTE +developers. + +[autor_recovering_job] +Notice: The processes listed below failed unexpectedly. + Using the last checkpoint to recover the job. + Please standby. +%s +[autor_recovery_complete] +Notice: The job has been successfully recovered from the + last checkpoint. +[autor_failed_to_recover_proc] +Error: The process below has failed. 
There is no checkpoint available for + this job, so we are terminating the application since automatic + recovery cannot occur. +Internal Name: %s +MCW Rank: %d + +[crmig_migrating_job] +Notice: A migration of this job has been requested. + The processes below will be migrated. + Please standby. +%s +[crmig_migrated_job] +Notice: The processes have been successfully migrated to/from the specified + machines. +[crmig_no_migrating_procs] +Warning: Could not find any processes to migrate on the nodes specified. + You provided the following: +Nodes: %s +Procs: %s diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index b43bf56422..5826a2c58c 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -3,9 +3,6 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,11 +29,9 @@ #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/routed/routed.h" #include "orte/mca/sensor/sensor.h" @@ -58,9 +53,8 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); -static int record_dead_process(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); + /* * Module functions: Global @@ -85,11 +79,7 @@ static int suggest_map_targets(orte_proc_t *proc, static int ft_event(int state); -static int post_startup(void); -static int pre_shutdown(void); -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); -static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); /****************** * ORTED module @@ -105,11 +95,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = { suggest_map_targets, ft_event, orte_errmgr_base_register_migration_warning, - post_startup, - pre_shutdown, - mark_processes_as_dead, - orte_errmgr_base_set_fault_callback, /* Set callback function */ - failure_notification + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; /************************ @@ -140,29 +130,20 @@ static int update_state(orte_jobid_t job, int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; orte_app_context_t *app; - orte_ns_cmp_bitmask_t mask; - + /* * if orte 
is trying to shutdown, just let it */ if (orte_finalizing) { return ORTE_SUCCESS; } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:orted:update_state() %s) " - "------- %s state updated for process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc) ? "App. Process" : - (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); - + /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { return ORTE_SUCCESS; } - + /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* this is an update for an entire job */ @@ -199,7 +180,7 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == job) { break; @@ -208,7 +189,7 @@ static int update_state(orte_jobid_t job, if (NULL == jobdat) { return ORTE_ERR_NOT_FOUND; } - + switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jobdat, exit_code); @@ -221,10 +202,10 @@ static int update_state(orte_jobid_t job, /* update all procs in job */ update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* tell the caller we can't recover */ return ORTE_ERR_UNRECOVERABLE; break; @@ -261,16 +242,15 @@ static int update_state(orte_jobid_t job, * lifeline */ if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == 
orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + if (ORTE_PROC_MY_NAME->jobid == proc->jobid && + ORTE_PROC_MY_NAME->vpid == proc->vpid) { return ORTE_SUCCESS; } /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { /* kill our children */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ @@ -281,25 +261,21 @@ static int update_state(orte_jobid_t job, /* was it a daemon that failed? */ if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { + if (0 == orte_routed.num_routes()) { orte_quit(); } } - - record_dead_process(proc); - /* if not, then indicate we can continue */ return ORTE_SUCCESS; } - + /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? 
*/ if (jobdat->jobid == proc->jobid) { break; @@ -309,7 +285,7 @@ static int update_state(orte_jobid_t job, /* must already be complete */ return ORTE_SUCCESS; } - + /* if there are no local procs for this job, we can * ignore this call */ @@ -330,15 +306,15 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; /* Decrement the number of local procs */ jobdat->num_local_procs--; /* kill this proc */ - killprocs(proc->jobid, proc->vpid, proc->epoch); + killprocs(proc->jobid, proc->vpid); } app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { @@ -364,7 +340,7 @@ static int update_state(orte_jobid_t job, /* treat this as normal termination */ goto REPORT_STATE; } - + if (ORTE_PROC_STATE_TERMINATED < state) { if( jobdat->enable_recovery ) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -375,8 +351,8 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { /* see if this child has reached its local restart limit */ app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -403,8 +379,8 @@ static int update_state(orte_jobid_t job, } } } - -REPORT_ABORT: + + REPORT_ABORT: /* if the job hasn't completed and the 
state is abnormally * terminated, then we need to alert the HNP right away */ @@ -427,8 +403,8 @@ REPORT_ABORT: item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; @@ -442,7 +418,7 @@ REPORT_ABORT: opal_list_remove_item(&orte_local_children, &child->super); /* Decrement the number of local procs */ jobdat->num_local_procs--; - + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -464,15 +440,15 @@ REPORT_ABORT: OBJ_DESTRUCT(&alert); return rc; } - - REPORT_STATE: + +REPORT_STATE: /* find this proc in the local children so we can update its state */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; if (0 < pid) { @@ -492,7 +468,7 @@ REPORT_ABORT: * the HNP so it is available to debuggers and anyone * else that needs it */ - + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted: sending contact info to HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -509,7 +485,7 @@ REPORT_ABORT: ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } - /* pack all the local child vpids and epochs */ + /* pack all the local child vpids */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = 
opal_list_get_next(item)) { @@ -546,7 +522,7 @@ REPORT_ABORT: } return rc; } - + /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { /* lookup the local jobdat for this job */ @@ -555,7 +531,7 @@ REPORT_ABORT: item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == proc->jobid) { break; @@ -577,8 +553,8 @@ REPORT_ABORT: if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { ORTE_ERROR_LOG(rc); } - -FINAL_CLEANUP: + + FINAL_CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -592,7 +568,7 @@ FINAL_CLEANUP: item = next) { child = (orte_odls_child_t*)item; next = opal_list_get_next(item); - + if (jobdat->jobid == child->name->jobid) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); @@ -601,11 +577,11 @@ FINAL_CLEANUP: /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jobdat->jobid); - + /* remove this job from our local job data since it is complete */ opal_list_remove_item(&orte_local_jobdata, &jobdat->super); OBJ_RELEASE(jobdat); - + /* send it */ if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { ORTE_ERROR_LOG(rc); @@ -613,7 +589,6 @@ FINAL_CLEANUP: rc = ORTE_SUCCESS; } OBJ_DESTRUCT(&alert); - /* indicate that the job is complete */ return rc; } @@ -639,131 +614,6 @@ int ft_event(int state) return ORTE_SUCCESS; } -int post_startup(void) { - return ORTE_SUCCESS; -} - -int pre_shutdown(void) { - return ORTE_SUCCESS; -} - -int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - opal_list_item_t *item; - orte_odls_child_t *child; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking procs as dead", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(0, "NULL found in dead process list."); - continue; - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item))); - } - - if (name_item->epoch < orte_util_lookup_epoch(name_item)) { - continue; - } - - /* Increment the epoch */ - orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); - orte_util_set_epoch(name_item, name_item->epoch + 1); - - /* Remove the dead process from my list of children if applicable */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t *) item; - - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID, - child->name, name_item)) { - opal_list_remove_item(&orte_local_children, item); - break; - } - } - - /* Remove the route from the routing layer */ - orte_routed.delete_route(name_item); - } - - /* Update the routing module */ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - - return ORTE_SUCCESS; -} - -int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { - opal_pointer_array_t *dead_names; - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - int32_t i; - orte_process_name_t *name_item, proc; - - dead_names = OBJ_NEW(opal_pointer_array_t); - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - for (i = 0; i < num_failed; i++) { - /* Unpack the buffer to get the dead process' name. 
*/ - n = 1; - - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_NAME_PRINT(sender)); - } - - /* There shouldn't be an issue of receiving this message multiple - * times but it doesn't hurt to double check. - */ - if (proc.epoch < orte_util_lookup_epoch(name_item)) { - opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); - continue; - } - - opal_pointer_array_add(dead_names, name_item); - } - - /* Tell the errmgr so it can handle changing the epoch, routes, etc. */ - orte_errmgr.mark_processes_as_dead(dead_names); - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return ret; - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - - return ret; -} - /***************** * Local Functions *****************/ @@ -771,14 +621,14 @@ static bool any_live_children(orte_jobid_t job) { opal_list_item_t *item; orte_odls_child_t *child; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? 
*/ if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && child->alive) { @@ -788,13 +638,13 @@ static bool any_live_children(orte_jobid_t job) /* if we get here, then nobody is left alive from that job */ return false; - + } static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) { int rc; - + /* pack the child's vpid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); @@ -829,70 +679,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) ORTE_ERROR_LOG(rc); return rc; } - + return ORTE_SUCCESS; } -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) -{ - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_vpid_t null=ORTE_VPID_INVALID; - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if (orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) + { + int rc; + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_vpid_t null=ORTE_VPID_INVALID; + + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return rc; } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - /* if this child is part of the job... 
*/ - if (child->name->jobid == jobdat->jobid) { - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + /* if we are timing things, pack the time the launch msg for this job was recvd */ + if (orte_timing) { + int64_t tmp; + tmp = jobdat->launch_msg_recvd.tv_sec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + tmp = jobdat->launch_msg_recvd.tv_usec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { ORTE_ERROR_LOG(rc); return rc; } } + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = (orte_odls_child_t*)item; + next = opal_list_get_next(item); + /* if this child is part of the job... */ + if (child->name->jobid == jobdat->jobid) { + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* flag that this job is complete so the receiver can know */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; } - /* flag that this job is complete so the receiver can know */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} static bool all_children_registered(orte_jobid_t job) { opal_list_item_t *item; orte_odls_child_t *child; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? 
*/ if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { /* if this child has terminated, we consider it as having @@ -918,10 +768,10 @@ static bool all_children_registered(orte_jobid_t job) } } } - + /* if we get here, then everyone in the job is currently registered */ return true; - + } static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) @@ -929,14 +779,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) opal_list_item_t *item; orte_odls_child_t *child; int rc; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? */ if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { /* pack the child's vpid - must be done in case rml_uri is NULL */ @@ -944,11 +794,10 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) ORTE_ERROR_LOG(rc); return rc; } - /* Pack the child's epoch. 
*/ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) { ORTE_ERROR_LOG(rc); return rc; - } + } /* pack the contact info */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -956,19 +805,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) } } } - + return ORTE_SUCCESS; - + } static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) { opal_list_item_t *item; orte_odls_child_t *child; - + /* set the state */ jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { @@ -997,7 +846,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs { opal_list_item_t *item; orte_odls_child_t *child; - + /* update job state */ jobdat->state = jobstate; /* update children */ @@ -1011,29 +860,28 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs } } -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) { opal_pointer_array_t cmd; orte_proc_t proc; int rc; - + /* stop local sensors for this job */ if (ORTE_VPID_WILDCARD == vpid) { orte_sensor.stop(job); } - - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) { + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { ORTE_ERROR_LOG(rc); } return; } - + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - proc.name.epoch = epoch; opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); @@ -1041,85 +889,3 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t 
epoch) OBJ_DESTRUCT(&cmd); OBJ_DESTRUCT(&proc); } - -static int record_dead_process(orte_process_name_t *proc) { - opal_pointer_array_t *dead_name; - opal_buffer_t *buffer; - orte_daemon_cmd_flag_t command; - int rc = ORTE_SUCCESS; - int num_failed; - - if (orte_odls_base_default_check_finished(proc)) { - return rc; - } - - dead_name = OBJ_NEW(opal_pointer_array_t); - - opal_pointer_array_add(dead_name, proc); - - /* Mark the process as dead */ - mark_processes_as_dead(dead_name); - - /* Send a message to the HNP */ - buffer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - num_failed = 1; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - } - - orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0); - - OBJ_RELEASE(buffer); - OBJ_RELEASE(dead_name); - - return rc; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { 
- ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ORTE_SUCCESS; -} - diff --git a/orte/mca/errmgr/ortedresil/.windows b/orte/mca/errmgr/ortedresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/ortedresil/Makefile.am b/orte/mca/errmgr/ortedresil/Makefile.am new file mode 100644 index 0000000000..63c1cf9522 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/Makefile.am @@ -0,0 +1,38 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +dist_pkgdata_DATA = help-orte-errmgr-orted.txt + +sources = \ + errmgr_ortedresil.h \ + errmgr_ortedresil_component.c \ + errmgr_ortedresil.c + +# Make the output library in this directory, and name it either +# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la +# (for static builds).
+ +if MCA_BUILD_orte_errmgr_ortedresil_DSO +component_noinst = +component_install = mca_errmgr_ortedresil.la +else +component_noinst = libmca_errmgr_ortedresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_ortedresil_la_SOURCES = $(sources) +mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_ortedresil_la_SOURCES =$(sources) +libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c new file mode 100644 index 0000000000..a1061eff31 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c @@ -0,0 +1,1126 @@ +/* + * Copyright (c) 2009-2010 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include <sys/types.h> +#ifdef HAVE_UNISTD_H +#include <unistd.h> +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include <string.h> +#endif + +#include "opal/util/output.h" +#include "opal/util/opal_sos.h" +#include "opal/dss/dss.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/util/session_dir.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/odls/odls.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/plm/plm_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/sensor/sensor.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_ortedresil.h" + +/* Local functions */ +static bool any_live_children(orte_jobid_t job); +static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); +static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); +static bool all_children_registered(orte_jobid_t job); +static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); +static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); +static void update_local_children(orte_odls_job_t *jobdat, + orte_job_state_t jobstate, + orte_proc_state_t state); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static int record_dead_process(orte_process_name_t *proc); +static int send_to_local_applications(opal_pointer_array_t *dead_names); + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +static int predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); + +static int update_state(orte_jobid_t job, +
orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +static int suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); + +static int ft_event(int state); + +static int post_startup(void); +static int pre_shutdown(void); + +static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); +static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); + +/****************** + * ORTEDRESIL module + ******************/ +orte_errmgr_base_module_t orte_errmgr_ortedresil_module = { + init, + finalize, + orte_errmgr_base_log, + orte_errmgr_base_abort, + orte_errmgr_base_abort_peers, + update_state, + predicted_fault, + suggest_map_targets, + ft_event, + orte_errmgr_base_register_migration_warning, + post_startup, + pre_shutdown, + mark_processes_as_dead, + orte_errmgr_base_set_fault_callback, /* Set callback function */ + failure_notification +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + return ORTE_SUCCESS; +} + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat = NULL; + orte_odls_child_t *child; + opal_buffer_t alert; + orte_plm_cmd_flag_t cmd; + int rc=ORTE_SUCCESS; + orte_vpid_t null=ORTE_VPID_INVALID; + orte_app_context_t *app; + orte_ns_cmp_bitmask_t mask; + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:ortedresil:update_state() %s) " + "------- %s state updated for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ((NULL == proc) ? "App. Process" : + (proc->jobid == ORTE_PROC_MY_HNP->jobid ? 
"Daemon" : "App. Process")), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); + + /* if this is a heartbeat failure, let the HNP handle it */ + if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || + ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { + return ORTE_SUCCESS; + } + + /*** UPDATE COMMAND FOR A JOB ***/ + if (NULL == proc) { + /* this is an update for an entire job */ + if (ORTE_JOBID_INVALID == job) { + /* whatever happened, we don't know what job + * it happened to + */ + orte_show_help("help-orte-errmgr-orted.txt", "errmgr-orted:unknown-job-error", + true, orte_job_state_to_str(jobstate)); + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the "invalid" jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == job) { + break; + } + } + if (NULL == jobdat) { + return ORTE_ERR_NOT_FOUND; + } + + switch (jobstate) { + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jobdat, exit_code); + break; + case ORTE_JOB_STATE_RUNNING: + /* update all local child states */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + /* order all local procs for this job to be killed */ + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* tell the caller we can't recover */ + return ORTE_ERR_UNRECOVERABLE; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + return ORTE_SUCCESS; + break; + + default: + break; + } + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { + ORTE_ERROR_LOG(rc); + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + /* if this was a failed comm, then see if it was to our + * lifeline + */ + if (ORTE_PROC_STATE_COMM_FAILED == state) { + mask = ORTE_NS_CMP_ALL; + + /* if it is our own connection, ignore it */ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + return ORTE_SUCCESS; + } + /* see if this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + /* kill our children */ + 
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* terminate - our routed children will see + * us leave and automatically die + */ + orte_quit(); + } + /* purge the oob */ + orte_rml.purge(proc); + /* was it a daemon that failed? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { + /* if all my routes are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes() && + 0 == opal_list_get_size(&orte_local_children)) { + orte_quit(); + } + } + + record_dead_process(proc); + + /* if not, then indicate we can continue */ + return ORTE_SUCCESS; + } + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? */ + if (jobdat->jobid == proc->jobid) { + break; + } + } + if (NULL == jobdat) { + /* must already be complete */ + return ORTE_SUCCESS; + } + + /* if there are no local procs for this job, we can + * ignore this call + */ + if (0 == jobdat->num_local_procs) { + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil got state %s for proc %s pid %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_proc_state_to_str(state), + ORTE_NAME_PRINT(proc), pid)); + + /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ + if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + child->exit_code = exit_code; + /* Decrement the number of local procs */ + jobdat->num_local_procs--; + /* kill 
this proc */ + killprocs(proc->jobid, proc->vpid, proc->epoch); + } + app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), child->restarts)); + rc = orte_odls.restart_proc(child); + } + return rc; + } + } + } + + if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { + if (orte_abort_non_zero_exit) { + /* treat this as an abnormal + * termination - no recovery allowed + */ + goto REPORT_ABORT; + } + /* treat this as normal termination */ + goto REPORT_STATE; + } + + if (ORTE_PROC_STATE_TERMINATED < state) { + if( jobdat->enable_recovery ) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s RECOVERY ENABLED", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + /* see if this child has reached its local restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s CHECKING RESTARTS %d VS MAX %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + child->restarts, app->max_restarts)); + if (child->restarts < app->max_restarts ) { + /* attempt to restart it locally */ + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), child->restarts)); + if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { + /* reset the child's 
state as restart_proc would + * have cleared it + */ + child->state = state; + ORTE_ERROR_LOG(rc); + goto REPORT_ABORT; + } + return ORTE_SUCCESS; + } + } + } + } + +REPORT_ABORT: + /* if the job hasn't completed and the state is abnormally + * terminated, then we need to alert the HNP right away + */ + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack only the data for this proc - have to start with the jobid + * so the receiver can unpack it correctly + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + child->exit_code = exit_code; + } + /* now pack the child's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(&alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* remove the child from our local list as it is no longer alive */ + opal_list_remove_item(&orte_local_children, &child->super); + /* Decrement the number of local procs */ + jobdat->num_local_procs--; + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil reporting proc %s aborted to HNP (local procs = %d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), + jobdat->num_local_procs)); + + /* release the child object */ + OBJ_RELEASE(child); + /* done with loop */ + break; + } + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + 
ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + REPORT_STATE: + /* find this proc in the local children so we can update its state */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + if (0 < pid) { + child->pid = pid; + } + child->exit_code = exit_code; + } + /* done with loop */ + break; + } + } + + if (ORTE_PROC_STATE_REGISTERED == state) { + /* see if everyone in this job has registered */ + if (all_children_registered(proc->jobid)) { + /* once everyone registers, send their contact info to + * the HNP so it is available to debuggers and anyone + * else that needs it + */ + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil: sending contact info to HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack init routes command */ + cmd = ORTE_PLM_INIT_ROUTES_CMD; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack all the local child vpids and epochs */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->epoch, 1, ORTE_EPOCH))) { + 
ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + } + } + /* pack an invalid marker */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* add in contact info for all procs in the job */ + if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, &alert))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&alert); + return rc; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + } + return rc; + } + + /* only other state is terminated - see if anyone is left alive */ + if (!any_live_children(proc->jobid)) { + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? */ + if (jobdat->jobid == proc->jobid) { + break; + } + } + if (NULL == jobdat) { + /* race condition - may not have been formed yet */ + return ORTE_SUCCESS; + } + + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the data for the job */ + if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { + ORTE_ERROR_LOG(rc); + } + +FINAL_CLEANUP: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil reporting all procs in %s terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobdat->jobid))); + + /* remove all of this job's children from the global list - do not lock + * the thread as we are already locked + */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = 
(orte_odls_child_t*)item; + next = opal_list_get_next(item); + + if (jobdat->jobid == child->name->jobid) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + } + } + + /* ensure the job's local session directory tree is removed */ + orte_session_dir_cleanup(jobdat->jobid); + + /* remove this job from our local job data since it is complete */ + opal_list_remove_item(&orte_local_jobdata, &jobdat->super); + OBJ_RELEASE(jobdat); + + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + + /* indicate that the job is complete */ + return rc; + } + return ORTE_SUCCESS; +} + +static int predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ + return ORTE_ERR_NOT_IMPLEMENTED; +} + +static int suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + return ORTE_ERR_NOT_IMPLEMENTED; +} + +static int ft_event(int state) +{ + return ORTE_SUCCESS; +} + +static int post_startup(void) { + return ORTE_SUCCESS; +} + +static int pre_shutdown(void) { + return ORTE_SUCCESS; +} + +static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { + int i; + orte_process_name_t *name_item; + opal_list_item_t *item; + orte_odls_child_t *child; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "ORTED %s marking procs as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { + if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { + opal_output(0, "NULL found in dead process list."); + continue; + } + + if (name_item->epoch < orte_util_lookup_epoch(name_item)) { + continue; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "ORTED %s marking %s as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item))); + + /* 
Increment the epoch */
+        orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED);
+        orte_util_set_epoch(name_item, name_item->epoch + 1);
+
+        /* Remove the dead process from my list of children if applicable */
+        for (item = opal_list_get_first(&orte_local_children);
+             item != opal_list_get_end(&orte_local_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_odls_child_t *) item;
+
+            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
+                        child->name, name_item)) {
+                opal_list_remove_item(&orte_local_children, item);
+                OBJ_RELEASE(item);
+                break;
+            }
+        }
+
+        /* Remove the route from the routing layer */
+        orte_routed.delete_route(name_item);
+    }
+
+    /* Update the routing module */
+    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
+
+    if (NULL != fault_cbfunc) {
+        (*fault_cbfunc)(dead_procs);
+    }
+
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Handle a failure notification carried in "buffer".
+ *
+ * The message holds a count followed by that many process names.  A
+ * name whose epoch is older than the locally recorded one is dropped.
+ * The remaining names are passed to mark_processes_as_dead() and then
+ * forwarded with send_to_local_applications().
+ *
+ * sender - only used in the debug message.
+ * buffer - message to unpack; the count is unpacked as ORTE_VPID, which
+ *          mirrors how the senders in this file pack it.
+ *
+ * Returns ORTE_SUCCESS, or the first error encountered.  If a name
+ * cannot be unpacked, nothing is marked dead or forwarded.  Every name
+ * allocated here, and the array holding them, is released on all paths.
+ */
+static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
+    opal_pointer_array_t *dead_names;
+    orte_std_cntr_t n;
+    int ret = ORTE_SUCCESS, num_failed;
+    int32_t i;
+    orte_process_name_t *name_item;
+
+    n = 1;
+    /* Get the number of failed procs */
+    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
+        ORTE_ERROR_LOG(ret);
+        return ret;
+    }
+
+    /* allocate only once we know the message is readable, so the early
+     * return above has nothing to clean up
+     */
+    dead_names = OBJ_NEW(opal_pointer_array_t);
+
+    for (i = 0; i < num_failed; i++) {
+        /* Unpack the buffer to get the dead process' name. */
+        n = 1;
+
+        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
+        if (NULL == name_item) {
+            ret = ORTE_ERR_OUT_OF_RESOURCE;
+            ORTE_ERROR_LOG(ret);
+            goto cleanup;
+        }
+
+        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
+            ORTE_ERROR_LOG(ret);
+            free(name_item);
+            goto cleanup;
+        }
+
+        if (orte_debug_daemons_flag) {
+            opal_output(0, "%s errmgr:ortedresil ORTED received process %s failed from %s",
+                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                    ORTE_NAME_PRINT(name_item),
+                    ORTE_NAME_PRINT(sender));
+        }
+
+        /* There shouldn't be an issue of receiving this message multiple
+         * times but it doesn't hurt to double check.  The comparison must
+         * use the epoch that was just unpacked.
+         */
+        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
+            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
+            free(name_item);
+            continue;
+        }
+
+        /* the array only stores the pointer; on failure we still own it */
+        if (0 > opal_pointer_array_add(dead_names, name_item)) {
+            free(name_item);
+            ret = ORTE_ERR_OUT_OF_RESOURCE;
+            ORTE_ERROR_LOG(ret);
+            goto cleanup;
+        }
+    }
+
+    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
+    mark_processes_as_dead(dead_names);
+
+    /* Tell the applications' ORTE layers that there is a failure. */
+    ret = send_to_local_applications(dead_names);
+
+cleanup:
+    /* walk the array itself rather than num_failed: stale names were
+     * never stored, and unused slots come back as NULL (free(NULL) is
+     * a no-op)
+     */
+    for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) {
+        free(opal_pointer_array_get_item(dead_names, i));
+    }
+    OBJ_RELEASE(dead_names);
+
+    return ret;
+}
+
+/*****************
+ * Local Functions
+ *****************/
+static bool any_live_children(orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+
+    /* the thread is locked elsewhere - don't try to do it again here */
+
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+
+        /* is this child part of the specified job? 
*/
+        if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) &&
+            child->alive) {
+            return true;
+        }
+    }
+
+    /* if we get here, then nobody is left alive from that job */
+    return false;
+
+}
+
+/*
+ * Append one child's status record to "alert": vpid, pid, the launch
+ * time as seconds and microseconds (only when orte_timing is set),
+ * then state and exit code.
+ *
+ * Returns ORTE_SUCCESS, or the code of the first pack call that failed
+ * (the failure is logged before returning).
+ */
+static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child)
+{
+    int status;
+
+    /* the child's vpid */
+    status = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    /* its pid */
+    status = opal_dss.pack(alert, &child->pid, 1, OPAL_PID);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    /* when timing is enabled, the time the proc was launched */
+    if (orte_timing) {
+        int64_t stamp;
+
+        stamp = child->starttime.tv_sec;
+        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
+        if (ORTE_SUCCESS != status) {
+            ORTE_ERROR_LOG(status);
+            return status;
+        }
+
+        stamp = child->starttime.tv_usec;
+        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
+        if (ORTE_SUCCESS != status) {
+            ORTE_ERROR_LOG(status);
+            return status;
+        }
+    }
+
+    /* its state */
+    status = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    /* its exit code */
+    status = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Append a status update for one job to "alert": the jobid, the time
+ * the launch message was received (only when orte_timing is set), one
+ * pack_state_for_proc() record per local child of that job, and a
+ * closing ORTE_VPID_INVALID marker.
+ *
+ * Returns ORTE_SUCCESS, or the code of the first step that failed
+ * (the failure is logged before returning).
+ */
+static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat)
+{
+    int status;
+    opal_list_item_t *cursor;
+    orte_odls_child_t *child;
+    orte_vpid_t end_marker = ORTE_VPID_INVALID;
+
+    /* the jobid comes first */
+    status = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    /* when timing is enabled, the time the launch msg for this job was recvd */
+    if (orte_timing) {
+        int64_t stamp;
+
+        stamp = jobdat->launch_msg_recvd.tv_sec;
+        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
+        if (ORTE_SUCCESS != status) {
+            ORTE_ERROR_LOG(status);
+            return status;
+        }
+
+        stamp = jobdat->launch_msg_recvd.tv_usec;
+        status = opal_dss.pack(alert, &stamp, 1, OPAL_INT64);
+        if (ORTE_SUCCESS != status) {
+            ORTE_ERROR_LOG(status);
+            return status;
+        }
+    }
+
+    /* one record per local child that belongs to this job; the cursor is
+     * advanced before the child is examined
+     */
+    cursor = opal_list_get_first(&orte_local_children);
+    while (cursor != opal_list_get_end(&orte_local_children)) {
+        child = (orte_odls_child_t*)cursor;
+        cursor = opal_list_get_next(cursor);
+
+        if (child->name->jobid != jobdat->jobid) {
+            continue;
+        }
+
+        status = pack_state_for_proc(alert, child);
+        if (ORTE_SUCCESS != status) {
+            ORTE_ERROR_LOG(status);
+            return status;
+        }
+    }
+
+    /* flag that this job is complete so the receiver can know */
+    status = opal_dss.pack(alert, &end_marker, 1, ORTE_VPID);
+    if (ORTE_SUCCESS != status) {
+        ORTE_ERROR_LOG(status);
+        return status;
+    }
+
+    return ORTE_SUCCESS;
+}
+
+static bool all_children_registered(orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+
+    /* the thread is locked elsewhere - don't try to do it again here */
+
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+
+        /* is this child part of the specified job? */
+        if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
+            /* if this child has terminated, we consider it as having
+             * registered for the purposes of this function. If it never
+             * did register, then we will send a NULL rml_uri back to
+             * the HNP, which will then know that the proc did not register.
+             * If other procs did register, then the HNP can declare an
+             * abnormal termination
+             */
+            if (ORTE_PROC_STATE_UNTERMINATED < child->state) {
+                /* this proc has terminated somehow - consider it
+                 * as registered for now
+                 */
+                continue;
+            }
+            /* if this child is *not* registered yet, return false */
+            if (!child->init_recvd) {
+                return false;
+            }
+            /* if this child has registered a finalize, return false */
+            if (child->fini_recvd) {
+                return false;
+            }
+        }
+    }
+
+    /* if we get here, then everyone in the job is currently registered */
+    return true;
+
+}
+
+/*
+ * Pack, for every local child that belongs to "job", its vpid, its
+ * epoch and its rml_uri into "buf" (in that order).
+ *
+ * Returns ORTE_SUCCESS, or the code of the first pack call that failed
+ * (the failure is logged before returning).
+ */
+static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+    int rc;
+
+    /* the thread is locked elsewhere - don't try to do it again here */
+
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+
+        /* is this child part of the specified job? */
+        if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) {
+            /* pack the child's vpid - must be done in case rml_uri is NULL */
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            /* Pack the child's epoch. */
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+            /* pack the contact info */
+            if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) {
+                ORTE_ERROR_LOG(rc);
+                return rc;
+            }
+        }
+    }
+
+    return ORTE_SUCCESS;
+
+}
+
+/*
+ * Mark a job as having failed to start.
+ *
+ * Sets the job state to ORTE_JOB_STATE_FAILED_TO_START and, for every
+ * local child of that job that never got as far as being launched,
+ * flags both iof_complete and waitpid_recvd.
+ *
+ * "exit_code" is not used by this function.
+ */
+static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+
+    /* set the state */
+    jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;
+
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+        if (child->name->jobid == jobdat->jobid) {
+            if (ORTE_PROC_STATE_LAUNCHED > child->state ||
+                ORTE_PROC_STATE_FAILED_TO_START == child->state) {
+                /* this proc never launched - flag that the iof
+                 * is complete or else we will hang waiting for
+                 * pipes to close that were never opened
+                 */
+                child->iof_complete = true;
+                /* ditto for waitpid */
+                child->waitpid_recvd = true;
+            }
+        }
+    }
+    /* label the message with this module's name, as the other
+     * diagnostics in this file do, rather than the HNP's
+     */
+    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output,
+                         "%s errmgr:ortedresil: job %s reported incomplete start",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_JOBID_PRINT(jobdat->jobid)));
+}
+
+/*
+ * Set the job's state to "jobstate" and the state of every local child
+ * belonging to that job to "state".
+ */
+static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+
+    /* update job state */
+    jobdat->state = jobstate;
+    /* update children */
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+        if (jobdat->jobid == child->name->jobid) {
+            child->state = state;
+        }
+    }
+}
+
+static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch)
+{
+    opal_pointer_array_t cmd;
+    orte_proc_t proc;
+    int rc;
+
+    /* stop local sensors for this job */
+    
if (ORTE_VPID_WILDCARD == vpid) {
+        orte_sensor.stop(job);
+    }
+
+    if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
+        if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
+            ORTE_ERROR_LOG(rc);
+        }
+        return;
+    }
+
+    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
+    OBJ_CONSTRUCT(&proc, orte_proc_t);
+    proc.name.jobid = job;
+    proc.name.vpid = vpid;
+    proc.name.epoch = epoch;
+    opal_pointer_array_add(&cmd, &proc);
+    if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    OBJ_DESTRUCT(&cmd);
+    OBJ_DESTRUCT(&proc);
+}
+
+/*
+ * Record a single process as dead and report it to the HNP.
+ *
+ * Nothing is done (and ORTE_SUCCESS is returned) when
+ * orte_odls_base_default_check_finished() is true for the proc.
+ * Otherwise the proc is passed to mark_processes_as_dead() and an
+ * ORTE_PROCESS_FAILED_NOTIFICATION message (command, count of one,
+ * proc name) is sent to the HNP on ORTE_RML_TAG_DAEMON.
+ *
+ * The message is only sent when it was packed completely.  Returns
+ * ORTE_SUCCESS, or the error from packing or from the send.  "proc"
+ * stays owned by the caller.
+ */
+static int record_dead_process(orte_process_name_t *proc) {
+    opal_pointer_array_t *dead_name;
+    opal_buffer_t *buffer;
+    orte_daemon_cmd_flag_t command = ORTE_PROCESS_FAILED_NOTIFICATION;
+    int rc = ORTE_SUCCESS;
+    int num_failed = 1;
+
+    if (orte_odls_base_default_check_finished(proc)) {
+        return rc;
+    }
+
+    dead_name = OBJ_NEW(opal_pointer_array_t);
+
+    opal_pointer_array_add(dead_name, proc);
+
+    /* Mark the process as dead */
+    mark_processes_as_dead(dead_name);
+
+    /* Build the message for the HNP */
+    buffer = OBJ_NEW(opal_buffer_t);
+
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
+        ORTE_ERROR_LOG(rc);
+        goto cleanup;
+    }
+
+    /* a negative return is an error; anything else counts as success */
+    if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0))) {
+        ORTE_ERROR_LOG(rc);
+    } else {
+        rc = ORTE_SUCCESS;
+    }
+
+cleanup:
+    OBJ_RELEASE(buffer);
+    OBJ_RELEASE(dead_name);
+
+    return rc;
+}
+
+/*
+ * Deliver the list of dead process names to all local applications on
+ * ORTE_RML_TAG_EPOCH_CHANGE.
+ *
+ * The message is a count followed by that many names.  NULL slots in
+ * "dead_names" are skipped, so the count is the number of non-NULL
+ * entries rather than the array's size; otherwise the receiver would be
+ * told to expect more names than the message contains.
+ *
+ * Returns ORTE_SUCCESS, or the first pack/delivery error (logged).
+ * "dead_names" and its entries stay owned by the caller.
+ */
+int send_to_local_applications(opal_pointer_array_t *dead_names) {
+    opal_buffer_t *buf;
+    int ret;
+    orte_process_name_t *name_item;
+    int size, i;
+    int num_names = 0;
+
+    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
+                "%s Sending failure to local applications.",
+                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    size = opal_pointer_array_get_size(dead_names);
+
+    /* count what will actually be packed below */
+    for (i = 0; i < size; i++) {
+        if (NULL != opal_pointer_array_get_item(dead_names, i)) {
+            num_names++;
+        }
+    }
+
+    buf = OBJ_NEW(opal_buffer_t);
+
+    if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &num_names, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(ret);
+        goto cleanup;
+    }
+
+    for (i = 0; i < size; i++) {
+        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
+            if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) {
+                ORTE_ERROR_LOG(ret);
+                goto cleanup;
+            }
+        }
+    }
+
+    if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) {
+        ORTE_ERROR_LOG(ret);
+    }
+
+cleanup:
+    OBJ_RELEASE(buf);
+
+    return ret;
+}
+
diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h
new file mode 100644
index 0000000000..6d9cefa6c8
--- /dev/null
+++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ */
+
+#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H
+#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H
+
+#include "orte_config.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Local Component structures
+ */
+
+ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component;
+
+ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module;
+
+END_C_DECLS
+
+#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */
diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c b/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c
new file mode 100644
index 0000000000..a3ece3f2f1
--- /dev/null
+++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/errmgr/base/base.h"
+#include "errmgr_ortedresil.h"
+
+/*
+ * Public string for version number
+ */
+const char *orte_errmgr_ortedresil_component_version_string = 
+    "ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION;
+
+/*
+ * Local functionality: component open, close and query entry points
+ */
+static int errmgr_ortedresil_open(void);
+static int errmgr_ortedresil_close(void);
+static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointer to our public functions in it
+ */
+orte_errmgr_base_component_t mca_errmgr_ortedresil_component =
+{
+    /* Handle the general mca_component_t struct containing 
+     *  meta information about the component itself
+     */
+    {
+        ORTE_ERRMGR_BASE_VERSION_3_0_0,
+        /* Component name and version */
+        "ortedresil",
+        ORTE_MAJOR_VERSION,
+        ORTE_MINOR_VERSION,
+        ORTE_RELEASE_VERSION,
+        
+        /* Component open, close and query functions */
+        errmgr_ortedresil_open,
+        errmgr_ortedresil_close,
+        errmgr_ortedresil_component_query
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    }
+};
+
+static int errmgr_ortedresil_open(void) 
+{
+    return ORTE_SUCCESS;
+}
+
+static int errmgr_ortedresil_close(void)
+{
+    return ORTE_SUCCESS;
+}
+
+static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority)
+{
+    if (ORTE_PROC_IS_DAEMON) {
+        /* daemons only: offer our module at priority 0, kept low so
+         * that other modules are higher and will run before us
+         */
+        *priority = 0;
+        *module = (mca_base_module_t *)&orte_errmgr_ortedresil_module;
+        return ORTE_SUCCESS;        
+    }
+    
+    *priority = -1;
+    *module = NULL;
+    return ORTE_ERROR;
+}
+
diff --git a/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt b/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt
new file
mode 100644 index 0000000000..c6d43f1f77 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt @@ -0,0 +1,14 @@ + -*- text -*- +# +# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for ORTE RecoS IGNORE framework. +# diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 124a097f69..befe43f3db 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -240,10 +240,12 @@ int orte_ess_base_app_setup(void) } /* Execute the post-startup errmgr code */ - if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { - ORTE_ERROR_LOG(ret); - error = "orte_errmgr.post_startup"; - goto error; + if (NULL != orte_errmgr.post_startup) { + if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr.post_startup"; + goto error; + } } /* if we are an ORTE app - and not an MPI app - then @@ -278,7 +280,9 @@ error: int orte_ess_base_app_finalize(void) { - orte_errmgr.pre_shutdown(); + if (NULL != orte_errmgr.pre_shutdown) { + orte_errmgr.pre_shutdown(); + } orte_notifier_base_close(); diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 2a08182c21..1325cc818b 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -505,10 +505,12 @@ int orte_ess_base_orted_setup(char **hosts) orte_sensor.start(ORTE_PROC_MY_NAME->jobid); /* Execute the post-startup errmgr code */ - if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { - ORTE_ERROR_LOG(ret); - error = "orte_errmgr.post_startup"; - goto error; + if (NULL != orte_errmgr.post_startup) { + if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { + ORTE_ERROR_LOG(ret); + error = 
"orte_errmgr.post_startup"; + goto error; + } } return ORTE_SUCCESS; @@ -523,7 +525,9 @@ int orte_ess_base_orted_setup(char **hosts) int orte_ess_base_orted_finalize(void) { - orte_errmgr.pre_shutdown(); + if (NULL != orte_errmgr.pre_shutdown) { + orte_errmgr.pre_shutdown(); + } /* stop the local sensors */ orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);