diff --git a/ompi/runtime/ompi_mpi_init.c b/ompi/runtime/ompi_mpi_init.c index 706a34cfef..3f751a98ee 100644 --- a/ompi/runtime/ompi_mpi_init.c +++ b/ompi/runtime/ompi_mpi_init.c @@ -371,7 +371,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided) } /* Register errhandler callback with orte errmgr */ - orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback); + if (NULL != orte_errmgr.set_fault_callback) { + orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback); + } /* Figure out the final MPI thread levels. If we were not compiled for support for MPI threads, then don't allow diff --git a/orte/mca/errmgr/app/errmgr_app.c b/orte/mca/errmgr/app/errmgr_app.c index 459112fc75..97332cc655 100644 --- a/orte/mca/errmgr/app/errmgr_app.c +++ b/orte/mca/errmgr/app/errmgr_app.c @@ -1,13 +1,9 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. - * * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. * - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
- * * $COPYRIGHT$ * * Additional copyrights may follow @@ -26,15 +22,11 @@ #endif #include "opal/util/output.h" -#include "opal/dss/dss.h" -#include "opal/mca/event/event.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" #include "orte/mca/routed/routed.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" @@ -56,22 +48,9 @@ static int update_state(orte_jobid_t job, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code); - static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs); -static int post_startup(void); -static int pre_shutdown(void); - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata); -void epoch_change(int fd, - short event, - void *data); - /****************** * HNP module ******************/ @@ -86,11 +65,11 @@ orte_errmgr_base_module_t orte_errmgr_app_module = { NULL, NULL, orte_errmgr_base_register_migration_warning, - post_startup, - pre_shutdown, - NULL, - orte_errmgr_base_set_fault_callback, - NULL + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; /************************ * API Definitions @@ -113,8 +92,6 @@ static int update_state(orte_jobid_t job, pid_t pid, orte_exit_code_t exit_code) { - orte_ns_cmp_bitmask_t mask; - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:app: job %s reported state %s" " for proc %s state %s exit_code %d", @@ -132,9 +109,9 @@ static int update_state(orte_jobid_t job, } if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + if (ORTE_PROC_MY_NAME->jobid == proc->jobid && + 
ORTE_PROC_MY_NAME->vpid == proc->vpid) { return ORTE_SUCCESS; } @@ -148,95 +125,6 @@ static int update_state(orte_jobid_t job, return ORTE_SUCCESS; } -static int post_startup(void) { - int ret = ORTE_SUCCESS; - - ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, - ORTE_RML_TAG_EPOCH_CHANGE, - ORTE_RML_PERSISTENT, - epoch_change_recv, - NULL); - - return ret; -} - -static int pre_shutdown(void) { - int ret = ORTE_SUCCESS; - - ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON, - ORTE_RML_TAG_EPOCH_CHANGE); - - return ret; -} - -void epoch_change_recv(int status, - orte_process_name_t *sender, - opal_buffer_t *buffer, - orte_rml_tag_t tag, - void *cbdata) { - - ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); -} - -void epoch_change(int fd, - short event, - void *data) { - orte_message_event_t *mev = (orte_message_event_t *) data; - opal_buffer_t *buffer = mev->buffer; - orte_process_name_t *proc; - int n = 1, ret, num_dead, i; - opal_pointer_array_t *procs; - - if (orte_finalizing || orte_job_term_ordered || orte_orteds_term_ordered) { - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Received epoch change notification", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - procs = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - - proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); - for (i = 0; i < num_dead; i++) { - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - return; - } - proc[i].epoch++; - orte_util_set_epoch(&proc[i], proc[i].epoch); - - opal_pointer_array_add(procs, &proc[i]); - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Epoch for %s updated", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&proc[i]))); - } - - if (NULL != fault_cbfunc && 0 < num_dead) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - (*fault_cbfunc)(procs); - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "%s errmgr:app Calling fault callback failed!", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - } - - free(proc); - OBJ_RELEASE(procs); -} - static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) { int ret, exit_status = ORTE_SUCCESS; @@ -278,7 +166,7 @@ static int orte_errmgr_app_abort_peers(orte_process_name_t *procs, orte_std_cntr goto cleanup; } -cleanup: + cleanup: OBJ_DESTRUCT(&buffer); return exit_status; diff --git a/orte/mca/errmgr/appresil/.windows b/orte/mca/errmgr/appresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/appresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/appresil/Makefile.am b/orte/mca/errmgr/appresil/Makefile.am new file mode 100644 index 0000000000..4513d841cd --- /dev/null +++ b/orte/mca/errmgr/appresil/Makefile.am @@ -0,0 +1,36 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +sources = \ + errmgr_appresil.h \ + errmgr_appresil_component.c \ + errmgr_appresil.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_errmgr_appresil_DSO +component_noinst = +component_install = mca_errmgr_appresil.la +else +component_noinst = libmca_errmgr_appresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_appresil_la_SOURCES = $(sources) +mca_errmgr_appresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_appresil_la_SOURCES =$(sources) +libmca_errmgr_appresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/appresil/errmgr_appresil.c b/orte/mca/errmgr/appresil/errmgr_appresil.c new file mode 100644 index 0000000000..72f61d9a6c --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/dss/dss.h" +#include "opal/mca/event/event.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/odls/odls_types.h" + +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "errmgr_appresil.h" + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, + orte_std_cntr_t num_procs); + +static int post_startup(void); +static int pre_shutdown(void); + +void epoch_change_recv(int status, + orte_process_name_t *sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void *cbdata); +void epoch_change(int fd, + short event, + void *data); + +/****************** + * HNP module + ******************/ +orte_errmgr_base_module_t orte_errmgr_appresil_module = { + init, + finalize, + orte_errmgr_base_log, + orte_errmgr_base_abort, + orte_errmgr_appresil_abort_peers, + update_state, + NULL, + NULL, + NULL, + orte_errmgr_base_register_migration_warning, + post_startup, + pre_shutdown, + NULL, + orte_errmgr_base_set_fault_callback, + NULL +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + 
return ORTE_SUCCESS; +} + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_ns_cmp_bitmask_t mask; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil: job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), exit_code)); + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + if (ORTE_PROC_STATE_COMM_FAILED == state) { + mask = ORTE_NS_CMP_ALL; + /* if it is our own connection, ignore it */ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + return ORTE_SUCCESS; + } + + /* delete the route */ + orte_routed.delete_route(proc); + /* see is this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + return ORTE_ERR_UNRECOVERABLE; + } + } + return ORTE_SUCCESS; +} + +static int post_startup(void) { + int ret = ORTE_SUCCESS; + + ret = orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, + ORTE_RML_TAG_EPOCH_CHANGE, + ORTE_RML_PERSISTENT, + epoch_change_recv, + NULL); + + return ret; +} + +static int pre_shutdown(void) { + int ret = ORTE_SUCCESS; + + ret = orte_rml.recv_cancel(ORTE_PROC_MY_DAEMON, + ORTE_RML_TAG_EPOCH_CHANGE); + + return ret; +} + +void epoch_change_recv(int status, + orte_process_name_t *sender, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void *cbdata) { + + ORTE_MESSAGE_EVENT(sender, buffer, tag, epoch_change); +} + +void epoch_change(int fd, + short event, + void *data) { + orte_message_event_t *mev = (orte_message_event_t *) data; + opal_buffer_t *buffer = mev->buffer; + orte_process_name_t *proc; + int n = 1, ret, num_dead, i; + opal_pointer_array_t *procs; + + if (orte_finalizing || 
orte_job_term_ordered || orte_orteds_term_ordered) { + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Received epoch change notification", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + procs = OBJ_NEW(opal_pointer_array_t); + + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_dead, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + + proc = (orte_process_name_t *) malloc(sizeof(orte_process_name_t) * num_dead); + for (i = 0; i < num_dead; i++) { + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &proc[i], &n, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + opal_output(0, "%s Error unpacking message.", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + return; + } + proc[i].epoch++; + orte_util_set_epoch(&proc[i], proc[i].epoch); + + opal_pointer_array_add(procs, &proc[i]); + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Epoch for %s updated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc[i]))); + } + + if (NULL != fault_cbfunc && 0 < num_dead) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Calling fault callback", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + (*fault_cbfunc)(procs); + } else { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:appresil Calling fault callback failed!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + } + + free(proc); + OBJ_RELEASE(procs); +} + +static int orte_errmgr_appresil_abort_peers(orte_process_name_t *procs, orte_std_cntr_t num_procs) +{ + int ret, exit_status = ORTE_SUCCESS; + opal_buffer_t buffer; + orte_std_cntr_t i; + orte_daemon_cmd_flag_t command = ORTE_DAEMON_ABORT_PROCS_CALLED; + + /* + * Pack up the list of processes and send them to the HNP + */ + OBJ_CONSTRUCT(&buffer, opal_buffer_t); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto 
cleanup; + } + + /* pack number of processes */ + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(num_procs), 1, ORTE_STD_CNTR))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* Pack the list of names */ + for( i = 0; i < num_procs; ++i ) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(&buffer, &(procs[i]), 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + + /* Send to HNP for termination */ + if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buffer, ORTE_RML_TAG_DAEMON, 0))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + +cleanup: + OBJ_DESTRUCT(&buffer); + + return exit_status; +} diff --git a/orte/mca/errmgr/appresil/errmgr_appresil.h b/orte/mca/errmgr/appresil/errmgr_appresil.h new file mode 100644 index 0000000000..d72d4056f7 --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_APPRESIL_EXPORT_H +#define MCA_ERRMGR_APPRESIL_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_appresil_component; + +ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_appresil_module; + +END_C_DECLS + +#endif /* MCA_ERRMGR_APPRESIL_EXPORT_H */ diff --git a/orte/mca/errmgr/appresil/errmgr_appresil_component.c b/orte/mca/errmgr/appresil/errmgr_appresil_component.c new file mode 100644 index 0000000000..33023d1052 --- /dev/null +++ b/orte/mca/errmgr/appresil/errmgr_appresil_component.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "errmgr_appresil.h" + +/* + * Public string for version number + */ +const char *orte_errmgr_appresil_component_version_string = + "ORTE ERRMGR appresil MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int errmgr_appresil_open(void); +static int errmgr_appresil_close(void); +static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_errmgr_base_component_t mca_errmgr_appresil_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component itapp + */ + { + ORTE_ERRMGR_BASE_VERSION_3_0_0, + /* Component name and version */ + "appresil", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + errmgr_appresil_open, + errmgr_appresil_close, + errmgr_appresil_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + /* Verbosity level */ + 0, + /* opal_output handler */ + -1, + /* Default priority */ + 0 +}; + +static int errmgr_appresil_open(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_appresil_close(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_appresil_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_APP) { + /* keep our priority low so that other modules are higher + * and will run before us + */ + *priority = 0; + *module = (mca_base_module_t *)&orte_errmgr_appresil_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.c b/orte/mca/errmgr/hnp/errmgr_hnp.c 
index d065d952ff..381a3d4338 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp.c @@ -1,11 +1,8 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -40,14 +37,11 @@ #include "orte/mca/routed/routed.h" #include "orte/mca/debugger/base/base.h" #include "orte/mca/notifier/notifier.h" -#include "orte/mca/grpcomm/grpcomm.h" -#include "orte/mca/ess/ess.h" #include "orte/util/error_strings.h" #include "orte/util/name_fns.h" #include "orte/util/proc_info.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_locks.h" @@ -56,7 +50,6 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" - #include "errmgr_hnp.h" /********************** @@ -83,16 +76,11 @@ static orte_errmgr_base_module_t global_module = { /* FT Event hook */ orte_errmgr_hnp_global_ft_event, orte_errmgr_base_register_migration_warning, - /* Post-startup */ - orte_errmgr_hnp_global_post_startup, - /* Pre-shutdown */ - orte_errmgr_hnp_global_pre_shutdown, - /* Mark as dead */ - orte_errmgr_hnp_global_mark_processes_as_dead, - /* Set the callback */ - orte_errmgr_base_set_fault_callback, - /* Receive failure notification */ - orte_errmgr_hnp_global_failure_notification + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; @@ -104,11 +92,10 @@ static void failed_start(orte_job_t *jdata); static void 
update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, orte_proc_state_t state, orte_exit_code_t exit_code); static void check_job_complete(orte_job_t *jdata); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, orte_proc_state_t state, orte_exit_code_t exit_code); static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); /************************ * API Definitions @@ -168,7 +155,7 @@ int orte_errmgr_hnp_global_module_init(void) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -206,7 +193,7 @@ int orte_errmgr_hnp_global_module_finalize(void) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -275,7 +262,7 @@ int orte_errmgr_hnp_global_update_state(orte_jobid_t job, } } -cleanup: + cleanup: return exit_status; } @@ -306,7 +293,7 @@ int orte_errmgr_hnp_global_predicted_fault(opal_list_t *proc_list, goto cleanup; } -cleanup: + cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; @@ -342,7 +329,7 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, } } -cleanup: + cleanup: return exit_status; #else return ORTE_ERR_NOT_IMPLEMENTED; @@ -374,7 +361,7 @@ int orte_errmgr_hnp_global_ft_event(int state) goto cleanup; } -cleanup: + cleanup: return exit_status; } @@ -404,7 +391,8 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, orte_odls_child_t *child; int rc; orte_app_context_t *app; - + orte_proc_t *pdat; + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s" " for proc %s state %s pid %d exit_code %d", @@ -435,7 +423,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, hnp_abort(job, exit_code); return ORTE_SUCCESS; } - + /* get the job object */ if (NULL == (jdata = 
orte_get_job_data_object(job))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); @@ -443,7 +431,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, } /* update the state */ jdata->state = jobstate; - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -536,7 +524,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, exit_code); /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't @@ -548,7 +536,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_COMM_FAILED: /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. If it isn't @@ -560,7 +548,7 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, break; case ORTE_JOB_STATE_HEARTBEAT_FAILED: /* order all local procs for this job to be killed */ - killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jdata->jobid, ORTE_VPID_WILDCARD); check_job_complete(jdata); /* set the local proc states */ /* the job object for this job will have been NULL'd * in the array if the job was solely local. 
If it isn't @@ -629,11 +617,6 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, /* guess not - let it fall thru to abort */ } } - - if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { - exit_code = 0; - } - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); check_job_complete(jdata); /* need to set the job state */ /* the job object for this job will have been NULL'd @@ -645,168 +628,174 @@ int orte_errmgr_hnp_base_global_update_state(orte_jobid_t job, } break; - case ORTE_PROC_STATE_FAILED_TO_START: - case ORTE_PROC_STATE_CALLED_ABORT: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_CALLED_ABORT: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_REGISTERED: + case ORTE_PROC_STATE_RUNNING: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_LAUNCHED: + /* record the pid for this child */ + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_TERMINATED: + case ORTE_PROC_STATE_TERM_NON_ZERO: + case ORTE_PROC_STATE_KILLED_BY_CMD: + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (jdata->enable_recovery) { + killprocs(proc->jobid, proc->vpid); + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + /* kill all jobs */ + orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_COMM_FAILED: + /* is this to a daemon? */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* if this is my own connection, ignore it */ + if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s My own connection - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + /* if we have ordered orteds to terminate, record it */ + if (orte_orteds_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Daemons terminating - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0); + /* check for complete */ + check_job_complete(jdata); + break; + } + /* if abort is in progress, see if this one failed to tell + * us it had terminated */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); + if (orte_abnormal_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Abort in progress - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* check for complete */ + check_job_complete(jdata); + break; } - break; - case ORTE_PROC_STATE_REGISTERED: - case ORTE_PROC_STATE_RUNNING: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); - case ORTE_PROC_STATE_LAUNCHED: - /* record 
the pid for this child */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - break; - - case ORTE_PROC_STATE_TERMINATED: - case ORTE_PROC_STATE_TERM_NON_ZERO: - case ORTE_PROC_STATE_KILLED_BY_CMD: - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); - break; - - case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: - if (jdata->enable_recovery) { - killprocs(proc->jobid, proc->vpid, proc->epoch); - /* is this a local proc */ - if (NULL != (child = proc_is_local(proc))) { - /* local proc - see if it has reached its restart limit */ - app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); - if (child->restarts < app->max_restarts) { - child->restarts++; - if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { - return ORTE_SUCCESS; - } - /* reset the child's state as restart_proc would - * have cleared it - */ - child->state = state; - /* see if we can relocate it somewhere else */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* let it fall thru to abort */ - } - } else { - /* this is a remote process - see if we can relocate it */ - if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { - return ORTE_SUCCESS; - } - /* guess not - let it fall thru to abort */ - } - } - /* kill all jobs */ - orte_errmgr_hnp_update_proc(jdata, proc, state, pid, exit_code); - check_job_complete(jdata); /* need to set the job state */ - /* the job object for this job will have been NULL'd - * in the array if the job was solely local. If it isn't - * NULL, then we need to tell everyone else to die - */ - if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { - hnp_abort(jdata->jobid, exit_code); - } - break; - - case ORTE_PROC_STATE_COMM_FAILED: - /* is this to a daemon? 
*/ - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - /* if this is my own connection, ignore it */ - if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s My own connection - ignoring it", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - break; - } - /* if we have ordered orteds to terminate, record it */ - if (orte_orteds_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Daemons terminating - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - break; - } - /* if abort is in progress, see if this one failed to tell - * us it had terminated - */ - if (orte_abnormal_term_ordered) { - OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, - "%s Abort in progress - recording daemon %s as gone", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); - /* remove from dependent routes, if it is one */ - orte_routed.route_lost(proc); - /* update daemon job */ - orte_errmgr_hnp_record_dead_process(proc); - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process - */ - break; - } - - /* delete the route */ - orte_routed.delete_route(proc); - /* purge the oob */ - orte_rml.purge(proc); - - if( orte_enable_recovery ) { - /* relocate its processes */ - if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { - /* unable to relocate for some reason */ - opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); - /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); - /* kill all jobs */ - 
hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - /* check if all is complete so we can terminate */ - check_job_complete(jdata); - } - } else { - if (ORTE_SUCCESS != orte_errmgr_hnp_record_dead_process(proc)) { - /* The process is already dead so don't keep trying to do - * this stuff. */ - return ORTE_SUCCESS; - } - /* We'll check if the job was complete when we get the - * message back from the HNP notifying us of the dead - * process */ - } - } - break; - - case ORTE_PROC_STATE_HEARTBEAT_FAILED: - /* heartbeats are only from daemons */ if( orte_enable_recovery ) { /* relocate its processes */ + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, exit_code))) { + /* unable to relocate for some reason */ + opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); + } } else { - orte_errmgr_hnp_record_dead_process(proc); + if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, + ORTE_VPID_PRINT(proc->vpid), "Unknown"); + } else { + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:daemon-died", true, + ORTE_VPID_PRINT(proc->vpid), + (NULL == pdat->node) ? "Unknown" : + ((NULL == pdat->node->name) ? 
"Unknown" : pdat->node->name)); + } + /* remove this proc from the daemon job */ + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* kill all jobs */ hnp_abort(ORTE_JOBID_WILDCARD, exit_code); - return ORTE_ERR_UNRECOVERABLE; + /* check if all is complete so we can terminate */ + check_job_complete(jdata); } - break; + } + break; - default: - break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + /* heartbeats are only from daemons */ + if( orte_enable_recovery ) { + /* relocate its processes */ + } else { + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + return ORTE_ERR_UNRECOVERABLE; + } + break; + + default: + break; } return ORTE_SUCCESS; @@ -817,177 +806,13 @@ int orte_errmgr_hnp_base_global_ft_event(int state) return ORTE_SUCCESS; } -int orte_errmgr_hnp_global_post_startup(void) { - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_pre_shutdown(void) { - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - opal_pointer_array_t *dead_names; - int32_t i; - orte_process_name_t *name_item; - orte_epoch_t epoch; - orte_job_t *jdat; - orte_proc_t *pdat, *pdat2; - opal_buffer_t *answer; - orte_daemon_cmd_flag_t command; - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received process failed from orted %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(sender)); - } - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - 
dead_names = OBJ_NEW(opal_pointer_array_t); - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - /* Unpack the buffer to get the dead process' name. */ - n = 1; - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - /* Check to see if the message is telling us about an old epoch. - * If so ignore the message. - */ - epoch = orte_util_lookup_epoch(name_item); - if (name_item->epoch < epoch) { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - free(name_item); - continue; - } else { - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_EPOCH_PRINT(name_item->epoch), - ORTE_EPOCH_PRINT(epoch)); - } - } - - opal_pointer_array_add(dead_names, name_item); - - /* Check to see if the message is telling us about an orted and - * it is from another orted. Orteds don't have the list of all - * the application processes so they don't know if there were - * any child processes on the nodes that they are reporting. 
*/ - if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - continue; - } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { - continue; - } else if (NULL == pdat->node) { - continue; - } - - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } else { - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - name_item->jobid = pdat2->name.jobid; - name_item->vpid = pdat2->name.vpid; - name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); - - opal_pointer_array_add(dead_names, name_item); - } - } - } - } - - } - - /* Update the number of failed process so any duplicates don't get - * re-reported. - */ - num_failed = opal_pointer_array_get_size(dead_names); - - if (num_failed > 0) { - orte_errmgr.mark_processes_as_dead(dead_names); - - if (!orte_orteds_term_ordered) { - /* Send a message out to all the orteds to inform them that the - * process is dead. Long live the process (or not if it is so - * decided)! 
- */ - answer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(answer); - return ret; - } - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return ret; - } - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - } - - OBJ_RELEASE(dead_names); - - return ret; -} - /***************** * Local Functions *****************/ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) { int rc; - + /* if we are already in progress, then ignore this call */ if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, @@ -996,7 +821,7 @@ static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) ORTE_JOBID_PRINT(job), exit_code)); return; } - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: abort called on job %s with status %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -1030,14 +855,14 @@ static void failed_start(orte_job_t *jdata) orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; - + /* 
lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == jdata->jobid) { break; @@ -1048,7 +873,7 @@ static void failed_start(orte_job_t *jdata) return; } jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = next) { @@ -1070,7 +895,7 @@ static void failed_start(orte_job_t *jdata) } } } - + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, "%s errmgr:hnp: job %s reported incomplete start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -1084,14 +909,14 @@ static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobsta orte_odls_job_t *jobdat; orte_odls_child_t *child; orte_proc_t *proc; - + /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? 
*/ if (jobdat->jobid == jdata->jobid) { break; @@ -1207,7 +1032,7 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, } } } - + /*** UPDATE REMOTE CHILD ***/ for (i=0; i < jdata->procs->size; i++) { if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { @@ -1275,14 +1100,14 @@ static void check_job_complete(orte_job_t *jdata) */ continue; } - + if (0 != proc->exit_code) { non_zero++; if (0 == lowest) { lowest = proc->exit_code; } } - + switch (proc->state) { case ORTE_PROC_STATE_KILLED_BY_CMD: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -1333,7 +1158,6 @@ static void check_job_complete(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; -#if 0 case ORTE_PROC_STATE_ABORTED_BY_SIG: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s aborted by signal", @@ -1349,7 +1173,6 @@ static void check_job_complete(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } break; -#endif case ORTE_PROC_STATE_TERM_WO_SYNC: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:hnp:check_job_completed proc %s terminated without sync", @@ -1372,7 +1195,6 @@ static void check_job_complete(orte_job_t *jdata) } break; case ORTE_PROC_STATE_COMM_FAILED: -#if 0 if (!jdata->abort) { jdata->state = ORTE_JOB_STATE_COMM_FAILED; /* point to the lowest rank to cause the problem */ @@ -1382,7 +1204,6 @@ static void check_job_complete(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } -#endif break; case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: if (!jdata->abort) { @@ -1452,7 +1273,7 @@ static void check_job_complete(orte_job_t *jdata) break; } } - + if (jdata->abort) { /* the job aborted - turn off any sensors on this job */ orte_sensor.stop(jdata->jobid); @@ -1487,7 +1308,7 @@ static void check_job_complete(orte_job_t *jdata) ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); } - + /* if this job is a continuously operating one, then 
don't do * anything further - just return here */ @@ -1496,7 +1317,7 @@ static void check_job_complete(orte_job_t *jdata) ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { goto CHECK_ALIVE; } - + /* if the job that is being checked is the HNP, then we are * trying to terminate the orteds. In that situation, we * do -not- check all jobs - we simply notify the HNP @@ -1507,9 +1328,9 @@ static void check_job_complete(orte_job_t *jdata) * This can happen if a ctrl-c hits in the "wrong" place * while launching */ -CHECK_DAEMONS: + CHECK_DAEMONS: if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { - if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */ + if (0 == orte_routed.num_routes()) { /* orteds are done! */ OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s orteds complete - exiting", @@ -1523,7 +1344,7 @@ CHECK_DAEMONS: } return; } - + /* Release the resources used by this job. Since some errmgrs may want * to continue using resources allocated to the job as part of their * fault recovery procedure, we only do this once the job is "complete". 
@@ -1565,8 +1386,8 @@ CHECK_DAEMONS: OBJ_RELEASE(map); jdata->map = NULL; } - -CHECK_ALIVE: + + CHECK_ALIVE: /* now check to see if all jobs are done - release this jdata * object when we find it */ @@ -1672,29 +1493,28 @@ CHECK_ALIVE: } } -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) { opal_pointer_array_t cmd; orte_proc_t proc; int rc; - + /* stop local sensors for this job */ if (ORTE_VPID_WILDCARD == vpid) { orte_sensor.stop(job); } - - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) { + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { ORTE_ERROR_LOG(rc); } return; } - + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - proc.name.epoch = epoch; opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); @@ -1731,7 +1551,7 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, */ if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { /* remove this proc from the daemon job */ - orte_errmgr_hnp_record_dead_process(proc); + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, exit_code); /* check to see if any other nodes are "alive" */ if (!orte_hnp_is_allocated && jdata->num_procs == 1) { return ORTE_ERR_FATAL; @@ -1815,10 +1635,10 @@ static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, if (app->max_restarts < pdata->restarts) { return ORTE_ERR_RESTART_LIMIT_EXCEEDED; } - + /* reset the job params for restart */ orte_plm_base_reset_job(jdata); - + /* flag the current node as not-to-be-used */ pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; @@ -1842,7 +1662,7 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) { orte_odls_child_t *child; opal_list_item_t *item; - 
+ child = NULL; for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); @@ -1856,229 +1676,59 @@ static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) return NULL; } -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc) { - orte_job_t *jdat; - orte_proc_t *pdat; - opal_buffer_t *buffer; - orte_daemon_cmd_flag_t command; - int i, rc, num_failed; - opal_pointer_array_t *dead_names; - orte_process_name_t *name_item; - orte_proc_t *proc_item; - - if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) { - opal_output(0, "Can't find job object"); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && - ORTE_PROC_STATE_TERMINATED != pdat->state) { - - /* Make sure that the epochs match. */ - if (proc->epoch != pdat->name.epoch) { - opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); - return ORTE_SUCCESS; - } - - dead_names = OBJ_NEW(opal_pointer_array_t); - - if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { - opal_pointer_array_add(dead_names, &(pdat->name)); - - for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { - if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { - continue; - } - - opal_pointer_array_add(dead_names, &(proc_item->name)); - } - } - - if (!orte_orteds_term_ordered) { - /* - * Send a message to the other daemons so they know that a daemon has - * died. 
- */ - buffer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - num_failed = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } else { - - /* Iterate of the list of dead procs and send them along with - * the rest. The HNP needs this info so it can tell the other - * ORTEDs and they can inform the appropriate applications. - */ - for (i = 0; i < num_failed; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(buffer); - } - } - } - - OBJ_RELEASE(dead_names); - - orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0); - - OBJ_RELEASE(buffer); - } - } else { - orte_errmgr_hnp_global_mark_processes_as_dead(dead_names); - } - } - - return ORTE_SUCCESS; -} - -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - orte_job_t *jdat; +void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat, + orte_vpid_t vpid, + orte_proc_state_t state, + orte_exit_code_t exit_code) +{ + orte_job_t *jdt; orte_proc_t *pdat; orte_node_t *node; + int i; - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking procs as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - /* Iterate over the list of processes */ - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(1, "NULL found in dead process list."); - continue; - } - - if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { - OPAL_OUTPUT_VERBOSE((1, 
orte_errmgr_base.output, - "%s Job data not found.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - return ORTE_ERR_NOT_FOUND; - } - - if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && - ORTE_PROC_STATE_TERMINATED != pdat->state) { - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "HNP %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(&pdat->name))); - - /* Make sure the epochs match, if not it probably means that we - * already reported this failure. */ - if (name_item->epoch != pdat->name.epoch) { + if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, vpid)) && + ORTE_PROC_STATE_TERMINATED != pdat->state) { + /* need to record that this one died */ + pdat->state = state; + pdat->exit_code = exit_code; + ORTE_UPDATE_EXIT_STATUS(exit_code); + /* remove it from the job array */ + opal_pointer_array_set_item(jdat->procs, vpid, NULL); + orte_process_info.num_procs--; + jdat->num_procs--; + /* mark the node as down so it won't be used in mapping + * procs to be relaunched + */ + node = pdat->node; + node->state = ORTE_NODE_STATE_DOWN; + node->daemon = NULL; + OBJ_RELEASE(pdat); /* maintain accounting */ + /* mark all procs on this node as having terminated */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdat = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { continue; } - - orte_util_set_epoch(name_item, name_item->epoch + 1); - - /* Remove it from the job array */ - opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); - orte_process_info.num_procs--; - jdat->num_procs--; - - /* Check if this is an ORTED */ - if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { - /* Mark the node as down so it won't be used in mapping anymore. 
*/ - node = pdat->node; - node->state = ORTE_NODE_STATE_DOWN; - node->daemon = NULL; + /* get the job data object for this process */ + if (NULL == (jdt = orte_get_job_data_object(pdat->name.jobid))) { + /* It is possible that the process job finishes before the daemons. + * In that case the process state is set to normal termination, and + * the job data has already been cleared. So no need to throw an + * error. + */ + if( ORTE_PROC_STATE_TERMINATED != pdat->state ) { + opal_output(0, + "%s Error: Failed to find job_data for proc %s (%s) on node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&pdat->name), + orte_proc_state_to_str(pdat->state), + node->name ); + /* major problem */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + continue; } - - OBJ_RELEASE(pdat); - - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; - } else { - opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); - /* Create a new proc object that will keep track of the epoch - * information */ - pdat = OBJ_NEW(orte_proc_t); - pdat->name.jobid = jdat->jobid; - pdat->name.vpid = name_item->vpid; - pdat->name.epoch = name_item->epoch + 1; - - /* Set the state as terminated so we'll know the process isn't - * actually there. */ - pdat->state = ORTE_PROC_STATE_TERMINATED; - - opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); - jdat->num_procs++; - jdat->num_terminated++; - } - - check_job_complete(jdat); - } - - if (!orte_orteds_term_ordered) { - /* Need to update the orted routing module. 
*/ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); + pdat->state = ORTE_PROC_STATE_ABORTED; + jdt->num_terminated++; } } - - return ORTE_SUCCESS; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret = ORTE_SUCCESS; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ret; } diff --git a/orte/mca/errmgr/hnp/errmgr_hnp.h b/orte/mca/errmgr/hnp/errmgr_hnp.h index 4c296d0d8f..5c54a8f537 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp.h +++ b/orte/mca/errmgr/hnp/errmgr_hnp.h @@ -1,8 +1,5 @@ /* * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* * $COPYRIGHT$ * @@ -60,6 +57,10 @@ void orte_errmgr_hnp_update_proc(orte_job_t *jdata, orte_proc_state_t state, pid_t pid, orte_exit_code_t exit_code); +void orte_errmgr_hnp_record_dead_daemon(orte_job_t *jdat, + orte_vpid_t vpid, + orte_proc_state_t state, + orte_exit_code_t exit_code); /*************************** * Module functions: Global @@ -80,11 +81,6 @@ int orte_errmgr_hnp_global_suggest_map_targets(orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); int orte_errmgr_hnp_global_ft_event(int state); -int orte_errmgr_hnp_global_post_startup(void); -int orte_errmgr_hnp_global_pre_shutdown(void); -int orte_errmgr_hnp_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); -int orte_errmgr_hnp_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); -int orte_errmgr_hnp_record_dead_process(orte_process_name_t *proc); /* HNP Versions */ int orte_errmgr_hnp_base_global_init(void); diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c index 485332fdaa..e598c93a32 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c @@ -1,10 +1,7 @@ /* - * Copyright (c) 2009-2011 The Trustees of Indiana University. + * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* * $COPYRIGHT$ * @@ -394,7 +391,6 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, orte_node_t *node = NULL; bool found = false; int num_removed = 0, num_to_remove; - orte_ns_cmp_bitmask_t mask; if( NULL == current_global_jobdata ) { return ORTE_SUCCESS; @@ -414,8 +410,8 @@ int orte_errmgr_hnp_autor_global_suggest_map_targets(orte_proc_t *proc, item = opal_list_get_next(item) ) { wp_item = (errmgr_autor_wp_item_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { + if( wp_item->name.vpid == proc->name.vpid && + wp_item->name.jobid == proc->name.jobid ) { found = true; break; } @@ -522,7 +518,6 @@ static void errmgr_autor_process_fault_app(orte_job_t *jdata, wp_item = OBJ_NEW(errmgr_autor_wp_item_t); wp_item->name.jobid = proc->jobid; wp_item->name.vpid = proc->vpid; - wp_item->name.epoch = proc->epoch; wp_item->state = state; opal_list_append(procs_pending_recovery, &(wp_item->super)); @@ -617,7 +612,7 @@ static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, /* * Record the dead daemon */ - orte_errmgr_hnp_record_dead_process(proc); + orte_errmgr_hnp_record_dead_daemon(jdata, proc->vpid, state, 0); return; } @@ -626,7 +621,6 @@ void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) { wp->name.jobid = ORTE_JOBID_INVALID; wp->name.vpid = ORTE_VPID_INVALID; - wp->name.epoch = ORTE_EPOCH_MIN; wp->state = 0; } @@ -635,7 +629,6 @@ void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) { wp->name.jobid = ORTE_JOBID_INVALID; wp->name.vpid = ORTE_VPID_INVALID; - wp->name.epoch = ORTE_EPOCH_INVALID; wp->state = 0; } diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c index b6a45d51db..63d21e1322 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_crmig.c @@ -2,9 +2,6 @@ * Copyright (c) 2009-2010 The Trustees of Indiana University. * All rights reserved. 
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. * * $COPYRIGHT$ * @@ -750,7 +747,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } @@ -807,7 +803,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } @@ -855,7 +850,6 @@ static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_ close_iof_stdin = true; iof_name.jobid = proc->name.jobid; iof_name.vpid = proc->name.vpid; - iof_name.epoch = proc->name.epoch; } } } diff --git a/orte/mca/errmgr/hnpresil/.windows b/orte/mca/errmgr/hnpresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/hnpresil/Makefile.am b/orte/mca/errmgr/hnpresil/Makefile.am new file mode 100644 index 0000000000..cb4f030a86 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/Makefile.am @@ -0,0 +1,40 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +dist_pkgdata_DATA = help-orte-errmgr-hnp.txt + +sources = \ + errmgr_hnpresil.h \ + errmgr_hnpresil_component.c \ + errmgr_hnpresil.c \ + errmgr_hnpresil_autor.c \ + errmgr_hnpresil_crmig.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). + +if MCA_BUILD_orte_errmgr_hnpresil_DSO +component_noinst = +component_install = mca_errmgr_hnpresil.la +else +component_noinst = libmca_errmgr_hnpresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_hnpresil_la_SOURCES = $(sources) +mca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_hnpresil_la_SOURCES =$(sources) +libmca_errmgr_hnpresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c new file mode 100644 index 0000000000..2070a5533e --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.c @@ -0,0 +1,2112 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_SYS_WAIT_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/util/opal_sos.h" +#include "opal/dss/dss.h" + +#include "orte/mca/rml/rml.h" +#include "orte/mca/odls/odls.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/sensor/sensor.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/debugger/base/base.h" +#include "orte/mca/notifier/notifier.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/mca/ess/ess.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" + +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_locks.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +/********************** + * C/R Mgr Components + * Global: HNP + **********************/ +static orte_errmgr_base_module_t global_module = { + /** Initialization Function */ + orte_errmgr_hnpresil_global_module_init, + /** Finalization Function */ + orte_errmgr_hnpresil_global_module_finalize, + /** Error Log */ + orte_errmgr_base_log, + /** Forced Abort */ + orte_errmgr_base_abort, + /** Peer Force Abort */ + orte_errmgr_base_abort_peers, + /** Update State */ + orte_errmgr_hnpresil_global_update_state, + /* Predicted Fault */ + orte_errmgr_hnpresil_global_predicted_fault, + /* Suggest proc to node mapping */ + orte_errmgr_hnpresil_global_suggest_map_targets, + /* FT Event hook */ + orte_errmgr_hnpresil_global_ft_event, + 
orte_errmgr_base_register_migration_warning, + /* Post-startup */ + orte_errmgr_hnpresil_global_post_startup, + /* Pre-shutdown */ + orte_errmgr_hnpresil_global_pre_shutdown, + /* Mark as dead */ + orte_errmgr_hnpresil_global_mark_processes_as_dead, + /* Set the callback */ + orte_errmgr_base_set_fault_callback, + /* Receive failure notification */ + orte_errmgr_hnpresil_global_failure_notification +}; + + +/* + * Local functions + */ +static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code); +static void failed_start(orte_job_t *jdata); +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code); +static void check_job_complete(orte_job_t *jdata); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code); +static orte_odls_child_t* proc_is_local(orte_process_name_t *proc); +static int send_to_local_applications(opal_pointer_array_t *dead_names); + +/************************ + * API Definitions + ************************/ +int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp:component_query()"); + + if( ORTE_PROC_IS_HNP ) { + *priority = mca_errmgr_hnpresil_component.super.priority; + *module = (mca_base_module_t *)&global_module; + } + /* Daemons and Apps have their own components */ + else { + *module = NULL; + *priority = -1; + } + + return ORTE_SUCCESS; +} + +/******************* + * Global Functions + ********************/ +int orte_errmgr_hnpresil_global_module_init(void) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_module_init()) ) { + exit_status = ret; + goto 
cleanup; + } + } + else { + /* Still need the tool listener so we can tell it that we cannot do + * anything if they ask. + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_module_init()) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_init()) ) { + exit_status = ret; + goto cleanup; + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_module_finalize(void) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_module_finalize()) ) { + exit_status = ret; + goto cleanup; + } + } + else { + /* Still need the tool listener so we can tell it that we cannot do + * anything if they ask. 
+ */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_module_finalize()) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_finalize()) ) { + exit_status = ret; + goto cleanup; + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + int ret, exit_status = ORTE_SUCCESS; + + mca_errmgr_hnpresil_component.ignore_current_update = false; + + if (orte_finalizing || + orte_job_term_ordered || + ORTE_PROC_STATE_TERMINATED == state ) { + mca_errmgr_hnpresil_component.term_in_progress = true; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:hnp:update_state() %s) " + "------- %s state updated for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ((NULL == proc_name) ? "App. Process" : + (proc_name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), + (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name))); + +#if OPAL_ENABLE_FT_CR + if( mca_errmgr_hnpresil_component.crmig_enabled && + !mca_errmgr_hnpresil_component.autor_in_progress) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled && + !mca_errmgr_hnpresil_component.crmig_in_progress) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( !mca_errmgr_hnpresil_component.ignore_current_update ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_update_state(job, + jobstate, + proc_name, + state, + pid, + exit_code)) ) { + exit_status = ret; + goto cleanup; + } + } + +cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ +#if OPAL_ENABLE_FT_CR + int ret, exit_status = ORTE_SUCCESS; + + if( mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_predicted_fault(proc_list, + node_list, + suggested_map)) ) { + exit_status = ret; + goto cleanup; + } + } + /* + * If Process migration is not enabled, then return an error the tool + * which will print an appropriate message for the user. 
+ */ + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp:predicted_fault() Command line asked for a migration, but it is not enabled\n")); + orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_ERROR); + exit_status = ORTE_ERR_NOT_IMPLEMENTED; + goto cleanup; + } + +cleanup: + return exit_status; +#else + return ORTE_ERR_NOT_IMPLEMENTED; +#endif /* OPAL_ENABLE_FT_CR */ +} + +int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ +#if OPAL_ENABLE_FT_CR + int ret, exit_status = ORTE_ERR_NOT_IMPLEMENTED; + + if( mca_errmgr_hnpresil_component.crmig_enabled && + !mca_errmgr_hnpresil_component.autor_in_progress ) { + exit_status = ORTE_SUCCESS; + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_suggest_map_targets(proc, + oldnode, + node_list)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( mca_errmgr_hnpresil_component.autor_enabled && + !mca_errmgr_hnpresil_component.crmig_in_progress ) { + exit_status = ORTE_SUCCESS; + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_suggest_map_targets(proc, + oldnode, + node_list)) ) { + exit_status = ret; + goto cleanup; + } + } + +cleanup: + return exit_status; +#else + return ORTE_ERR_NOT_IMPLEMENTED; +#endif /* OPAL_ENABLE_FT_CR */ +} + +int orte_errmgr_hnpresil_global_ft_event(int state) +{ + int ret, exit_status = ORTE_SUCCESS; + +#if OPAL_ENABLE_FT_CR + if( !mca_errmgr_hnpresil_component.crmig_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + } + + if( !mca_errmgr_hnpresil_component.autor_enabled ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + } +#endif /* OPAL_ENABLE_FT_CR */ + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_base_global_ft_event(state)) ) { + exit_status = ret; + goto cleanup; + } + 
+cleanup: + return exit_status; +} + + +/********************** + * From HNP + **********************/ +int orte_errmgr_hnpresil_base_global_init(void) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_finalize(void) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_job_t *jdata; + orte_exit_code_t sts; + orte_odls_child_t *child; + int rc; + orte_app_context_t *app; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported state %s" + " for proc %s state %s pid %d exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc), + orte_proc_state_to_str(state), pid, exit_code)); + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + if (NULL == proc) { + /* this is an update for an entire local job */ + if (ORTE_JOBID_INVALID == job) { + /* whatever happened, we don't know what job + * it happened to + */ + if (ORTE_JOB_STATE_NEVER_LAUNCHED == jobstate) { + orte_never_launched = true; + } + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:unknown-job-error", + true, orte_job_state_to_str(jobstate)); + hnp_abort(job, exit_code); + return ORTE_SUCCESS; + } + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + /* update the state */ + jdata->state = jobstate; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported state %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + orte_job_state_to_str(jobstate))); + + switch (jobstate) { + case ORTE_JOB_STATE_TERMINATED: + /* support batch-operated jobs */ + 
update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_TERMINATED, 0); + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); + break; + + case ORTE_JOB_STATE_ABORTED: + /* support batch-operated jobs */ + update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_ABORTED, exit_code); + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); + break; + + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jdata); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + sts = exit_code; + if (ORTE_PROC_MY_NAME->jobid == job && !orte_abnormal_term_ordered) { + /* set the flag indicating that a daemon failed so we use the proper + * methods for attempting to shutdown the rest of the system + */ + orte_abnormal_term_ordered = true; + if (WIFSIGNALED(exit_code)) { /* died on signal */ +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + orte_show_help("help-plm-base.txt", "daemon-died-signal-core", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); + } else { + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); + } +#else + orte_show_help("help-plm-base.txt", "daemon-died-signal", true, + WTERMSIG(exit_code)); + sts = WTERMSIG(exit_code); +#endif /* WCOREDUMP */ + } else { + orte_show_help("help-plm-base.txt", "daemon-died-no-signal", true, + WEXITSTATUS(exit_code)); + sts = WEXITSTATUS(exit_code); + } + } + hnp_abort(jdata->jobid, sts); + } + break; + case ORTE_JOB_STATE_RUNNING: + /* update all procs in job */ + update_local_procs_in_job(jdata, jobstate, ORTE_PROC_STATE_RUNNING, 0); + /* record that we reported */ + jdata->num_daemons_reported++; + /* report if requested */ + if (orte_report_launch_progress) { + if (0 == 
jdata->num_daemons_reported % 100 || jdata->num_daemons_reported == orte_process_info.num_procs) { + opal_output(orte_clean_output, "Reported: %d (out of %d) daemons - %d (out of %d) procs", + (int)jdata->num_daemons_reported, (int)orte_process_info.num_procs, + (int)jdata->num_launched, (int)jdata->num_procs); + } + } + break; + case ORTE_JOB_STATE_NEVER_LAUNCHED: + orte_never_launched = true; + jdata->num_terminated = jdata->num_procs; + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_procs_in_job(jdata, jobstate, + ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED, + exit_code); + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_COMM_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* order all local procs for this job to be killed */ + killprocs(jdata->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + check_job_complete(jdata); /* set the local proc states */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + default: + break; + } + return ORTE_SUCCESS; + } + + /* get the job object */ + if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { + /* if the orteds are terminating, check job complete */ + if (orte_orteds_term_ordered) { + opal_output(0, "TERM ORDERED - CHECKING COMPLETE"); + check_job_complete(NULL); + return ORTE_SUCCESS; + } else { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + } + +#if OPAL_ENABLE_FT_CR + /* Notify the process state to the notifier framework if it is + active and selected. 
*/ + orte_errmgr_base_proc_state_notify(state, proc); +#endif + + /* update is for a specific proc */ + switch (state) { + case ORTE_PROC_STATE_ABORTED: + case ORTE_PROC_STATE_ABORTED_BY_SIG: + case ORTE_PROC_STATE_TERM_WO_SYNC: + if( jdata->enable_recovery ) { + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + + if (ORTE_PROC_STATE_ABORTED_BY_SIG == state) { + exit_code = 0; + } + + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_FAILED_TO_START: + case ORTE_PROC_STATE_CALLED_ABORT: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_REGISTERED: + case ORTE_PROC_STATE_RUNNING: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_LAUNCHED: + /* record the pid for this child */ + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + break; + + case ORTE_PROC_STATE_TERMINATED: + case ORTE_PROC_STATE_TERM_NON_ZERO: + case ORTE_PROC_STATE_KILLED_BY_CMD: + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); + break; + + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (jdata->enable_recovery) { + killprocs(proc->jobid, proc->vpid, proc->epoch); + /* is this a local proc */ + if (NULL != (child = proc_is_local(proc))) { + /* local proc - see if it has reached its restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, child->app_idx); + if (child->restarts < app->max_restarts) { + child->restarts++; + if (ORTE_SUCCESS == (rc = orte_odls.restart_proc(child))) { + return ORTE_SUCCESS; + } + /* reset the child's state as restart_proc would + * have cleared it + */ + child->state = state; + /* see if we can relocate it somewhere else */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* let it fall thru to abort */ + } + } else { + /* this is a remote process - see if we can relocate it */ + if (ORTE_SUCCESS == hnp_relocate(jdata, proc, state, exit_code)) { + return ORTE_SUCCESS; + } + /* guess not - let it fall thru to abort */ + } + } + /* kill all jobs */ + orte_errmgr_hnpresil_update_proc(jdata, proc, state, pid, exit_code); + check_job_complete(jdata); /* need to set the job state */ + /* the job object for this job will have been NULL'd + * in the array if the job was solely local. 
If it isn't + * NULL, then we need to tell everyone else to die + */ + if (NULL != (jdata = orte_get_job_data_object(proc->jobid))) { + hnp_abort(jdata->jobid, exit_code); + } + break; + + case ORTE_PROC_STATE_COMM_FAILED: + /* is this to a daemon? */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* if this is my own connection, ignore it */ + if (ORTE_PROC_MY_NAME->vpid == proc->vpid) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s My own connection - ignoring it", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + break; + } + /* if we have ordered orteds to terminate, record it */ + if (orte_orteds_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Daemons terminating - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process + */ + check_job_complete(jdata); + break; + } + /* if abort is in progress, see if this one failed to tell + * us it had terminated + */ + if (orte_abnormal_term_ordered) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s Abort in progress - recording daemon %s as gone", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); + /* remove from dependent routes, if it is one */ + orte_routed.route_lost(proc); + /* update daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process + */ + check_job_complete(jdata); + break; + } + + /* delete the route */ + orte_routed.delete_route(proc); + /* purge the oob */ + orte_rml.purge(proc); + + if( orte_enable_recovery ) { + /* relocate its processes */ + if (ORTE_SUCCESS != (rc = hnp_relocate(jdata, proc, state, 
exit_code))) { + /* unable to relocate for some reason */ + opal_output(0, "%s UNABLE TO RELOCATE PROCS FROM FAILED DAEMON %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + /* check if all is complete so we can terminate */ + check_job_complete(jdata); + } + } else { + if (ORTE_SUCCESS != orte_errmgr_hnpresil_record_dead_process(proc)) { + /* The process is already dead so don't keep trying to do + * this stuff. */ + return ORTE_SUCCESS; + } + /* We'll check if the job was complete when we get the + * message back from the HNP notifying us of the dead + * process */ + } + } + break; + + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + /* heartbeats are only from daemons */ + if( orte_enable_recovery ) { + /* relocate its processes */ + } else { + orte_errmgr_hnpresil_record_dead_process(proc); + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* kill all jobs */ + hnp_abort(ORTE_JOBID_WILDCARD, exit_code); + return ORTE_ERR_UNRECOVERABLE; + } + break; + + default: + break; + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_base_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_post_startup(void) { + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_pre_shutdown(void) { + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { + orte_std_cntr_t n; + int ret = ORTE_SUCCESS, num_failed; + opal_pointer_array_t *dead_names; + int32_t i; + orte_process_name_t *name_item; + orte_epoch_t epoch; + orte_job_t *jdat; + orte_proc_t *pdat, *pdat2; + opal_buffer_t *answer; + orte_daemon_cmd_flag_t command; + + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP received process failed from orted 
%s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(sender)); + } + + n = 1; + /* Get the number of failed procs */ + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + return ret; + } + + dead_names = OBJ_NEW(opal_pointer_array_t); + + for (i = 0; i < num_failed; i++) { + name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); + + /* Unpack the buffer to get the dead process' name. */ + n = 1; + if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + return ret; + } + + /* Check to see if the message is telling us about an old epoch. + * If so ignore the message. + */ + epoch = orte_util_lookup_epoch(name_item); + if (name_item->epoch < epoch) { + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP ignoring duplicate notification for %s failure (reported epoch: %s local epoch: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item), + ORTE_EPOCH_PRINT(name_item->epoch), + ORTE_EPOCH_PRINT(epoch)); + } + free(name_item); + continue; + } else { + if (orte_debug_daemons_flag) { + opal_output(0, "%s errmgr:hnp HNP received notification for %s failure (reported epoch: %s local epoch: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item), + ORTE_EPOCH_PRINT(name_item->epoch), + ORTE_EPOCH_PRINT(epoch)); + } + } + + opal_pointer_array_add(dead_names, name_item); + + /* Check to see if the message is telling us about an orted and + * it is from another orted. Orteds don't have the list of all + * the application processes so they don't know if there were + * any child processes on the nodes that they are reporting. 
*/ + if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, sender, ORTE_PROC_MY_NAME)) { + if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { + continue; + } else if (NULL == (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid))) { + continue; + } else if (NULL == pdat->node) { + continue; + } + + if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { + for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { + if (NULL == (pdat2 = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { + continue; + } + + /* ignore this process if it has already terminated */ + if (ORTE_PROC_STATE_TERMINATED <= pdat2->state) { + continue; + } + + /* the proc must have been alive, so notify everyone that it died */ + name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); + + name_item->jobid = pdat2->name.jobid; + name_item->vpid = pdat2->name.vpid; + name_item->epoch = orte_util_lookup_epoch(&(pdat2->name)); + + opal_pointer_array_add(dead_names, name_item); + } + } + } + + } + + /* Update the number of failed process so any duplicates don't get + * re-reported. + */ + num_failed = opal_pointer_array_get_size(dead_names); + + if (num_failed > 0) { + orte_errmgr.mark_processes_as_dead(dead_names); + + if (!orte_orteds_term_ordered) { + /* Send a message out to all the orteds to inform them that the + * process is dead. Long live the process (or not if it is so + * decided)! 
+ */ + answer = OBJ_NEW(opal_buffer_t); + command = ORTE_PROCESS_FAILED_NOTIFICATION; + + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &num_failed, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + for (i = 0; i < opal_pointer_array_get_size(dead_names); i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + } + } + + if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, answer, ORTE_RML_TAG_DAEMON))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + return ret; + } + + /* Tell the applications' ORTE layers that there is a failure. */ + if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { + return ret; + } + } + + for (i = 0; i < num_failed; i++) { + name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); + free(name_item); + } + } + + OBJ_RELEASE(dead_names); + + return ret; +} + +/***************** + * Local Functions + *****************/ +static void hnp_abort(orte_jobid_t job, orte_exit_code_t exit_code) +{ + int rc; + + /* if we are already in progress, then ignore this call */ + if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: abort in progress, ignoring abort on job %s with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + return; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: abort called on job %s with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + + /* if debuggers are running, clean up */ + 
orte_debugger.finalize(); + + /* set control params to indicate we are terminating */ + orte_job_term_ordered = true; + orte_abnormal_term_ordered = true; + orte_enable_recovery = false; + + /* set the exit status, just in case whomever called us failed + * to do so - it can only be done once, so we are protected + * from overwriting it + */ + ORTE_UPDATE_EXIT_STATUS(exit_code); + + /* tell the plm to terminate the orteds - they will automatically + * kill their local procs + */ + if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) { + ORTE_ERROR_LOG(rc); + } +} + +static void failed_start(orte_job_t *jdata) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat; + orte_odls_child_t *child; + orte_proc_t *proc; + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == jdata->jobid) { + break; + } + } + if (NULL == jobdat) { + /* race condition - may not have been formed yet */ + return; + } + jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; + + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = (orte_odls_child_t*)item; + if (child->name->jobid == jobdat->jobid) { + if (ORTE_PROC_STATE_LAUNCHED > child->state || + ORTE_PROC_STATE_UNTERMINATED < child->state) { + /* get the master proc object */ + proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + proc->state = child->state; + proc->exit_code = child->exit_code; + /* update the counter so we can terminate */ + jdata->num_terminated++; + /* remove the child from our list */ + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + jobdat->num_local_procs--; + } + } + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported incomplete start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); +} + +static void update_local_procs_in_job(orte_job_t *jdata, orte_job_state_t jobstate, + orte_proc_state_t state, orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat; + orte_odls_child_t *child; + orte_proc_t *proc; + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == jdata->jobid) { + break; + } + } + if (NULL == jobdat) { + /* race condition - may not have been formed yet */ + return; + } + jobdat->state = jobstate; + jdata->state = jobstate; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = (orte_odls_child_t*)item; + if (jdata->jobid == child->name->jobid) { + child->state = state; + proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + proc->state = state; + if (proc->exit_code < exit_code) { + proc->exit_code = exit_code; + } + if (ORTE_PROC_STATE_UNTERMINATED < state) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + jdata->num_terminated++; + jobdat->num_local_procs--; + } else if (ORTE_PROC_STATE_RUNNING) { + jdata->num_launched++; + } else if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } + } + } +} + +void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_proc_t *proct; + orte_odls_job_t *jobdat, *jdat; + int i; + + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jdat = (orte_odls_job_t*)item; + if (jdat->jobid == jdata->jobid) { + jobdat = jdat; + break; + } + } + if (NULL == jobdat) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + } + + /*** UPDATE LOCAL CHILD ***/ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + next = opal_list_get_next(item); + child = 
(orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid) { + if (child->name->vpid == proc->vpid) { + child->state = state; + if (0 < pid) { + child->pid = pid; + } + child->exit_code = exit_code; + proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, child->name->vpid); + proct->state = state; + if (0 < pid) { + proct->pid = pid; + } + proct->exit_code = exit_code; + if (ORTE_PROC_STATE_UNTERMINATED < state) { + if (!jdata->enable_recovery) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + if (NULL != jobdat) { + jobdat->num_local_procs--; + } + } + jdata->num_terminated++; + } else if (ORTE_PROC_STATE_RUNNING == state) { + jdata->num_launched++; + if (jdata->num_launched == jdata->num_procs) { + jdata->state = ORTE_JOB_STATE_RUNNING; + } + } else if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } + return; + } + } + } + + /*** UPDATE REMOTE CHILD ***/ + for (i=0; i < jdata->procs->size; i++) { + if (NULL == (proct = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + if (proct->name.jobid != proc->jobid || + proct->name.vpid != proc->vpid) { + continue; + } + proct->state = state; + if (0 < pid) { + proct->pid = pid; + } + proct->exit_code = exit_code; + if (ORTE_PROC_STATE_REGISTERED == state) { + jdata->num_reported++; + if (jdata->dyn_spawn_active && + jdata->num_reported == jdata->num_procs) { + OPAL_RELEASE_THREAD(&jdata->dyn_spawn_lock, + &jdata->dyn_spawn_cond, + &jdata->dyn_spawn_active); + } + } else if (ORTE_PROC_STATE_UNTERMINATED < state) { + /* update the counter so we can terminate */ + jdata->num_terminated++; + } else if (ORTE_PROC_STATE_RUNNING == state) { + jdata->num_launched++; + if (jdata->num_launched == jdata->num_procs) { + jdata->state = 
ORTE_JOB_STATE_RUNNING; + } + } + return; + } +} + +static void check_job_complete(orte_job_t *jdata) +{ + orte_proc_t *proc; + int i; + orte_std_cntr_t j; + orte_job_t *job; + orte_node_t *node; + orte_job_map_t *map; + orte_std_cntr_t index; + bool one_still_alive; + orte_vpid_t non_zero=0, lowest=0; + char *msg; + +#if 0 + /* Check if FileM is active. If so then keep processing. */ + OPAL_ACQUIRE_THREAD(&orte_filem_base_lock, &orte_filem_base_cond, &orte_filem_base_is_active); +#endif + if (NULL == jdata) { + /* just check to see if the daemons are complete */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_complete - received NULL job, checking daemons", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + goto CHECK_DAEMONS; + } + + for (i=0; i < jdata->procs->size && !jdata->abort; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + /* the proc array may no longer be left justified, so + * we need to check everything + */ + continue; + } + + if (0 != proc->exit_code) { + non_zero++; + if (0 == lowest) { + lowest = proc->exit_code; + } + } + + switch (proc->state) { + case ORTE_PROC_STATE_KILLED_BY_CMD: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s killed by cmd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + /* we ordered this proc to die, so it isn't an abnormal termination + * and we don't flag it as such - just check the remaining jobs to + * see if anyone is still alive + */ + if (jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated - now we need to check to see if ALL + * the other jobs have also completed and wakeup if that is true + */ + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_KILLED_BY_CMD; + } + } + goto CHECK_ALIVE; + break; + case ORTE_PROC_STATE_ABORTED: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted", + 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_FAILED_TO_START: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr_hnpresil:check_job_completed proc %s failed to start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_FAILED_TO_START; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; +#if 0 + case ORTE_PROC_STATE_ABORTED_BY_SIG: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s aborted by signal", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; +#endif + case ORTE_PROC_STATE_TERM_WO_SYNC: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated without sync", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + /* now treat a 
special case - if the proc exit'd without a required + * sync, it may have done so with a zero exit code. We want to ensure + * that the user realizes there was an error, so in this -one- case, + * we overwrite the process' exit code with the default error code + */ + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + } + break; + case ORTE_PROC_STATE_COMM_FAILED: +#if 0 + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_COMM_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } +#endif + break; + case ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_CALLED_ABORT: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_CALLED_ABORT; + /* point to the first proc to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_HEARTBEAT_FAILED: + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + break; + case ORTE_PROC_STATE_TERM_NON_ZERO: + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + if (orte_abort_non_zero_exit) { + if (!jdata->abort) { + jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM; + /* point to the lowest rank to cause the problem */ + 
jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + } + } + break; + + default: + if (ORTE_PROC_STATE_UNTERMINATED < proc->state && + jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed proc %s terminated and continuous", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name))); + if (!jdata->abort) { + proc->state = ORTE_PROC_STATE_ABORTED; + jdata->state = ORTE_JOB_STATE_ABORTED; + /* point to the lowest rank to cause the problem */ + jdata->aborted_proc = proc; + /* retain the object so it doesn't get free'd */ + OBJ_RETAIN(proc); + jdata->abort = true; + ORTE_UPDATE_EXIT_STATUS(proc->exit_code); + } + } + break; + } + } + + if (jdata->abort) { + /* the job aborted - turn off any sensors on this job */ + orte_sensor.stop(jdata->jobid); + } + + if (ORTE_JOB_STATE_UNTERMINATED > jdata->state && + jdata->num_terminated >= jdata->num_procs) { + /* this job has terminated */ + jdata->state = ORTE_JOB_STATE_TERMINATED; + + /* turn off any sensor monitors on this job */ + orte_sensor.stop(jdata->jobid); + + if (0 < non_zero) { + if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) { + /* update the exit code */ + ORTE_UPDATE_EXIT_STATUS(lowest); + } + + /* warn user */ + opal_output(orte_clean_output, + "-------------------------------------------------------\n" + "While %s job %s terminated normally, %s %s. Further examination may be required.\n" + "-------------------------------------------------------", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child", + (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid), + ORTE_VPID_PRINT(non_zero), + (1 == non_zero) ? "process returned\na non-zero exit code." 
: "processes returned\nnon-zero exit codes."); + } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed declared job %s normally terminated - checking all jobs", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid))); + } + + /* if this job is a continuously operating one, then don't do + * anything further - just return here + */ + if (NULL != jdata && + (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || + ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls)) { + goto CHECK_ALIVE; + } + + /* if the job that is being checked is the HNP, then we are + * trying to terminate the orteds. In that situation, we + * do -not- check all jobs - we simply notify the HNP + * that the orteds are complete. Also check special case + * if jdata is NULL - we want + * to definitely declare the job done if the orteds + * have completed, no matter what else may be happening. + * This can happen if a ctrl-c hits in the "wrong" place + * while launching + */ +CHECK_DAEMONS: + if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) { + if ((jdata->num_procs - 1) <= jdata->num_terminated) { /* Subtract one for the HNP */ + /* orteds are done! */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + if (NULL == jdata) { + jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); + } + jdata->state = ORTE_JOB_STATE_TERMINATED; + orte_quit(); + return; + } + return; + } + + /* Release the resources used by this job. Since some errmgrs may want + * to continue using resources allocated to the job as part of their + * fault recovery procedure, we only do this once the job is "complete". + * Note that an aborted/killed job -is- flagged as complete and will + * therefore have its resources released. 
We need to do this after + * we call the errmgr so that any attempt to restart the job will + * avoid doing so in the exact same place as the current job + */ + if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) { + map = jdata->map; + for (index = 0; index < map->nodes->size; index++) { + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) { + continue; + } + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s releasing procs from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + node->name)); + for (i = 0; i < node->procs->size; i++) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + if (proc->name.jobid != jdata->jobid) { + /* skip procs from another job */ + continue; + } + node->slots_inuse--; + node->num_procs--; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s releasing proc %s from node %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), node->name)); + /* set the entry in the node array to NULL */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* release the proc once for the map entry */ + OBJ_RELEASE(proc); + } + } + OBJ_RELEASE(map); + jdata->map = NULL; + } + +CHECK_ALIVE: + /* now check to see if all jobs are done - release this jdata + * object when we find it + */ + one_still_alive = false; + for (j=1; j < orte_job_data->size; j++) { + if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) { + /* since we are releasing jdata objects as we + * go, we can no longer assume that the job_data + * array is left justified + */ + continue; + } + /* if this is the job we are checking AND it normally terminated, + * then go ahead and release it. 
We cannot release it if it + * abnormally terminated as mpirun needs the info so it can + * report appropriately to the user + * + * NOTE: do not release the primary job (j=1) so we + * can pretty-print completion message + */ + if (NULL != jdata && job->jobid == jdata->jobid && + (jdata->state == ORTE_JOB_STATE_TERMINATED || + jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD)) { + /* release this object, ensuring that the + * pointer array internal accounting + * is maintained! + */ + if (1 < j) { + opal_pointer_array_set_item(orte_job_data, j, NULL); /* ensure the array has a NULL */ + OBJ_RELEASE(jdata); + } + continue; + } + /* if the job is flagged to not be monitored, skip it */ + if (ORTE_JOB_CONTROL_DO_NOT_MONITOR & job->controls) { + continue; + } + /* when checking for job termination, we must be sure to NOT check + * our own job as it - rather obviously - has NOT terminated! + */ + if (job->num_terminated < job->num_procs) { + /* we have at least one job that is not done yet - we cannot + * just return, though, as we need to ensure we cleanout the + * job data for the job that just completed + */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed job %s is not terminated (%d:%d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs)); + one_still_alive = true; + } + else { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed job %s is terminated (%d vs %d [%s])", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs, + (NULL == jdata) ? 
"UNKNOWN" : orte_job_state_to_str(jdata->state) )); + } + } + /* if a job is still alive, we just return */ + if (one_still_alive) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed at least one job is not terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + /* if we get here, then all jobs are done, so terminate */ + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:hnp:check_job_completed all jobs terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* set the exit status to 0 - this will only happen if it + * wasn't already set by an error condition + */ + ORTE_UPDATE_EXIT_STATUS(0); + /* provide a notifier message if that framework is active - ignored otherwise */ + if (NULL != (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, 1))) { + if (NULL == job->name) { + job->name = strdup(orte_process_info.nodename); + } + if (NULL == job->instance) { + asprintf(&job->instance, "%d", orte_process_info.pid); + } + if (0 == orte_exit_status) { + asprintf(&msg, "Job %s:%s complete", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_INFO, 0, msg); + } else { + asprintf(&msg, "Job %s:%s terminated abnormally", job->name, job->instance); + orte_notifier.log(ORTE_NOTIFIER_ALERT, orte_exit_status, msg); + } + free(msg); + /* this job object will be release during finalize */ + } + + orte_jobs_complete(); + /* if I am the only daemon alive, then I can exit now */ + if (0 == orte_routed.num_routes()) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s orteds complete - exiting", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + orte_quit(); + } +} + +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +{ + opal_pointer_array_t cmd; + orte_proc_t proc; + int rc; + + /* stop local sensors for this job */ + if (ORTE_VPID_WILDCARD == vpid) { + orte_sensor.stop(job); + } + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) 
{ + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { + ORTE_ERROR_LOG(rc); + } + return; + } + + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); + OBJ_CONSTRUCT(&proc, orte_proc_t); + proc.name.jobid = job; + proc.name.vpid = vpid; + proc.name.epoch = epoch; + opal_pointer_array_add(&cmd, &proc); + if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { + ORTE_ERROR_LOG(rc); + } + OBJ_DESTRUCT(&cmd); + OBJ_DESTRUCT(&proc); +} + +static int hnp_relocate(orte_job_t *jdata, orte_process_name_t *proc, + orte_proc_state_t state, orte_exit_code_t exit_code) +{ + orte_job_t *jdat; + orte_proc_t *pdata, *pdt, *pdt2; + orte_node_t *node, *nd; + orte_app_context_t *app; + char *app_name; + int rc, i, n; + + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s CHECKING ON RELOCATE FOR APP %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + /* get the proc_t object for this process */ + pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); + if (NULL == pdata) { + opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc)); + return ORTE_ERR_NOT_FOUND; + } + + /* set the state */ + pdata->state = state; + + /* retain the node id */ + node = pdata->node; + + /* if it is a daemon that died, we need to flag all of its procs + * to be relocated + */ + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + /* remove this proc from the daemon job */ + orte_errmgr_hnpresil_record_dead_process(proc); + /* check to see if any other nodes are "alive" */ + if (!orte_hnp_is_allocated && jdata->num_procs == 1) { + return ORTE_ERR_FATAL; + } + app_name = "orted"; + /* scan the procs looking for each unique jobid on the node */ + for (i=0; i < node->procs->size; i++) { + if (NULL == (pdt = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) { + continue; + } + /* get the job data object for this process */ + if (NULL == (jdat = orte_get_job_data_object(pdt->name.jobid))) { + /* major problem */ + 
ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + continue; + } + /* since the node was used in this job's map, release + * it so that accounting is maintained + */ + OBJ_RELEASE(node); + /* mark this proc as dead so it will be restarted */ + pdt->state = ORTE_PROC_STATE_ABORTED; + /* remove this proc from the node */ + OBJ_RELEASE(pdt); /* maintains accounting */ + opal_pointer_array_set_item(node->procs, i, NULL); + /* maintain accounting on num procs alive in case this can't restart */ + jdat->num_terminated++; + /* look for all other procs on this node from the same job */ + for (n=0; n < node->procs->size; n++) { + if (NULL == (pdt2 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) { + continue; + } + if (pdt2->name.jobid == pdt->name.jobid) { + /* mark this proc as having aborted */ + pdt2->state = ORTE_PROC_STATE_ABORTED; + /* remove it from the node */ + OBJ_RELEASE(pdt2); + opal_pointer_array_set_item(node->procs, n, NULL); + /* maintain accounting on num procs alive */ + jdat->num_terminated++; + } + } + /* and remove the node from the map */ + for (n=0; n < jdat->map->nodes->size; n++) { + if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(jdat->map->nodes, n))) { + continue; + } + if (nd->index == node->index) { + opal_pointer_array_set_item(jdat->map->nodes, n, NULL); + OBJ_RELEASE(node); /* maintain accounting */ + break; + } + } + /* reset the job params for this job */ + orte_plm_base_reset_job(jdat); + + /* relaunch the job */ + opal_output(0, "%s RELOCATING APPS FOR JOB %s FROM NODE %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdat->jobid), node->name); + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdat))) { + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); + return rc; + } + } + + return ORTE_SUCCESS; + } + + /* otherwise, we are an app - try to relocate us to another node */ + app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pdata->app_idx); + if (NULL == app) { + /* 
no way to restart this job */ + orte_show_help("help-orte-errmgr-hnp.txt", "errmgr-hnp:cannot-relocate", true, + ORTE_NAME_PRINT(proc)); + return ORTE_ERR_NOT_FOUND; + } + app_name = app->app; + /* track that we are attempting to restart */ + pdata->restarts++; + /* have we exceeded the number of restarts for this proc? */ + if (app->max_restarts < pdata->restarts) { + return ORTE_ERR_RESTART_LIMIT_EXCEEDED; + } + + /* reset the job params for restart */ + orte_plm_base_reset_job(jdata); + + /* flag the current node as not-to-be-used */ + pdata->node->state = ORTE_NODE_STATE_DO_NOT_USE; + + /* restart the job - the spawn function will remap and + * launch the replacement proc(s) + */ + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s RELOCATING APP %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) { + opal_output(0, "FAILED TO RESTART APP %s on error %s", app_name, ORTE_ERROR_NAME(rc)); + return rc; + } + + return ORTE_SUCCESS; +} + +static orte_odls_child_t* proc_is_local(orte_process_name_t *proc) +{ + orte_odls_child_t *child; + opal_list_item_t *item; + + child = NULL; + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { + return child; + } + } + return NULL; +} + +static void cbfunc(int status, + orte_process_name_t *peer, + opal_buffer_t *buffer, + orte_rml_tag_t tag, + void* cbdata) { + OBJ_RELEASE(buffer); +} + +int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc) { + orte_job_t *jdat; + orte_proc_t *pdat; + opal_buffer_t *buffer; + orte_daemon_cmd_flag_t command; + int i, rc, num_failed; + opal_pointer_array_t *dead_names; + orte_process_name_t *name_item; + orte_proc_t *proc_item; + + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s RECORDING DEAD 
PROCESS %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); + + if (NULL == (jdat = orte_get_job_data_object(proc->jobid))) { + opal_output(0, "Can't find job object"); + return ORTE_ERR_NOT_FOUND; + } + + if (NULL != (pdat = (orte_proc_t*)opal_pointer_array_get_item(jdat->procs, proc->vpid)) && + ORTE_PROC_STATE_TERMINATED < pdat->state) { + + /* Make sure that the epochs match. */ + if (proc->epoch != pdat->name.epoch) { + opal_output(1, "The epoch does not match the current epoch. Throwing the request out."); + return ORTE_SUCCESS; + } + + dead_names = OBJ_NEW(opal_pointer_array_t); + + if (ORTE_PROC_MY_NAME->jobid == proc->jobid) { + opal_pointer_array_add(dead_names, &(pdat->name)); + + for (i = 0; i < opal_pointer_array_get_size(pdat->node->procs); i++) { + if (NULL == (proc_item = (orte_proc_t *) opal_pointer_array_get_item(pdat->node->procs, i))) { + continue; + } + + opal_pointer_array_add(dead_names, &(proc_item->name)); + } + } + + if (!orte_orteds_term_ordered) { + /* + * Send a message to the other daemons so they know that a daemon has + * died. + */ + buffer = OBJ_NEW(opal_buffer_t); + command = ORTE_PROCESS_FAILED_NOTIFICATION; + + num_failed = opal_pointer_array_get_size(dead_names); + + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + } else { + + /* Iterate of the list of dead procs and send them along with + * the rest. The HNP needs this info so it can tell the other + * ORTEDs and they can inform the appropriate applications. 
+ */ + for (i = 0; i < num_failed; i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + } + } + } + + OBJ_RELEASE(dead_names); + + OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base.output, + "%s SENDING DEAD PROCESS MESSAGE TO HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0, cbfunc, NULL); + } + } else { + orte_errmgr_hnpresil_global_mark_processes_as_dead(dead_names); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs) { + int i; + orte_process_name_t *name_item; + orte_job_t *jdat; + orte_proc_t *pdat; + orte_node_t *node; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "HNP %s marking procs as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + /* Iterate over the list of processes */ + for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { + if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { + opal_output(1, "NULL found in dead process list."); + continue; + } + + if (NULL == (jdat = orte_get_job_data_object(name_item->jobid))) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s Job data not found.", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_ERR_NOT_FOUND; + } + + if (NULL != (pdat = (orte_proc_t *) opal_pointer_array_get_item(jdat->procs, name_item->vpid)) && + pdat->state < ORTE_PROC_STATE_TERMINATED) { + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "HNP %s marking %s as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&pdat->name))); + + /* Make sure the epochs match, if not it probably means that we + * already reported this failure. 
*/ + if (name_item->epoch != pdat->name.epoch) { + continue; + } + + orte_util_set_epoch(name_item, name_item->epoch + 1); + + /* Remove it from the job array */ + opal_pointer_array_set_item(jdat->procs, name_item->vpid, NULL); + orte_process_info.num_procs--; + jdat->num_procs--; + + /* Check if this is an ORTED */ + if (ORTE_PROC_MY_NAME->jobid == name_item->jobid) { + /* Mark the node as down so it won't be used in mapping anymore. */ + node = pdat->node; + node->state = ORTE_NODE_STATE_DOWN; + node->daemon = NULL; + } + + OBJ_RELEASE(pdat); + + /* Create a new proc object that will keep track of the epoch + * information */ + pdat = OBJ_NEW(orte_proc_t); + pdat->name.jobid = jdat->jobid; + pdat->name.vpid = name_item->vpid; + pdat->name.epoch = name_item->epoch + 1; + + /* Set the state as terminated so we'll know the process isn't + * actually there. */ + pdat->state = ORTE_PROC_STATE_TERMINATED; + + opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); + jdat->num_procs++; + jdat->num_terminated++; + } else { + opal_output(0, "Proc data not found for %s", ORTE_NAME_PRINT(name_item)); + /* Create a new proc object that will keep track of the epoch + * information */ + pdat = OBJ_NEW(orte_proc_t); + pdat->name.jobid = jdat->jobid; + pdat->name.vpid = name_item->vpid; + pdat->name.epoch = name_item->epoch + 1; + + /* Set the state as terminated so we'll know the process isn't + * actually there. */ + pdat->state = ORTE_PROC_STATE_TERMINATED; + + opal_pointer_array_set_item(jdat->procs, name_item->vpid, pdat); + jdat->num_procs++; + jdat->num_terminated++; + } + + check_job_complete(jdat); + } + + if (!orte_orteds_term_ordered) { + /* Need to update the orted routing module. 
*/ + orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); + + if (NULL != fault_cbfunc) { + (*fault_cbfunc)(dead_procs); + } + } + + return ORTE_SUCCESS; +} + +int send_to_local_applications(opal_pointer_array_t *dead_names) { + opal_buffer_t *buf; + int ret = ORTE_SUCCESS; + orte_process_name_t *name_item; + int size, i; + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s Sending failure to local applications.", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + buf = OBJ_NEW(opal_buffer_t); + + size = opal_pointer_array_get_size(dead_names); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + for (i = 0; i < size; i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + } + } + + if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + OBJ_RELEASE(buf); + + return ret; +} diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h new file mode 100644 index 0000000000..d9ac6ddcc6 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_HNPRESIL_EXPORT_H +#define MCA_ERRMGR_HNPRESIL_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ +struct orte_errmgr_hnpresil_component_t { + orte_errmgr_base_component_t super; /** Base Errmgr component */ + + bool ignore_current_update; + bool term_in_progress; + +#if OPAL_ENABLE_FT_CR + /* State of the Recovery */ + bool crmig_in_progress; + bool autor_in_progress; + + /* CRMig Options */ + bool crmig_enabled; + bool crmig_timing_enabled; + + /* AutoR Options */ + bool autor_enabled; + bool autor_timing_enabled; + int autor_recovery_delay; + bool autor_skip_oldnode; +#endif +}; +typedef struct orte_errmgr_hnpresil_component_t orte_errmgr_hnpresil_component_t; +OPAL_MODULE_DECLSPEC extern orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component; + +int orte_errmgr_hnpresil_component_query(mca_base_module_t **module, int *priority); + +void orte_errmgr_hnpresil_update_proc(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +/*************************** + * Module functions: Global + ***************************/ +int orte_errmgr_hnpresil_global_module_init(void); +int orte_errmgr_hnpresil_global_module_finalize(void); + +int orte_errmgr_hnpresil_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); +int orte_errmgr_hnpresil_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_global_ft_event(int state); +int orte_errmgr_hnpresil_global_post_startup(void); +int 
orte_errmgr_hnpresil_global_pre_shutdown(void); +int orte_errmgr_hnpresil_global_mark_processes_as_dead(opal_pointer_array_t *dead_procs); +int orte_errmgr_hnpresil_global_failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); +int orte_errmgr_hnpresil_record_dead_process(orte_process_name_t *proc); + +/* hnpresil Versions */ +int orte_errmgr_hnpresil_base_global_init(void); +int orte_errmgr_hnpresil_base_global_finalize(void); +int orte_errmgr_hnpresil_base_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_base_global_ft_event(int state); + +#if OPAL_ENABLE_FT_CR +/* CRMig Versions */ +int orte_errmgr_hnpresil_crmig_global_module_init(void); +int orte_errmgr_hnpresil_crmig_global_module_finalize(void); + +int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); +int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_crmig_global_ft_event(int state); + +/* AutoR Versions */ +int orte_errmgr_hnpresil_autor_global_module_init(void); +int orte_errmgr_hnpresil_autor_global_module_finalize(void); + +int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); +int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +int orte_errmgr_hnpresil_autor_global_ft_event(int state); +#endif + +END_C_DECLS + +#endif /* 
MCA_ERRMGR_HNPRESIL_EXPORT_H */ diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c new file mode 100644 index 0000000000..6a51b7f239 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_autor.c @@ -0,0 +1,1033 @@ +/* + * Copyright (c) 2009-2011 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/show_help.h" +#include "opal/util/output.h" +#include "opal/util/opal_environ.h" +#include "opal/util/basename.h" +#include "opal/util/argv.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/crs/crs.h" +#include "opal/mca/crs/base/base.h" +#include "opal/mca/event/event.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" +#include "opal/dss/dss.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/filem/filem.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/snapc/snapc.h" +#include "orte/mca/snapc/base/base.h" +#include "orte/mca/sstore/sstore.h" +#include "orte/mca/sstore/base/base.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include 
"orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +#include MCA_timer_IMPLEMENTATION_HEADER + +#if OPAL_ENABLE_FT_CR +/************************ + * Work Pool structures + ************************/ +struct errmgr_autor_wp_item_t { + /** List super object */ + opal_list_item_t super; + + /** ORTE Process name */ + orte_process_name_t name; + + /** State that was passed with it */ + orte_proc_state_t state; +}; +typedef struct errmgr_autor_wp_item_t errmgr_autor_wp_item_t; + +OBJ_CLASS_DECLARATION(errmgr_autor_wp_item_t); + +void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp); +void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp); + +OBJ_CLASS_INSTANCE(errmgr_autor_wp_item_t, + opal_list_item_t, + errmgr_autor_wp_item_construct, + errmgr_autor_wp_item_destruct); + +/************************************ + * Locally Global vars & functions :) + ************************************/ +static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; +static orte_job_t *current_global_jobdata = NULL; + +static bool autor_mask_faults = false; + +static opal_list_t *procs_pending_recovery = NULL; +static bool autor_timer_active = false; +static opal_event_t *autor_timer_event = NULL; + +static void errmgr_autor_recover_processes(int fd, short event, void *cbdata); +static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name); + +static int display_procs(void ); +static int autor_procs_sort_compare_fn(opal_list_item_t **a, + opal_list_item_t **b); + +static int orte_errmgr_hnpresil_autor_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state); +static void errmgr_autor_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); +static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); + +static int check_if_terminated(opal_pointer_array_t *procs); +static int 
check_if_restarted(opal_pointer_array_t *procs); + +/* + * Timer stuff + */ +static void errmgr_autor_set_time(int idx); +static void errmgr_autor_display_all_timers(void); +static void errmgr_autor_clear_timers(void); + +static double errmgr_autor_get_time(void); +static void errmgr_autor_display_indv_timer_core(double diff, char *str); +static double timer_start[OPAL_CR_TIMER_MAX]; + +#define ERRMGR_AUTOR_TIMER_START 0 +#define ERRMGR_AUTOR_TIMER_SETUP 1 +#define ERRMGR_AUTOR_TIMER_TERM 2 +#define ERRMGR_AUTOR_TIMER_RESETUP 3 +#define ERRMGR_AUTOR_TIMER_RESTART 4 +#define ERRMGR_AUTOR_TIMER_FINISH 5 +#define ERRMGR_AUTOR_TIMER_MAX 6 + +#define ERRMGR_AUTOR_CLEAR_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_clear_timers(); \ + } \ + } + +#define ERRMGR_AUTOR_SET_TIMER(idx) \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_set_time(idx); \ + } \ + } + +#define ERRMGR_AUTOR_DISPLAY_ALL_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.autor_timing_enabled > 0)) { \ + errmgr_autor_display_all_timers(); \ + } \ + } + +/************************ + * Function Definitions: Global + ************************/ +int orte_errmgr_hnpresil_autor_global_module_init(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):init()"); + + procs_pending_recovery = OBJ_NEW(opal_list_t); + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + if( NULL == autor_timer_event ) { + autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL); + } + + ERRMGR_AUTOR_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_module_finalize(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):finalize()"); + + if( NULL != procs_pending_recovery ) { + 
OBJ_RELEASE(procs_pending_recovery); + procs_pending_recovery = NULL; + } + if( NULL != autor_timer_event ) { + free(autor_timer_event); + autor_timer_event = NULL; + } + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + ERRMGR_AUTOR_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +static int autor_set_current_job_info(orte_job_t *given_jdata, orte_process_name_t *proc_name) +{ + orte_job_t *jdata = NULL; + int i; + + /* + * If we already figured it out, then just move ahead + */ + if( NULL != current_global_jobdata ) { + if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid && + given_jdata->jobid != current_global_jobdata->jobid ) { + current_global_jobdata = given_jdata; + current_global_jobid = given_jdata->jobid; + } + return ORTE_SUCCESS; + } + + /* + * If this references the application, and not the daemons + */ + if( given_jdata->jobid != ORTE_PROC_MY_NAME->jobid ) { + current_global_jobdata = given_jdata; + current_global_jobid = given_jdata->jobid; + return ORTE_SUCCESS; + } + + /* + * Otherwise iterate through the job structure and find the first job. 
+ */ + for(i = 0; i < orte_job_data->size; ++i ) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { + continue; + } + /* Exclude outselves */ + if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) { + continue; + } + current_global_jobdata = jdata; + current_global_jobid = jdata->jobid; + break; + } + + if( NULL == current_global_jobdata ) { + opal_output(0, "errmgr:hnp(autor):process_fault(): Global) Error: Cannot find the jdata for the current job."); + return ORTE_ERROR; + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_proc_t *loc_proc = NULL; + orte_job_t *jdata = NULL; + int ret = ORTE_SUCCESS, exit_status = ORTE_SUCCESS; + int32_t i; + + /* + * if orte is trying to shutdown, just let it + */ + if( mca_errmgr_hnpresil_component.term_in_progress ) { + return ORTE_SUCCESS; + } + + if( NULL != proc_name && + OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): Update reported on self (%s), state %s. Skip...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state) )); + return ORTE_SUCCESS; + } + + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(autor):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? 
"NULL" : ORTE_NAME_PRINT(proc_name) ); + ret = ORTE_ERROR; + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * If this is a tool, ignore + */ + if( jdata->num_apps == 0 && + OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): An external tool disconnected. Ignore...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(autor): job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state), exit_code)); + + if( ORTE_JOB_STATE_RESTART == jobstate ) { + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + break; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_process_fault(jdata, &(loc_proc->name), state)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + else if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || + ORTE_PROC_STATE_COMM_FAILED == state ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_autor_global_process_fault(jdata, proc_name, state)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + } + else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { + if( autor_mask_faults ) { + mca_errmgr_hnpresil_component.ignore_current_update = true; + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, exit_code); + } + } + + cleanup: + return ret; +} + +static int orte_errmgr_hnpresil_autor_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state) +{ + int ret; + + /* + * Recover from the process failure by relaunching. 
+ */ + if( ORTE_SUCCESS != (ret = autor_set_current_job_info(jdata, proc_name)) ) { + ORTE_ERROR_LOG(ret); + return ORTE_SUCCESS; /* JJH: Do this for now. Need to fix the flag for normal shutdown */ + /*return ret;*/ + } + + current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; + + if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { + errmgr_autor_process_fault_daemon(jdata, proc_name, state); + } else { + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, 0); + errmgr_autor_process_fault_app(jdata, proc_name, state); + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + orte_node_t *node = NULL; + bool found = false; + int num_removed = 0, num_to_remove; + orte_ns_cmp_bitmask_t mask; + + if( NULL == current_global_jobdata ) { + return ORTE_SUCCESS; + } + + /* JJH Nasty Hack */ + num_to_remove = current_global_jobdata->num_procs / 2; + num_to_remove += 1; + + /* + * Find this process in the known failures list + */ + found = false; + if( mca_errmgr_hnpresil_component.autor_skip_oldnode ) { + for(item = opal_list_get_first(procs_pending_recovery); + item != opal_list_get_end(procs_pending_recovery); + item = opal_list_get_next(item) ) { + wp_item = (errmgr_autor_wp_item_t*)item; + + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &wp_item->name, &proc->name)) { + found = true; + break; + } + } + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): suggest_map() " + "Process remapping: %s oldnode %s, %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&proc->name), + oldnode->name, + (found ? "Failed Proc." 
: "Good Proc.") )); + + /* + * If not a failed process, then return it to the oldnode + * If failed process, do not place it back on the same node + */ + num_removed = 0; + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + if( found ) { + if( num_removed >= num_to_remove ) { + break; + } + /* JJH Nasty Hack */ +#if 0 + /* Remove oldnode (if more than one node) */ + if( node == oldnode && 1 < opal_list_get_size(node_list) ) { + opal_output(0, "JJH Remove Node (%s)", node->name); + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } +#else + if( 1 < opal_list_get_size(node_list) ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } +#endif + num_removed++; + } else { + /* Stay on same node */ + if( node != oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + } + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_autor_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + + +/***************** + * Local Functions + *****************/ +static void errmgr_autor_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + errmgr_autor_wp_item_t *wp_item = NULL; + struct timeval soon; + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault() " + "Process fault! proc %s (0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + state)); + + if( !orte_sstore_base_is_checkpoint_available ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault() " + "No checkpoints are available for this job! 
Cannot Automaticly Recover!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); + opal_show_help("help-orte-errmgr-hnp.txt", "autor_failed_to_recover_proc", true, + ORTE_NAME_PRINT(proc), proc->vpid); + return; + } + + mca_errmgr_hnpresil_component.ignore_current_update = true; + + /* + * If we are already in the shutdown stage of the recovery, then just skip it + */ + if( autor_mask_faults ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):process_fault() " + "Currently recovering the job. Failure masked!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return; + } + + /* + * Append this process to the list to process + */ + wp_item = OBJ_NEW(errmgr_autor_wp_item_t); + wp_item->name.jobid = proc->jobid; + wp_item->name.vpid = proc->vpid; + wp_item->name.epoch = proc->epoch; + wp_item->state = state; + + opal_list_append(procs_pending_recovery, &(wp_item->super)); + + /* + * Activate the timer, if it is not already setup + */ + if( !autor_timer_active ) { + autor_timer_active = true; + + opal_event_evtimer_set(opal_event_base, autor_timer_event, errmgr_autor_recover_processes, NULL); + soon.tv_sec = mca_errmgr_hnpresil_component.autor_recovery_delay; + soon.tv_usec = 0; + opal_event_evtimer_add(autor_timer_event, &soon); + } + + return; +} + +static void errmgr_autor_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + orte_proc_t *loc_proc = NULL, *child_proc = NULL; + orte_std_cntr_t i_proc; + int32_t i; + + OPAL_OUTPUT_VERBOSE((15, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor): process_fault_daemon() " + "------- Daemon fault reported! 
proc %s (0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), + state)); + + /* + * Set the process state in the job data structure + */ + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + + if( loc_proc->name.vpid != proc->vpid) { + continue; + } + + loc_proc->state = state; + + break; + } + + /* + * Remove the route to this process + */ + orte_routed.delete_route(proc); + + /* + * If the aborted daemon had active processes on its node, then we should + * make sure to signal that all the children are gone. + */ + if( loc_proc->node->num_procs > 0 ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s errmgr:base: stabalize_runtime() " + "------- Daemon lost with the following processes", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) { + child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc); + if( NULL == child_proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "%s errmgr:base: stabalize_runtime() " + "\t %s [0x%x]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child_proc->name), + child_proc->state)); + + if( child_proc->last_errmgr_state < child_proc->state ) { + child_proc->last_errmgr_state = child_proc->state; + orte_errmgr.update_state(child_proc->name.jobid, ORTE_JOB_STATE_COMM_FAILED, + &(child_proc->name), ORTE_PROC_STATE_COMM_FAILED, + 0, 1); + } + } + } else { + /* This daemon had no children, so just mask the failure */ + mca_errmgr_hnpresil_component.ignore_current_update = true; + } + + /* + * Record the dead daemon + */ + orte_errmgr_hnpresil_record_dead_process(proc); + + return; +} + +void errmgr_autor_wp_item_construct(errmgr_autor_wp_item_t *wp) +{ + wp->name.jobid = ORTE_JOBID_INVALID; + wp->name.vpid = ORTE_VPID_INVALID; + wp->name.epoch = ORTE_EPOCH_MIN; + + 
wp->state = 0; +} + +void errmgr_autor_wp_item_destruct(errmgr_autor_wp_item_t *wp) +{ + wp->name.jobid = ORTE_JOBID_INVALID; + wp->name.vpid = ORTE_VPID_INVALID; + wp->name.epoch = ORTE_EPOCH_INVALID; + + wp->state = 0; +} + +static int display_procs(void ) +{ + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + char *proc_str = NULL; + char *tmp_str = NULL; + + for(item = opal_list_get_first(procs_pending_recovery); + item != opal_list_get_end(procs_pending_recovery); + item = opal_list_get_next(item) ) { + wp_item = (errmgr_autor_wp_item_t*)item; + + if( NULL == proc_str ) { + asprintf(&proc_str, "\t%s Rank %d\n", + ORTE_NAME_PRINT(&(wp_item->name)), + (int)wp_item->name.vpid); + } else { + tmp_str = strdup(proc_str); + free(proc_str); + proc_str = NULL; + asprintf(&proc_str, "%s\t%s Rank %d\n", + tmp_str, + ORTE_NAME_PRINT(&(wp_item->name)), + (int)wp_item->name.vpid); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovering_job", true, + proc_str); + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + if( NULL != proc_str ) { + free(proc_str); + proc_str = NULL; + } + + return ORTE_SUCCESS; +} + +static int autor_procs_sort_compare_fn(opal_list_item_t **a, + opal_list_item_t **b) +{ + errmgr_autor_wp_item_t *wp_a, *wp_b; + + wp_a = (errmgr_autor_wp_item_t*)(*a); + wp_b = (errmgr_autor_wp_item_t*)(*b); + + if( wp_a->name.vpid > wp_b->name.vpid ) { + return 1; + } + else if( wp_a->name.vpid == wp_b->name.vpid ) { + return 0; + } + else { + return -1; + } +} + +static void errmgr_autor_recover_processes(int fd, short event, void *cbdata) +{ + int ret, exit_status = ORTE_SUCCESS; + opal_list_item_t *item = NULL; + errmgr_autor_wp_item_t *wp_item = NULL; + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + orte_sstore_base_global_snapshot_info_t *snapshot = NULL; + char * tmp_str = NULL; + + autor_mask_faults = true; + ERRMGR_AUTOR_CLEAR_TIMERS(); + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_START); + + /* 
+ * Display the processes that are to be recovered + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Display known failed processes in the job %s -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + + opal_list_sort(procs_pending_recovery, autor_procs_sort_compare_fn); + display_procs(); + + /* + * Find the latest checkpoint + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Find the latest checkpoint for the job %s -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + + snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t); + if( ORTE_SUCCESS != (ret = orte_sstore.request_global_snapshot_data(&orte_sstore_handle_last_stable, snapshot)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_SETUP); + + /* + * Safely terminate the entire job + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Safely terminate the job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + if( proc->state < ORTE_PROC_STATE_UNTERMINATED ) { + proc->state = ORTE_PROC_STATE_MIGRATING; + } + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + orte_iof.close(&(proc->name), ORTE_IOF_STDIN); + } + } + + orte_plm.terminate_procs(current_global_jobdata->procs); + + /* + * Wait for the job to terminate all processes + */ + while(!check_if_terminated(current_global_jobdata->procs) ) { + opal_progress(); + } + + 
ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_TERM); + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Done waiting for termination of job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + current_global_jobdata->num_terminated = current_global_jobdata->num_procs; + orte_plm_base_reset_job(current_global_jobdata); + + /* + * Construct the app contexts to restart + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "%s errmgr:hnp(autor):recover() " + "------- Rebuild job %s app context -------", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(current_global_jobdata->jobid))); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, + proc, + &(snapshot->local_snapshots))) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tAdjusted: \"%s\" [0x%d] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESETUP); + + /* + * Spawn the restarted job + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Respawning the job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + orte_snapc_base_has_recovered = false; + autor_mask_faults = false; /* Failures pass this point are worth noting */ + orte_plm.spawn(current_global_jobdata); + + /* + * Wait for all the processes to restart + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Waiting for restart 
-------"); + while(!check_if_restarted(current_global_jobdata->procs) ) { + opal_progress(); + } + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_RESTART); + + /* + * All done + */ + while( !orte_snapc_base_has_recovered ) { + opal_progress(); + } + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(autor):recover() " + "------- Finished recovering job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + opal_show_help("help-orte-errmgr-hnp.txt", "autor_recovery_complete", true); + + ERRMGR_AUTOR_SET_TIMER(ERRMGR_AUTOR_TIMER_FINISH); + + cleanup: + while(NULL != (item = opal_list_remove_first(procs_pending_recovery))) { + wp_item = (errmgr_autor_wp_item_t*)item; + OBJ_RELEASE(wp_item); + } + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + ERRMGR_AUTOR_DISPLAY_ALL_TIMERS(); + + autor_timer_active = false; + autor_mask_faults = false; + + return; +} + +static int check_if_terminated(opal_pointer_array_t *procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + if( NULL == procs ){ + return true; + } + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->state < ORTE_PROC_STATE_UNTERMINATED || + proc->state == ORTE_PROC_STATE_MIGRATING ) { + is_done = false; + break; + } + } + + if( !is_done ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for termination: \"%s\" [0x%x] < [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_UNTERMINATED)); + } + + return is_done; +} + +static int check_if_restarted(opal_pointer_array_t *procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + if( NULL == procs ){ + return true; + } + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(procs); ++i_proc) { + proc 
= (orte_proc_t*)opal_pointer_array_get_item(procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { + is_done = false; + break; + } + } + + if( !is_done ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for restart: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); + } + + return is_done; +} + +/************************ + * Timing + ************************/ +static void errmgr_autor_set_time(int idx) +{ + if(idx < ERRMGR_AUTOR_TIMER_MAX ) { + if( timer_start[idx] <= 0.0 ) { + timer_start[idx] = errmgr_autor_get_time(); + } + } +} + +static void errmgr_autor_display_all_timers(void) +{ + double diff = 0.0; + char * label = NULL; + + opal_output(0, "Auto. Recovery Timing: ******************** Summary Begin\n"); + + /********** Structure Setup **********/ + label = strdup("Setup"); + diff = timer_start[ERRMGR_AUTOR_TIMER_SETUP] - timer_start[ERRMGR_AUTOR_TIMER_START]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Termination **********/ + label = strdup("Terminate"); + diff = timer_start[ERRMGR_AUTOR_TIMER_TERM] - timer_start[ERRMGR_AUTOR_TIMER_SETUP]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Setup new job **********/ + label = strdup("Setup Relaunch"); + diff = timer_start[ERRMGR_AUTOR_TIMER_RESETUP] - timer_start[ERRMGR_AUTOR_TIMER_TERM]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Restart **********/ + label = strdup("Restart"); + diff = timer_start[ERRMGR_AUTOR_TIMER_RESTART] - timer_start[ERRMGR_AUTOR_TIMER_RESETUP]; + errmgr_autor_display_indv_timer_core(diff, label); + free(label); + + /********** Finish **********/ + label = strdup("Finalize"); + diff = timer_start[ERRMGR_AUTOR_TIMER_FINISH] - timer_start[ERRMGR_AUTOR_TIMER_RESTART]; + errmgr_autor_display_indv_timer_core(diff, label); + 
free(label); + + opal_output(0, "Auto. Recovery Timing: ******************** Summary End\n"); +} + +static void errmgr_autor_clear_timers(void) +{ + int i; + for(i = 0; i < ERRMGR_AUTOR_TIMER_MAX; ++i) { + timer_start[i] = 0.0; + } +} + +static double errmgr_autor_get_time(void) +{ + double wtime; + +#if OPAL_TIMER_USEC_NATIVE + wtime = (double)opal_timer_base_get_usec() / 1000000.0; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wtime = tv.tv_sec; + wtime += (double)tv.tv_usec / 1000000.0; +#endif + + return wtime; +} + +static void errmgr_autor_display_indv_timer_core(double diff, char *str) +{ + double total = 0; + double perc = 0; + + total = timer_start[ERRMGR_AUTOR_TIMER_MAX-1] - timer_start[ERRMGR_AUTOR_TIMER_START]; + perc = (diff/total) * 100; + + opal_output(0, + "errmgr_autor: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", + str, + diff, + total, + perc); + return; +} + +#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c new file mode 100644 index 0000000000..d96654d739 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_component.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" +#include "errmgr_hnpresil.h" + +/* + * Public string for version number + */ +const char *orte_errmgr_hnpresil_component_version_string = + "ORTE ERRMGR hnpresil MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int orte_errmgr_hnpresil_open(void); +static int orte_errmgr_hnpresil_close(void); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_errmgr_hnpresil_component_t mca_errmgr_hnpresil_component = { + /* First do the base component stuff */ + { + /* Handle the general mca_component_t struct containing + * meta information about the component hnp + */ + { + ORTE_ERRMGR_BASE_VERSION_3_0_0, + /* Component name and version */ + "hnpresil", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + orte_errmgr_hnpresil_open, + orte_errmgr_hnpresil_close, + orte_errmgr_hnpresil_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + /* Verbosity level */ + 0, + /* opal_output handler */ + -1, + /* Default priority */ + 0 + } +}; + +static int orte_errmgr_hnpresil_open(void) +{ + int val; + + /* + * This should be the last componet to ever get used since + * it doesn't do anything. 
+ */ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "priority", + "Priority of the ERRMGR hnp component", + false, false, + mca_errmgr_hnpresil_component.super.priority, + &mca_errmgr_hnpresil_component.super.priority); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "verbose", + "Verbose level for the ERRMGR hnp component", + false, false, + mca_errmgr_hnpresil_component.super.verbose, + &mca_errmgr_hnpresil_component.super.verbose); + /* If there is a custom verbose level for this component than use it + * otherwise take our parents level and output channel + */ + if ( 0 != mca_errmgr_hnpresil_component.super.verbose) { + mca_errmgr_hnpresil_component.super.output_handle = opal_output_open(NULL); + opal_output_set_verbosity(mca_errmgr_hnpresil_component.super.output_handle, + mca_errmgr_hnpresil_component.super.verbose); + } else { + mca_errmgr_hnpresil_component.super.output_handle = orte_errmgr_base.output; + } + +#if OPAL_ENABLE_FT_CR + /**************************** + * CRMig (C/R Process Migration) MCA Options + ****************************/ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "crmig_timing", + "Enable Process Migration timer", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "crmig_enable", + "Enable Process Migration (Default: 0/off)", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.crmig_enabled = OPAL_INT_TO_BOOL(val); + + /**************************** + * AutoR (Automatic Recovery) MCA Options + ****************************/ + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_timing", + "Enable Automatic Recovery timer", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val); + + 
mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_enable", + "Enable Automatic Recovery (Default: 0/off)", + false, false, + 0, &val); + mca_errmgr_hnpresil_component.autor_enabled = OPAL_INT_TO_BOOL(val); + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_recovery_delay", + "Number of seconds to wait before starting to recover the job after a failure" + " [Default: 1 sec]", + false, false, + 1, &val); + mca_errmgr_hnpresil_component.autor_recovery_delay = val; + + mca_base_param_reg_int(&mca_errmgr_hnpresil_component.super.base_version, + "autor_skip_oldnode", + "Skip the old node from failed proc, even if it is still available" + " [Default: Enabled]", + false, false, + 1, &val); + mca_errmgr_hnpresil_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val); +#else + val = 0; /* Silence compiler warning */ +#endif /* OPAL_ENABLE_FT_CR */ + + /* + * Debug Output + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open()"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: priority = %d", + mca_errmgr_hnpresil_component.super.priority); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: verbosity = %d", + mca_errmgr_hnpresil_component.super.verbose); +#if OPAL_ENABLE_FT_CR + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: --- CR Migration Options ---"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: Process Migration = %s", + (mca_errmgr_hnpresil_component.crmig_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: timing = %s", + (mca_errmgr_hnpresil_component.crmig_timing_enabled ? 
"Enabled" : "Disabled")); + + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: --- Auto. Recovery Options ---"); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: Auto. Recover = %s", + (mca_errmgr_hnpresil_component.autor_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: timing = %s", + (mca_errmgr_hnpresil_component.autor_timing_enabled ? "Enabled" : "Disabled")); + opal_output_verbose(20, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: open: recover_delay = %d", + mca_errmgr_hnpresil_component.autor_recovery_delay); + + mca_errmgr_hnpresil_component.crmig_in_progress = false; + mca_errmgr_hnpresil_component.autor_in_progress = false; + mca_errmgr_hnpresil_component.term_in_progress = false; +#endif /* OPAL_ENABLE_FT_CR */ + + return ORTE_SUCCESS; +} + +static int orte_errmgr_hnpresil_close(void) +{ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp: close()"); + + return ORTE_SUCCESS; +} diff --git a/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c new file mode 100644 index 0000000000..e56c451649 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/errmgr_hnpresil_crmig.c @@ -0,0 +1,1517 @@ +/* + * Copyright (c) 2009-2010 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/show_help.h" +#include "opal/util/output.h" +#include "opal/util/opal_environ.h" +#include "opal/util/basename.h" +#include "opal/util/argv.h" +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/mca/crs/crs.h" +#include "opal/mca/crs/base/base.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/runtime/orte_globals.h" +#include "opal/dss/dss.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/rml/rml_types.h" +#include "orte/mca/iof/iof.h" +#include "orte/mca/plm/plm.h" +#include "orte/mca/plm/base/base.h" +#include "orte/mca/plm/base/plm_private.h" +#include "orte/mca/filem/filem.h" +#include "orte/mca/grpcomm/grpcomm.h" +#include "orte/runtime/orte_wait.h" +#include "orte/mca/rmaps/rmaps_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/snapc/snapc.h" +#include "orte/mca/snapc/base/base.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_hnpresil.h" + +#include MCA_timer_IMPLEMENTATION_HEADER + +#if OPAL_ENABLE_FT_CR + +/************************************ + * Locally Global vars & functions :) + ************************************/ +static orte_jobid_t current_global_jobid = ORTE_JOBID_INVALID; +static orte_job_t *current_global_jobdata = NULL; + +static bool migrating_underway = false; +static bool migrating_terminated = false; +static bool migrating_restarted = false; + +static opal_list_t *current_onto_mapping_general = NULL; +static opal_list_t *current_onto_mapping_exclusive = NULL; + +/*** Command Line Interactions */ +static int current_migration_status = 
ORTE_ERRMGR_MIGRATE_STATE_NONE; + +static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_map); + +static int orte_errmgr_hnpresil_crmig_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state); +static void errmgr_crmig_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); +static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); + +static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs); +static int check_if_terminated(opal_pointer_array_t *migrating_procs); +static int check_if_restarted(opal_pointer_array_t *migrating_procs); + +static int check_and_pre_map(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum); + +static void display_request(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum); + +/* + * Timer stuff + */ +static void errmgr_crmig_set_time(int idx); +static void errmgr_crmig_display_all_timers(void); +static void errmgr_crmig_clear_timers(void); + +static double errmgr_crmig_get_time(void); +static void errmgr_crmig_display_indv_timer_core(double diff, char *str); +static double timer_start[OPAL_CR_TIMER_MAX]; + +#define ERRMGR_CRMIG_TIMER_START 0 +#define ERRMGR_CRMIG_TIMER_SETUP 1 +#define ERRMGR_CRMIG_TIMER_CKPT 2 +#define ERRMGR_CRMIG_TIMER_TERM 3 +#define ERRMGR_CRMIG_TIMER_RESETUP 4 +#define ERRMGR_CRMIG_TIMER_RESTART 5 +#define ERRMGR_CRMIG_TIMER_FINISH 6 +#define ERRMGR_CRMIG_TIMER_MAX 7 + +#define ERRMGR_CRMIG_CLEAR_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_clear_timers(); \ + } \ + } + +#define ERRMGR_CRMIG_SET_TIMER(idx) \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_set_time(idx); \ + } \ + } + +#define 
ERRMGR_CRMIG_DISPLAY_ALL_TIMERS() \ + { \ + if(OPAL_UNLIKELY(mca_errmgr_hnpresil_component.crmig_timing_enabled > 0)) { \ + errmgr_crmig_display_all_timers(); \ + } \ + } + +/************************ + * Function Definitions: Global + ************************/ +int orte_errmgr_hnpresil_crmig_global_module_init(void) +{ + int ret; + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig): init()"); + + migrating_underway = false; + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + /* + * Initialize the connection to the orte-migrate tool + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_init()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + + ERRMGR_CRMIG_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_module_finalize(void) +{ + int ret; + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig): finalize()"); + + /* + * Finalize the connection to the orte-migrate tool + */ + if( ORTE_SUCCESS != (ret = orte_errmgr_base_tool_finalize()) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + + migrating_underway = false; + + current_global_jobid = ORTE_JOBID_INVALID; + current_global_jobdata = NULL; + + ERRMGR_CRMIG_CLEAR_TIMERS(); + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ + int ret, exit_status = ORTE_SUCCESS; + orte_job_t *jdata = NULL; + int i; + + /* + * JJH: RETURN HERE + * If we are already migrating, then reject this request + */ + if( migrating_underway ) { + ; + } + + /* + * Determine the jobid for this migration + * JJH: Assumes only one job active at any one time + */ + for(i = 0; i < orte_job_data->size; ++i ) { + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { + continue; + } + /* Exclude outselves */ + if( jdata->jobid == ORTE_PROC_MY_NAME->jobid ) 
{ + continue; + } + current_global_jobdata = jdata; + current_global_jobid = jdata->jobid; + break; + } + if( NULL == current_global_jobdata ) { + opal_output(0, "errmgr:hnp(crmig):predicted_fault(): Global) Error: Cannot find the jdata for the current job."); + ORTE_ERROR_LOG(ORTE_ERROR); + return ORTE_ERROR; + } + current_global_jobdata->controls |= ORTE_JOB_CONTROL_RECOVERABLE; + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_REQUEST; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /************************* + * Kick off the migration + *************************/ + if( ORTE_SUCCESS != (ret = errmgr_crmig_global_migrate(proc_list, node_list, suggested_map)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /************************ + * Set up the Command Line listener again + *************************/ + if( ORTE_ERRMGR_MIGRATE_STATE_ERROR != current_migration_status ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(ORTE_ERRMGR_MIGRATE_STATE_NONE)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrated_job", true); + } + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_NONE; + + cleanup: + return exit_status; +} + +int orte_errmgr_hnpresil_crmig_global_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + orte_job_t *jdata = NULL; + int ret = ORTE_SUCCESS; + + /* + * if orte is trying to shutdown, just let it + */ + if( mca_errmgr_hnpresil_component.term_in_progress ) { + return ORTE_SUCCESS; + } + + /* + * Get the job data object for this process + */ + if( NULL != proc_name ) { /* Get job from proc's jobid */ + jdata = orte_get_job_data_object(proc_name->jobid); + } else { /* Get from the general job */ + jdata = 
orte_get_job_data_object(job); + } + if( NULL == jdata ) { + opal_output(0, "%s errmgr:hnp(crmig):update_state() Error: Cannot find job %s for Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name) ); + ret = ORTE_ERROR; + ORTE_ERROR_LOG(ret); + return ret; + } + + /* + * If this is a tool, ignore + */ + if( jdata->num_apps == 0 && + OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, proc_name) ) { + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(crmig): An external tool disconnected. Ignore...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp(crmig): job %s reported state %s" + " for proc %s state %s exit_code %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), + orte_job_state_to_str(jobstate), + (NULL == proc_name) ? "NULL" : ORTE_NAME_PRINT(proc_name), + orte_proc_state_to_str(state), exit_code)); + + if( ORTE_PROC_STATE_ABORTED_BY_SIG == state || + ORTE_PROC_STATE_COMM_FAILED == state ) { + if( ORTE_SUCCESS != (ret = orte_errmgr_hnpresil_crmig_global_process_fault(jdata, proc_name, state)) ) { + ORTE_ERROR_LOG(ret); + return ret; + } + } + else if( ORTE_PROC_STATE_KILLED_BY_CMD == state ) { + if( migrating_underway ) { + /* If we are migrating, then we need to mask this to prevent the lower level from terminating us */ + mca_errmgr_hnpresil_component.ignore_current_update = true; + orte_errmgr_hnpresil_update_proc(jdata, proc_name, state, 0, exit_code); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_hnpresil_crmig_global_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + int exit_status = ORTE_SUCCESS; + opal_list_item_t *item = NULL, *m_item = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL, *current_proc_map = NULL; + orte_node_t *node = NULL; + bool found = false; + int 
num_suggested = 0; + orte_std_cntr_t i_proc; + orte_proc_t *peer_proc = NULL; + + /* + * If not migrating, then suggest nothing + */ + if( !migrating_underway ) { + return ORTE_SUCCESS; + } + + /* + * First look for an exclusive mapping for this process + */ + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + if( onto_map->proc_name.vpid == proc->name.vpid ) { + current_proc_map = onto_map; + break; + } + } + + /* + * If there is an exclusive mapping then... + */ + if( NULL != current_proc_map ) { + /* + * If we made an exclusive mapping during the check_and_pre_map() + * then honor it here. + */ + if( NULL != current_proc_map->pre_map_fixed_node ) { + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all other nodes */ + found = false; + + if( 0 == strncmp(node->name, current_proc_map->pre_map_fixed_node, + strlen(current_proc_map->pre_map_fixed_node)) ) { + found = true; + break; + } + if( !found ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Fixed use of node [%15s : %10s -> %10s (%10s)] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name, + current_proc_map->pre_map_fixed_node, node->name)); + } + } + + /* All done with mapping */ + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * If 'off_current_node' then exclude current node + */ + if( current_proc_map->off_current_node ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Remove old node (info) [%15s : %10s] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name)); + for( item = 
opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude the old node */ + if( node == oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + break; + } + } + } + + /* + * If 'map_proc_name' then map to the node where this process resides + * Note: Only do this if there was no 'other' node suggested. If there + * was an 'other' node suggested then we need to honor that before + * we honor the peer suggestion. + */ + if( ORTE_VPID_INVALID != current_proc_map->map_proc_name.vpid && + current_proc_map->proc_name.vpid != current_proc_map->map_proc_name.vpid && + NULL == current_proc_map->map_node_name ) { + /* + * Find the node containting the target process + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + peer_proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == peer_proc ) { + continue; + } + if( peer_proc->name.vpid == current_proc_map->map_proc_name.vpid ) { + current_proc_map->map_node_name = strdup(peer_proc->node->name); + break; + } + } + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Force use of node with proc [%15s -> %15s: %10s -> %10s] -------", + ORTE_NAME_PRINT(&proc->name), ORTE_NAME_PRINT(&peer_proc->name), + oldnode->name, current_proc_map->map_node_name)); + } + + /* + * If 'map_node_name' then use this node exclusively + */ + if( NULL != current_proc_map->map_node_name ) { + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all nodes not in the include list */ + found = false; + + if( 0 == strncmp(node->name, current_proc_map->map_node_name, strlen(current_proc_map->map_node_name)) ) { + found = true; + } + if( !found ) { + 
opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Force use of node [%15s : %10s -> %10s (%10s)] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name, + current_proc_map->map_node_name, node->name)); + } + } + + /* All done with mapping */ + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * Otherwise then map as if there was no exclusive mapping + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggesting as if non-exclusive [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + } + /* + * If no exclusive mapping (or exclusive did not yield any results) then... + */ + else { + /* + * Remove the old node from the list, if there are more than 1 nodes available + */ + if(1 < opal_list_get_size(node_list) ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Remove old node [%15s : %10s] -------", + ORTE_NAME_PRINT(&proc->name), oldnode->name)); + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude the old node */ + if( node == oldnode ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + break; + } + } + } + } + + /* + * If we do not have any general suggestions, then just return + */ + if( opal_list_get_size(current_onto_mapping_general) <= 0 ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- No suggestions for target [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + exit_status = ORTE_SUCCESS; + goto cleanup; + } + + /* + * Otherwise look through the general suggestions as an include list + 
*/ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggest a target for [%15s : 0x%x : %10s] -------", + ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + + num_suggested = 0; + for( item = opal_list_get_first(node_list); + item != opal_list_get_end(node_list); + item = opal_list_get_next(item) ) { + node = (orte_node_t*)item; + + /* Exclude all nodes not in the include list */ + found = false; + + for(m_item = opal_list_get_first(current_onto_mapping_general); + m_item != opal_list_get_end(current_onto_mapping_general); + m_item = opal_list_get_next(m_item) ) { + onto_map = (orte_errmgr_predicted_map_t*) m_item; + + if( 0 == strncmp(node->name, onto_map->map_node_name, strlen(onto_map->map_node_name)) ) { + found = true; + break; + } + } + if( !found ) { + opal_list_remove_item(node_list, item); + OBJ_RELEASE(item); + continue; + } + + ++num_suggested; + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggesting target %2d [%15s : 0x%x : %10s -> %10s] -------", + num_suggested, ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name, node->name)); + } + + cleanup: + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):suggest() ------- Suggested %2d nodes for [%15s : 0x%x : %10s] -------", + (int)opal_list_get_size(node_list), ORTE_NAME_PRINT(&proc->name), proc->state, oldnode->name)); + + return exit_status; +} + +int orte_errmgr_hnpresil_crmig_global_ft_event(int state) +{ + return ORTE_SUCCESS; +} + + +/************************ + * Function Definitions: Static + ************************/ +static int orte_errmgr_hnpresil_crmig_global_process_fault(orte_job_t *jdata, + orte_process_name_t *proc_name, + orte_proc_state_t state) +{ + /* + * JJH: Todo + * The expected logic here is: + * if( a daemon with children fails ) { + * abort migration. 
+ * } + * if( a daemon without children fails ) { + * continue. No processes lost + * } + * if( an application process fails ) { + * abort migration. Might be a bad checkpoint, or a process that we were + * not migrating that died. + * } + * else { + * continue; + * } + */ + if( proc_name->jobid == ORTE_PROC_MY_NAME->jobid ) { + errmgr_crmig_process_fault_daemon(jdata, proc_name, state); + } else { + errmgr_crmig_process_fault_app(jdata, proc_name, state); + } + + return ORTE_SUCCESS; +} + +static int errmgr_crmig_global_migrate(opal_list_t *off_procs, opal_list_t *off_nodes, opal_list_t *onto_maps) +{ + int ret, exit_status = ORTE_SUCCESS; + orte_std_cntr_t i_node; + orte_std_cntr_t i_proc; + orte_node_t *node = NULL; + orte_proc_t *proc = NULL; + bool found = false; + orte_snapc_base_quiesce_t *cur_datum = NULL; + bool close_iof_stdin = false; + orte_process_name_t iof_name = {ORTE_JOBID_INVALID, 0}; + char * err_str_procs = NULL; + char * err_str_nodes = NULL; + char * tmp_str = NULL; + orte_errmgr_predicted_proc_t *off_proc = NULL; + orte_errmgr_predicted_node_t *off_node = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL; + opal_list_item_t *item = NULL; + + ERRMGR_CRMIG_CLEAR_TIMERS(); + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_START); + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Migrating (%3d, %3d, %3d) -------", + (int)opal_list_get_size(off_procs), + (int)opal_list_get_size(off_nodes), + (int)opal_list_get_size(onto_maps))); + + /* + * Modeled after orte_plm_base_reset_job + */ + cur_datum = OBJ_NEW(orte_snapc_base_quiesce_t); + cur_datum->migrating = true; + migrating_underway = true; + mca_errmgr_hnpresil_component.crmig_in_progress = true; + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUNNING; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Check 
to make sure that the 'off' and 'onto' nodes exist + * - if 'onto' nodes do not, then add them (JJH XXX) + * - if 'off' nodes do not, then return an error (JJH XXX) + * JJH TODO... + */ + + /* + * Copy over the onto_nodes so we can suggest them later + */ + if( NULL != current_onto_mapping_general ) { + OBJ_RELEASE(current_onto_mapping_general); + current_onto_mapping_general = NULL; + } + if( NULL != current_onto_mapping_exclusive ) { + OBJ_RELEASE(current_onto_mapping_exclusive); + current_onto_mapping_exclusive = NULL; + } + current_onto_mapping_general = OBJ_NEW(opal_list_t); + current_onto_mapping_exclusive = OBJ_NEW(opal_list_t); + if( NULL != onto_maps ) { + while( NULL != (item = opal_list_remove_first(onto_maps)) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + /* Determine if process exclude mapping, or general */ + if( onto_map->proc_name.vpid == ORTE_VPID_INVALID ) { + opal_list_append(current_onto_mapping_general, item); + } else { + opal_list_append(current_onto_mapping_exclusive, item); + } + } + } + + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + /* + * Find the node currently containing this process + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == onto_map->proc_name.vpid) { + found = true; + break; + } + } + + /* + * Check to see if this process hsould be skipped + */ + if( !onto_map->off_current_node && + (ORTE_VPID_INVALID == onto_map->map_proc_name.vpid || + onto_map->proc_name.vpid == onto_map->map_proc_name.vpid ) && + (NULL == onto_map->map_node_name || + 0 == strncmp(onto_map->map_node_name, proc->node->name, strlen(proc->node->name))) ) { + 
OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Process %15s does not wish to move -------", + ORTE_NAME_PRINT(&proc->name))); + + } else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Process %15s will be moved -------", + ORTE_NAME_PRINT(&proc->name))); + /* + * Set the process to restarting + */ + proc->state = ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + + migrating_terminated = false; + migrating_restarted = false; + + /* + * Create a list of processes to migrate, if 'off_nodes' specified + */ + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + + /* + * Find the node in the job structure + * - Make sure that 'odin00' doesn't match all 'odin00*' + */ + found = false; + for(i_node = 0; i_node < opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { + node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); + if( NULL == node ) { + continue; + } + + if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { + found = true; + break; + } + } + if( !found ) { + ; /* Warn about invalid node */ + } else { + /* + * Add all processes from this node + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); + if( NULL == proc ) { + continue; + } + + /* + * Set the process to restarting + */ + proc->state = 
ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + } + + /* + * Create a list of processes to migrate, if 'off_procs' specified + */ + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + + /* + * Find the process in the job structure + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == off_proc->proc_name.vpid) { + found = true; + break; + } + } + /* + * Make sure the process is not listed multiple times + */ + if( found ) { + found = check_if_duplicate_proc(proc, &(cur_datum->migrating_procs)); + if( !found ) { + /* + * Set the process to restarting + */ + proc->state = ORTE_PROC_STATE_MIGRATING; + + opal_pointer_array_add(&(cur_datum->migrating_procs), (void*)proc); + OBJ_RETAIN(proc); + (cur_datum->num_migrating)++; + + if( current_global_jobdata->stdin_target == proc->name.vpid ) { + close_iof_stdin = true; + iof_name.jobid = proc->name.jobid; + iof_name.vpid = proc->name.vpid; + iof_name.epoch = proc->name.epoch; + } + } + } + } + + /* + * If we did not find any processes to migrate, then throw a warning, and skip it. 
+ */ + if( 0 >= cur_datum->num_migrating ) { + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + if( NULL != err_str_nodes ) { + asprintf(&tmp_str, "%s, %s", err_str_nodes, off_node->node_name); + free(err_str_nodes); + err_str_nodes = strdup(tmp_str); + free(tmp_str); + tmp_str = NULL; + } else { + asprintf(&err_str_nodes, "%s", off_node->node_name); + } + } + + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + if( NULL != err_str_procs ) { + asprintf(&tmp_str, "%s, %d", err_str_procs, (int)off_proc->proc_name.vpid); + free(err_str_procs); + err_str_procs = strdup(tmp_str); + free(tmp_str); + tmp_str = NULL; + } else { + asprintf(&err_str_procs, "%d", off_proc->proc_name.vpid); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_no_migrating_procs", true, + err_str_nodes, + err_str_procs); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_ERROR; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + goto cleanup; + } + + /* + * Final pass on the migration list to pre-map processes and remove + * processes that should not be migrated. + */ + if( ORTE_SUCCESS != (ret = check_and_pre_map(off_procs, off_nodes, cur_datum)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Display the request before processing it. 
+ */ + display_request(off_procs, off_nodes, cur_datum); + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_SETUP); + + /* + * Checkpoint the job + * - Hold all non-migrating processes + * - Abort the marked processes + * - + */ + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_RUN_CKPT; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Starting the checkpoint of job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + if( ORTE_SUCCESS != (ret = orte_snapc.start_ckpt(cur_datum)) ) { + opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to start the checkpoint."); + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_CKPT); + + /* + * Terminate the migrating processes + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Terminate old processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + orte_plm.terminate_procs(&cur_datum->migrating_procs); + + /* + * Clear the IOF stdin target if necessary + */ + if( close_iof_stdin ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Closing old STDIN target for job %s (%s)-------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid), + ORTE_NAME_PRINT(&iof_name) )); + + orte_iof.close(&iof_name, ORTE_IOF_STDIN); + } + + /* + * Wait for the processes to finish terminating + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Waiting for termination -------"); + + while( !migrating_terminated ) { + opal_progress(); + check_if_terminated(&(cur_datum->migrating_procs)); + } + + 
ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_TERM); + + /* + * Start remapping the processes + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Checkpoint finished, setting up job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_STARTUP; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * Reset the job parameters for restart + * This will set the state of the job to 'restart' + */ + orte_plm_base_reset_job(current_global_jobdata); + + /* + * Adjust the application context information + */ + for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); + if( NULL == proc ) { + continue; + } + + if( ORTE_SUCCESS != (ret = orte_errmgr_base_update_app_context_for_cr_recovery(current_global_jobdata, + proc, + &(cur_datum->ss_snapshot->local_snapshots))) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tAdjusted: \"%s\" [0x%d] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESETUP); + + /* + * Restart the job + * - spawn function will remap and launch the replacement proc(s) + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Respawning migrating processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + orte_plm.spawn(current_global_jobdata); + + + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Waiting for restart -------"); + + migrating_restarted = false; + 
while( !migrating_restarted ) { + opal_progress(); + check_if_restarted(&(cur_datum->migrating_procs)); + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_RESTART); + + /* + * Finish the checkpoint + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Reconnecting processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + if( ORTE_SUCCESS != (ret = orte_snapc.end_ckpt(cur_datum)) ) { + opal_output(0, "errmgr:hnp(crmig):migrate() Error: Unable to end the checkpoint."); + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + /* + * All done + */ + opal_output_verbose(10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() ------- Finished migrating processes in job %s -------", + ORTE_JOBID_PRINT(current_global_jobdata->jobid)); + + OBJ_RELEASE(cur_datum); + + current_migration_status = ORTE_ERRMGR_MIGRATE_STATE_FINISH; + if( ORTE_SUCCESS != (ret = orte_errmgr_base_migrate_update(current_migration_status)) ) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + + ERRMGR_CRMIG_SET_TIMER(ERRMGR_CRMIG_TIMER_FINISH); + ERRMGR_CRMIG_DISPLAY_ALL_TIMERS(); + + cleanup: + migrating_underway = false; + migrating_terminated = false; + migrating_restarted = false; + mca_errmgr_hnpresil_component.crmig_in_progress = false; + + if( NULL != err_str_procs ) { + free(err_str_procs); + err_str_procs = NULL; + } + + if( NULL != err_str_nodes ) { + free(err_str_nodes); + err_str_nodes = NULL; + } + + return exit_status; +} + +static bool check_if_duplicate_proc(orte_proc_t *proc, opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *loc_proc = NULL; + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + loc_proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == loc_proc ) { + continue; + } + if( loc_proc->name.vpid == proc->name.vpid ) { + 
return true; + } + } + + return false; +} + +static int check_if_terminated(opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( !(ORTE_PROC_STATE_KILLED_BY_CMD & proc->state) ) { + is_done = false; + break; + } + } + + if( is_done ) { + migrating_terminated = true; + } + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t Still waiting for termination: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_KILLED_BY_CMD)); + } + + return ORTE_SUCCESS; +} + +static int check_if_restarted(opal_pointer_array_t *migrating_procs) +{ + orte_std_cntr_t i_proc; + orte_proc_t *proc = NULL; + bool is_done; + + is_done = true; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(migrating_procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(migrating_procs, i_proc); + if( NULL == proc ) { + continue; + } + + /* proc->state != ORTE_PROC_STATE_LAUNCHED */ + if( !(ORTE_PROC_STATE_RUNNING & proc->state) ) { + is_done = false; + break; + } + } + + if( is_done ) { + migrating_restarted = true; + } + else { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\tStill waiting for restart: \"%s\" [0x%x] != [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, ORTE_PROC_STATE_RUNNING)); + } + + return ORTE_SUCCESS; +} + +static void errmgr_crmig_process_fault_app(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_app() " + "------- Application fault reported! proc %s (0x%x) " + "- %s", + ORTE_NAME_PRINT(proc), + state, + (migrating_underway ? 
"Migrating" : "Not Migrating") )); + + return; +} + +static void errmgr_crmig_process_fault_daemon(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_daemon() " + "------- Daemon fault reported! proc %s (0x%x) " + "- %s", + ORTE_NAME_PRINT(proc), + state, + (migrating_underway ? "Migrating" : "Not Migrating") )); + + /* + * Failed communication can be ignored for the most part. + * Make sure to remove the route + * JJH: Check to make sure this is not a new daemon loss. + */ + if( ORTE_PROC_STATE_COMM_FAILED == state ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):process_fault_daemon() " + "------- Daemon fault reported! proc %s (0x%x) " + "- Communication failure, keep going", + ORTE_NAME_PRINT(proc), + state )); + } + + return; +} + +static int check_and_pre_map(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum) +{ + /* + * Check the 'off_procs' list for processes that should not be migrated + */ + + /* + * Check the 'current_onto_mapping_exclusive' for processes that are moving + * 'near/with' other processes that are also moving. Be sure to watch out + * for circular deadlock. + */ + + /* + * Use the 'pre_map_fixed_node' structure to fix this process' mapping. 
+ */ + + return ORTE_SUCCESS; +} + +static void display_request(opal_list_t *off_procs, + opal_list_t *off_nodes, + orte_snapc_base_quiesce_t *cur_datum) +{ + orte_std_cntr_t i_node; + orte_std_cntr_t i_proc; + orte_node_t *node = NULL; + orte_proc_t *proc = NULL; + bool found = false; + char * status_str = NULL; + char * tmp_str = NULL; + orte_errmgr_predicted_proc_t *off_proc = NULL; + orte_errmgr_predicted_node_t *off_node = NULL; + orte_errmgr_predicted_map_t *onto_map = NULL; + opal_list_item_t *item = NULL; + + /* + * Display all requested processes to migrate + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Requested Processes to migrate: (%d procs)\n", + (int) opal_list_get_size(off_procs) )); + for(item = opal_list_get_first(off_procs); + item != opal_list_get_end(off_procs); + item = opal_list_get_next(item) ) { + off_proc = (orte_errmgr_predicted_proc_t*)item; + + /* + * Find the process in the job structure + */ + found = false; + for(i_proc = 0; i_proc < opal_pointer_array_get_size(current_global_jobdata->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(current_global_jobdata->procs, i_proc); + if( NULL == proc ) { + continue; + } + + if( proc->name.vpid == off_proc->proc_name.vpid) { + found = true; + break; + } + } + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t%s (Rank %3d) on node %s\n", + ORTE_NAME_PRINT(&proc->name), (int)off_proc->proc_name.vpid, proc->node->name)); + } + + /* + * Display Off Nodes + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Requested Nodes to migration: (%d nodes)\n", + (int)opal_list_get_size(off_nodes) )); + + for(item = opal_list_get_first(off_nodes); + item != opal_list_get_end(off_nodes); + item = opal_list_get_next(item) ) { + off_node = (orte_errmgr_predicted_node_t*)item; + + for(i_node = 0; i_node < 
opal_pointer_array_get_size(current_global_jobdata->map->nodes); ++i_node) { + node = (orte_node_t*)opal_pointer_array_get_item(current_global_jobdata->map->nodes, i_node); + if( NULL == node ) { + continue; + } + + found = false; + if( 0 == strncmp(node->name, off_node->node_name, strlen(off_node->node_name)) ) { + found = true; + break; + } + } + if( found ) { + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\" \t%d\n", + node->name, node->num_procs)); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(node->procs); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i_proc); + if( NULL == proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\t\"%s\" [0x%x]\n", + ORTE_NAME_PRINT(&proc->name), proc->state)); + } + } + } + + /* + * Suggested onto nodes + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto: (%d nodes)\n", + (int)opal_list_get_size(current_onto_mapping_general) )); + for(item = opal_list_get_first(current_onto_mapping_general); + item != opal_list_get_end(current_onto_mapping_general); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\"\n", + onto_map->map_node_name)); + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() Suggested nodes to migration onto (exclusive): (%d nodes)\n", + (int)opal_list_get_size(current_onto_mapping_exclusive) )); + for(item = opal_list_get_first(current_onto_mapping_exclusive); + item != opal_list_get_end(current_onto_mapping_exclusive); + item = opal_list_get_next(item) ) { + onto_map = (orte_errmgr_predicted_map_t*) item; + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + 
"\t%d\t(%c)\t\"%s\"\n", + onto_map->proc_name.vpid, + (onto_map->off_current_node ? 'T' : 'F'), + onto_map->map_node_name)); + } + + /* + * Display all processes scheduled to migrate + */ + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "errmgr:hnp(crmig):migrate() All Migrating Processes: (%d procs)\n", + cur_datum->num_migrating)); + for(i_proc = 0; i_proc < opal_pointer_array_get_size(&(cur_datum->migrating_procs)); ++i_proc) { + proc = (orte_proc_t*)opal_pointer_array_get_item(&(cur_datum->migrating_procs), i_proc); + if( NULL == proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, mca_errmgr_hnpresil_component.super.output_handle, + "\t\"%s\" [0x%x] [%s]\n", + ORTE_NAME_PRINT(&proc->name), proc->state, proc->node->name)); + + if( NULL == status_str ) { + asprintf(&status_str, "\t%s Rank %d on Node %s\n", + ORTE_NAME_PRINT(&proc->name), + (int)proc->name.vpid, + proc->node->name); + } else { + tmp_str = strdup(status_str); + free(status_str); + status_str = NULL; + asprintf(&status_str, "%s\t%s Rank %d on Node %s\n", + tmp_str, + ORTE_NAME_PRINT(&proc->name), + (int)proc->name.vpid, + proc->node->name); + } + } + + opal_show_help("help-orte-errmgr-hnp.txt", "crmig_migrating_job", true, + status_str); + + if( NULL != tmp_str ) { + free(tmp_str); + tmp_str = NULL; + } + + if( NULL != status_str ) { + free(status_str); + status_str = NULL; + } + + return; +} + +/************************ + * Timing + ************************/ +static void errmgr_crmig_set_time(int idx) +{ + if(idx < ERRMGR_CRMIG_TIMER_MAX ) { + if( timer_start[idx] <= 0.0 ) { + timer_start[idx] = errmgr_crmig_get_time(); + } + } +} + +static void errmgr_crmig_display_all_timers(void) +{ + double diff = 0.0; + char * label = NULL; + + opal_output(0, "Process Migration Timing: ******************** Summary Begin\n"); + + /********** Structure Setup **********/ + label = strdup("Setup"); + diff = timer_start[ERRMGR_CRMIG_TIMER_SETUP] - 
timer_start[ERRMGR_CRMIG_TIMER_START]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Checkpoint **********/ + label = strdup("Checkpoint"); + diff = timer_start[ERRMGR_CRMIG_TIMER_CKPT] - timer_start[ERRMGR_CRMIG_TIMER_SETUP]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Termination **********/ + label = strdup("Terminate"); + diff = timer_start[ERRMGR_CRMIG_TIMER_TERM] - timer_start[ERRMGR_CRMIG_TIMER_CKPT]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Setup new job **********/ + label = strdup("Setup Relaunch"); + diff = timer_start[ERRMGR_CRMIG_TIMER_RESETUP] - timer_start[ERRMGR_CRMIG_TIMER_TERM]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Restart **********/ + label = strdup("Restart"); + diff = timer_start[ERRMGR_CRMIG_TIMER_RESTART] - timer_start[ERRMGR_CRMIG_TIMER_RESETUP]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + /********** Finish **********/ + label = strdup("Finalize"); + diff = timer_start[ERRMGR_CRMIG_TIMER_FINISH] - timer_start[ERRMGR_CRMIG_TIMER_RESTART]; + errmgr_crmig_display_indv_timer_core(diff, label); + free(label); + + opal_output(0, "Process Migration Timing: ******************** Summary End\n"); +} + +static void errmgr_crmig_clear_timers(void) +{ + int i; + for(i = 0; i < ERRMGR_CRMIG_TIMER_MAX; ++i) { + timer_start[i] = 0.0; + } +} + +static double errmgr_crmig_get_time(void) +{ + double wtime; + +#if OPAL_TIMER_USEC_NATIVE + wtime = (double)opal_timer_base_get_usec() / 1000000.0; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wtime = tv.tv_sec; + wtime += (double)tv.tv_usec / 1000000.0; +#endif + + return wtime; +} + +static void errmgr_crmig_display_indv_timer_core(double diff, char *str) +{ + double total = 0; + double perc = 0; + + total = timer_start[ERRMGR_CRMIG_TIMER_MAX-1] - timer_start[ERRMGR_CRMIG_TIMER_START]; + perc = (diff/total) * 
100; + + opal_output(0, + "errmgr_crmig: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", + str, + diff, + total, + perc); + return; +} + +#endif /* OPAL_ENABLE_FT_CR */ diff --git a/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt b/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt new file mode 100644 index 0000000000..836e46f4b0 --- /dev/null +++ b/orte/mca/errmgr/hnpresil/help-orte-errmgr-hnp.txt @@ -0,0 +1,71 @@ + -*- text -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for ORTE Errmgr HNP module. +# +[errmgr-hnp:unknown-job-error] +An error has occurred in an unknown job. This generally should not happen +except due to an internal ORTE error. + +Job state: %s + +This information should probably be reported to the OMPI developers. +# +[errmgr-hnp:daemon-died] +The system has lost communication with the following daemon: + +Daemon: %s +Node: %s + +The reason for the lost communication channel is unknown. Possible +reasons include failure of the daemon itself, failure of the +connecting fabric/switch, and loss of the host node. Please +check with your system administrator to try and determine the +source of the problem. + +Your job is being terminated as a result. +# +[errmgr-hnp:cannot-relocate] +The system is unable to relocate the specified process: + +Process: %s + +because the application for that process could not be found. This +appears to be a system error. Please report it to the ORTE +developers. + +[autor_recovering_job] +Notice: The processes listed below failed unexpectedly. + Using the last checkpoint to recover the job. + Please standby. +%s +[autor_recovery_complete] +Notice: The job has been successfully recovered from the + last checkpoint. +[autor_failed_to_recover_proc] +Error: The process below has failed. 
There is no checkpoint available for + this job, so we are terminating the application since automatic + recovery cannot occur. +Internal Name: %s +MCW Rank: %d + +[crmig_migrating_job] +Notice: A migration of this job has been requested. + The processes below will be migrated. + Please standby. +%s +[crmig_migrated_job] +Notice: The processes have been successfully migrated to/from the specified + machines. +[crmig_no_migrating_procs] +Warning: Could not find any processes to migrate on the nodes specified. + You provided the following: +Nodes: %s +Procs: %s diff --git a/orte/mca/errmgr/orted/errmgr_orted.c b/orte/mca/errmgr/orted/errmgr_orted.c index b43bf56422..5826a2c58c 100644 --- a/orte/mca/errmgr/orted/errmgr_orted.c +++ b/orte/mca/errmgr/orted/errmgr_orted.c @@ -3,9 +3,6 @@ * All rights reserved. * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2004-2011 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -32,11 +29,9 @@ #include "orte/util/proc_info.h" #include "orte/util/session_dir.h" #include "orte/util/show_help.h" -#include "orte/util/nidmap.h" #include "orte/runtime/orte_globals.h" #include "orte/mca/rml/rml.h" #include "orte/mca/odls/odls.h" -#include "orte/mca/odls/base/base.h" #include "orte/mca/plm/plm_types.h" #include "orte/mca/routed/routed.h" #include "orte/mca/sensor/sensor.h" @@ -58,9 +53,8 @@ static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state); -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); -static int record_dead_process(orte_process_name_t *proc); -static int send_to_local_applications(opal_pointer_array_t *dead_names); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid); + /* * Module functions: Global @@ -85,11 +79,7 @@ static int suggest_map_targets(orte_proc_t *proc, static int ft_event(int state); -static int post_startup(void); -static int pre_shutdown(void); -static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); -static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); /****************** * ORTED module @@ -105,11 +95,11 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = { suggest_map_targets, ft_event, orte_errmgr_base_register_migration_warning, - post_startup, - pre_shutdown, - mark_processes_as_dead, - orte_errmgr_base_set_fault_callback, /* Set callback function */ - failure_notification + NULL, /* post_startup */ + NULL, /* pre_shutdown */ + NULL, /* mark_processes_as_dead */ + NULL, /* set_fault_callback */ + NULL /* failure_notification */ }; /************************ @@ -140,29 +130,20 @@ static int update_state(orte_jobid_t job, int rc=ORTE_SUCCESS; orte_vpid_t null=ORTE_VPID_INVALID; orte_app_context_t *app; - orte_ns_cmp_bitmask_t mask; - + /* * if orte 
is trying to shutdown, just let it */ if (orte_finalizing) { return ORTE_SUCCESS; } - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "errmgr:orted:update_state() %s) " - "------- %s state updated for process %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ((NULL == proc) ? "App. Process" : - (proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")), - (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); - + /* if this is a heartbeat failure, let the HNP handle it */ if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { return ORTE_SUCCESS; } - + /*** UPDATE COMMAND FOR A JOB ***/ if (NULL == proc) { /* this is an update for an entire job */ @@ -199,7 +180,7 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == job) { break; @@ -208,7 +189,7 @@ static int update_state(orte_jobid_t job, if (NULL == jobdat) { return ORTE_ERR_NOT_FOUND; } - + switch (jobstate) { case ORTE_JOB_STATE_FAILED_TO_START: failed_start(jobdat, exit_code); @@ -221,10 +202,10 @@ static int update_state(orte_jobid_t job, /* update all procs in job */ update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); /* order all local procs for this job to be killed */ - killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD); case ORTE_JOB_STATE_COMM_FAILED: /* kill all local procs */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* tell the caller we can't recover */ return ORTE_ERR_UNRECOVERABLE; break; @@ -261,16 +242,15 @@ static int update_state(orte_jobid_t job, * lifeline */ if (ORTE_PROC_STATE_COMM_FAILED == state) { - mask = ORTE_NS_CMP_ALL; - /* if it is our own connection, ignore it */ - if (OPAL_EQUAL == 
orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + if (ORTE_PROC_MY_NAME->jobid == proc->jobid && + ORTE_PROC_MY_NAME->vpid == proc->vpid) { return ORTE_SUCCESS; } /* see if this was a lifeline */ if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { /* kill our children */ - killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD); /* terminate - our routed children will see * us leave and automatically die */ @@ -281,25 +261,21 @@ static int update_state(orte_jobid_t job, /* was it a daemon that failed? */ if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { /* if all my routes are gone, then terminate ourselves */ - if (0 == orte_routed.num_routes() && - 0 == opal_list_get_size(&orte_local_children)) { + if (0 == orte_routed.num_routes()) { orte_quit(); } } - - record_dead_process(proc); - /* if not, then indicate we can continue */ return ORTE_SUCCESS; } - + /* lookup the local jobdat for this job */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? 
*/ if (jobdat->jobid == proc->jobid) { break; @@ -309,7 +285,7 @@ static int update_state(orte_jobid_t job, /* must already be complete */ return ORTE_SUCCESS; } - + /* if there are no local procs for this job, we can * ignore this call */ @@ -330,15 +306,15 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; /* Decrement the number of local procs */ jobdat->num_local_procs--; /* kill this proc */ - killprocs(proc->jobid, proc->vpid, proc->epoch); + killprocs(proc->jobid, proc->vpid); } app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { @@ -364,7 +340,7 @@ static int update_state(orte_jobid_t job, /* treat this as normal termination */ goto REPORT_STATE; } - + if (ORTE_PROC_STATE_TERMINATED < state) { if( jobdat->enable_recovery ) { OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -375,8 +351,8 @@ static int update_state(orte_jobid_t job, item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { /* see if this child has reached its local restart limit */ app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, @@ -403,8 +379,8 @@ static int update_state(orte_jobid_t job, } } } - -REPORT_ABORT: + + REPORT_ABORT: /* if the job hasn't completed and the 
state is abnormally * terminated, then we need to alert the HNP right away */ @@ -427,8 +403,8 @@ REPORT_ABORT: item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; child->exit_code = exit_code; @@ -442,7 +418,7 @@ REPORT_ABORT: opal_list_remove_item(&orte_local_children, &child->super); /* Decrement the number of local procs */ jobdat->num_local_procs--; - + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted reporting proc %s aborted to HNP (local procs = %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -464,15 +440,15 @@ REPORT_ABORT: OBJ_DESTRUCT(&alert); return rc; } - - REPORT_STATE: + +REPORT_STATE: /* find this proc in the local children so we can update its state */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - mask = ORTE_NS_CMP_ALL; - if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (child->name->jobid == proc->jobid && + child->name->vpid == proc->vpid) { if (ORTE_PROC_STATE_UNTERMINATED > child->state) { child->state = state; if (0 < pid) { @@ -492,7 +468,7 @@ REPORT_ABORT: * the HNP so it is available to debuggers and anyone * else that needs it */ - + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted: sending contact info to HNP", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -509,7 +485,7 @@ REPORT_ABORT: ORTE_ERROR_LOG(rc); goto FINAL_CLEANUP; } - /* pack all the local child vpids and epochs */ + /* pack all the local child vpids */ for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = 
opal_list_get_next(item)) { @@ -546,7 +522,7 @@ REPORT_ABORT: } return rc; } - + /* only other state is terminated - see if anyone is left alive */ if (!any_live_children(proc->jobid)) { /* lookup the local jobdat for this job */ @@ -555,7 +531,7 @@ REPORT_ABORT: item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; - + /* is this the specified job? */ if (jobdat->jobid == proc->jobid) { break; @@ -577,8 +553,8 @@ REPORT_ABORT: if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { ORTE_ERROR_LOG(rc); } - -FINAL_CLEANUP: + + FINAL_CLEANUP: OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, "%s errmgr:orted reporting all procs in %s terminated", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -592,7 +568,7 @@ FINAL_CLEANUP: item = next) { child = (orte_odls_child_t*)item; next = opal_list_get_next(item); - + if (jobdat->jobid == child->name->jobid) { opal_list_remove_item(&orte_local_children, &child->super); OBJ_RELEASE(child); @@ -601,11 +577,11 @@ FINAL_CLEANUP: /* ensure the job's local session directory tree is removed */ orte_session_dir_cleanup(jobdat->jobid); - + /* remove this job from our local job data since it is complete */ opal_list_remove_item(&orte_local_jobdata, &jobdat->super); OBJ_RELEASE(jobdat); - + /* send it */ if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { ORTE_ERROR_LOG(rc); @@ -613,7 +589,6 @@ FINAL_CLEANUP: rc = ORTE_SUCCESS; } OBJ_DESTRUCT(&alert); - /* indicate that the job is complete */ return rc; } @@ -639,131 +614,6 @@ int ft_event(int state) return ORTE_SUCCESS; } -int post_startup(void) { - return ORTE_SUCCESS; -} - -int pre_shutdown(void) { - return ORTE_SUCCESS; -} - -int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { - int i; - orte_process_name_t *name_item; - opal_list_item_t *item; - orte_odls_child_t *child; - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking procs as dead", - 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { - if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { - opal_output(0, "NULL found in dead process list."); - continue; - } else { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, - "ORTED %s marking %s as dead", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item))); - } - - if (name_item->epoch < orte_util_lookup_epoch(name_item)) { - continue; - } - - /* Increment the epoch */ - orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED); - orte_util_set_epoch(name_item, name_item->epoch + 1); - - /* Remove the dead process from my list of children if applicable */ - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = opal_list_get_next(item)) { - child = (orte_odls_child_t *) item; - - if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID, - child->name, name_item)) { - opal_list_remove_item(&orte_local_children, item); - break; - } - } - - /* Remove the route from the routing layer */ - orte_routed.delete_route(name_item); - } - - /* Update the routing module */ - orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid); - - if (NULL != fault_cbfunc) { - (*fault_cbfunc)(dead_procs); - } - - return ORTE_SUCCESS; -} - -int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) { - opal_pointer_array_t *dead_names; - orte_std_cntr_t n; - int ret = ORTE_SUCCESS, num_failed; - int32_t i; - orte_process_name_t *name_item, proc; - - dead_names = OBJ_NEW(opal_pointer_array_t); - - n = 1; - /* Get the number of failed procs */ - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - for (i = 0; i < num_failed; i++) { - /* Unpack the buffer to get the dead process' name. 
*/ - n = 1; - - name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t)); - - if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - return ret; - } - - if (orte_debug_daemons_flag) { - opal_output(0, "%s errmgr:orted ORTED received process %s failed from %s", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name_item), - ORTE_NAME_PRINT(sender)); - } - - /* There shouldn't be an issue of receiving this message multiple - * times but it doesn't hurt to double check. - */ - if (proc.epoch < orte_util_lookup_epoch(name_item)) { - opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item)); - continue; - } - - opal_pointer_array_add(dead_names, name_item); - } - - /* Tell the errmgr so it can handle changing the epoch, routes, etc. */ - orte_errmgr.mark_processes_as_dead(dead_names); - - /* Tell the applications' ORTE layers that there is a failure. */ - if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) { - return ret; - } - - for (i = 0; i < num_failed; i++) { - name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i); - free(name_item); - } - - return ret; -} - /***************** * Local Functions *****************/ @@ -771,14 +621,14 @@ static bool any_live_children(orte_jobid_t job) { opal_list_item_t *item; orte_odls_child_t *child; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? 
*/ if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && child->alive) { @@ -788,13 +638,13 @@ static bool any_live_children(orte_jobid_t job) /* if we get here, then nobody is left alive from that job */ return false; - + } static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) { int rc; - + /* pack the child's vpid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); @@ -829,70 +679,70 @@ static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) ORTE_ERROR_LOG(rc); return rc; } - + return ORTE_SUCCESS; } -static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) -{ - int rc; - opal_list_item_t *item, *next; - orte_odls_child_t *child; - orte_vpid_t null=ORTE_VPID_INVALID; - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - /* if we are timing things, pack the time the launch msg for this job was recvd */ - if (orte_timing) { - int64_t tmp; - tmp = jobdat->launch_msg_recvd.tv_sec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) + { + int rc; + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_vpid_t null=ORTE_VPID_INVALID; + + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return rc; } - tmp = jobdat->launch_msg_recvd.tv_usec; - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { - ORTE_ERROR_LOG(rc); - return rc; - } - } - for (item = opal_list_get_first(&orte_local_children); - item != opal_list_get_end(&orte_local_children); - item = next) { - child = (orte_odls_child_t*)item; - next = opal_list_get_next(item); - /* if this child is part of the job... 
*/ - if (child->name->jobid == jobdat->jobid) { - if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + /* if we are timing things, pack the time the launch msg for this job was recvd */ + if (orte_timing) { + int64_t tmp; + tmp = jobdat->launch_msg_recvd.tv_sec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + tmp = jobdat->launch_msg_recvd.tv_usec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { ORTE_ERROR_LOG(rc); return rc; } } + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = (orte_odls_child_t*)item; + next = opal_list_get_next(item); + /* if this child is part of the job... */ + if (child->name->jobid == jobdat->jobid) { + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* flag that this job is complete so the receiver can know */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; } - /* flag that this job is complete so the receiver can know */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - return rc; - } - - return ORTE_SUCCESS; -} static bool all_children_registered(orte_jobid_t job) { opal_list_item_t *item; orte_odls_child_t *child; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? 
*/ if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { /* if this child has terminated, we consider it as having @@ -918,10 +768,10 @@ static bool all_children_registered(orte_jobid_t job) } } } - + /* if we get here, then everyone in the job is currently registered */ return true; - + } static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) @@ -929,14 +779,14 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) opal_list_item_t *item; orte_odls_child_t *child; int rc; - + /* the thread is locked elsewhere - don't try to do it again here */ - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { child = (orte_odls_child_t*)item; - + /* is this child part of the specified job? */ if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { /* pack the child's vpid - must be done in case rml_uri is NULL */ @@ -944,11 +794,10 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) ORTE_ERROR_LOG(rc); return rc; } - /* Pack the child's epoch. 
*/ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) { ORTE_ERROR_LOG(rc); return rc; - } + } /* pack the contact info */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); @@ -956,19 +805,19 @@ static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) } } } - + return ORTE_SUCCESS; - + } static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) { opal_list_item_t *item; orte_odls_child_t *child; - + /* set the state */ jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; - + for (item = opal_list_get_first(&orte_local_children); item != opal_list_get_end(&orte_local_children); item = opal_list_get_next(item)) { @@ -997,7 +846,7 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs { opal_list_item_t *item; orte_odls_child_t *child; - + /* update job state */ jobdat->state = jobstate; /* update children */ @@ -1011,29 +860,28 @@ static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobs } } -static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +static void killprocs(orte_jobid_t job, orte_vpid_t vpid) { opal_pointer_array_t cmd; orte_proc_t proc; int rc; - + /* stop local sensors for this job */ if (ORTE_VPID_WILDCARD == vpid) { orte_sensor.stop(job); } - - if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) { + + if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid) { if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) { ORTE_ERROR_LOG(rc); } return; } - + OBJ_CONSTRUCT(&cmd, opal_pointer_array_t); OBJ_CONSTRUCT(&proc, orte_proc_t); proc.name.jobid = job; proc.name.vpid = vpid; - proc.name.epoch = epoch; opal_pointer_array_add(&cmd, &proc); if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) { ORTE_ERROR_LOG(rc); @@ -1041,85 +889,3 @@ static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t 
epoch) OBJ_DESTRUCT(&cmd); OBJ_DESTRUCT(&proc); } - -static int record_dead_process(orte_process_name_t *proc) { - opal_pointer_array_t *dead_name; - opal_buffer_t *buffer; - orte_daemon_cmd_flag_t command; - int rc = ORTE_SUCCESS; - int num_failed; - - if (orte_odls_base_default_check_finished(proc)) { - return rc; - } - - dead_name = OBJ_NEW(opal_pointer_array_t); - - opal_pointer_array_add(dead_name, proc); - - /* Mark the process as dead */ - mark_processes_as_dead(dead_name); - - /* Send a message to the HNP */ - buffer = OBJ_NEW(opal_buffer_t); - command = ORTE_PROCESS_FAILED_NOTIFICATION; - - num_failed = 1; - - if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(rc); - } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(rc); - } - - orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0); - - OBJ_RELEASE(buffer); - OBJ_RELEASE(dead_name); - - return rc; -} - -int send_to_local_applications(opal_pointer_array_t *dead_names) { - opal_buffer_t *buf; - int ret; - orte_process_name_t *name_item; - int size, i; - - OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, - "%s Sending failure to local applications.", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); - - buf = OBJ_NEW(opal_buffer_t); - - size = opal_pointer_array_get_size(dead_names); - - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - for (i = 0; i < size; i++) { - if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { - if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { - ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - } - } - - if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { 
- ORTE_ERROR_LOG(ret); - OBJ_RELEASE(buf); - return ret; - } - - OBJ_RELEASE(buf); - - return ORTE_SUCCESS; -} - diff --git a/orte/mca/errmgr/ortedresil/.windows b/orte/mca/errmgr/ortedresil/.windows new file mode 100644 index 0000000000..aa7d7bbbe5 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/.windows @@ -0,0 +1,12 @@ +# +# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# Specific to this module +mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/ortedresil/Makefile.am b/orte/mca/errmgr/ortedresil/Makefile.am new file mode 100644 index 0000000000..63c1cf9522 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/Makefile.am @@ -0,0 +1,38 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +EXTRA_DIST = .windows + +dist_pkgdata_DATA = help-orte-errmgr-orted.txt + +sources = \ + errmgr_ortedresil.h \ + errmgr_ortedresil_component.c \ + errmgr_ortedresil.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_errmgr_ortedresil_DSO +component_noinst = +component_install = mca_errmgr_ortedresil.la +else +component_noinst = libmca_errmgr_ortedresil.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_ortedresil_la_SOURCES = $(sources) +mca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_ortedresil_la_SOURCES =$(sources) +libmca_errmgr_ortedresil_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c new file mode 100644 index 0000000000..a1061eff31 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.c @@ -0,0 +1,1126 @@ +/* + * Copyright (c) 2009-2010 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2004-2011 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ +#ifdef HAVE_STRING_H +#include +#endif + +#include "opal/util/output.h" +#include "opal/util/opal_sos.h" +#include "opal/dss/dss.h" + +#include "orte/util/error_strings.h" +#include "orte/util/name_fns.h" +#include "orte/util/proc_info.h" +#include "orte/util/session_dir.h" +#include "orte/util/show_help.h" +#include "orte/util/nidmap.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/rml/rml.h" +#include "orte/mca/odls/odls.h" +#include "orte/mca/odls/base/base.h" +#include "orte/mca/plm/plm_types.h" +#include "orte/mca/routed/routed.h" +#include "orte/mca/sensor/sensor.h" +#include "orte/runtime/orte_quit.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "orte/mca/errmgr/base/errmgr_private.h" + +#include "errmgr_ortedresil.h" + +/* Local functions */ +static bool any_live_children(orte_jobid_t job); +static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat); +static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child); +static bool all_children_registered(orte_jobid_t job); +static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf); +static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code); +static void update_local_children(orte_odls_job_t *jobdat, + orte_job_state_t jobstate, + orte_proc_state_t state); +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch); +static int record_dead_process(orte_process_name_t *proc); +static int send_to_local_applications(opal_pointer_array_t *dead_names); + +/* + * Module functions: Global + */ +static int init(void); +static int finalize(void); + +static int predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map); + +static int update_state(orte_jobid_t job, + 
orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + +static int suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); + +static int ft_event(int state); + +static int post_startup(void); +static int pre_shutdown(void); + +static int mark_processes_as_dead(opal_pointer_array_t *dead_procs); +static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer); + +/****************** + * ORTEDRESIL module + ******************/ +orte_errmgr_base_module_t orte_errmgr_ortedresil_module = { + init, + finalize, + orte_errmgr_base_log, + orte_errmgr_base_abort, + orte_errmgr_base_abort_peers, + update_state, + predicted_fault, + suggest_map_targets, + ft_event, + orte_errmgr_base_register_migration_warning, + post_startup, + pre_shutdown, + mark_processes_as_dead, + orte_errmgr_base_set_fault_callback, /* Set callback function */ + failure_notification +}; + +/************************ + * API Definitions + ************************/ +static int init(void) +{ + return ORTE_SUCCESS; +} + +static int finalize(void) +{ + return ORTE_SUCCESS; +} + +static int update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + opal_list_item_t *item, *next; + orte_odls_job_t *jobdat = NULL; + orte_odls_child_t *child; + opal_buffer_t alert; + orte_plm_cmd_flag_t cmd; + int rc=ORTE_SUCCESS; + orte_vpid_t null=ORTE_VPID_INVALID; + orte_app_context_t *app; + orte_ns_cmp_bitmask_t mask; + + /* + * if orte is trying to shutdown, just let it + */ + if (orte_finalizing) { + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output, + "errmgr:ortedresil:update_state() %s) " + "------- %s state updated for process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ((NULL == proc) ? "App. Process" : + (proc->jobid == ORTE_PROC_MY_HNP->jobid ? 
"Daemon" : "App. Process")), + (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc))); + + /* if this is a heartbeat failure, let the HNP handle it */ + if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate || + ORTE_PROC_STATE_HEARTBEAT_FAILED == state) { + return ORTE_SUCCESS; + } + + /*** UPDATE COMMAND FOR A JOB ***/ + if (NULL == proc) { + /* this is an update for an entire job */ + if (ORTE_JOBID_INVALID == job) { + /* whatever happened, we don't know what job + * it happened to + */ + orte_show_help("help-orte-errmgr-orted.txt", "errmgr-orted:unknown-job-error", + true, orte_job_state_to_str(jobstate)); + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the "invalid" jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? 
*/ + if (jobdat->jobid == job) { + break; + } + } + if (NULL == jobdat) { + return ORTE_ERR_NOT_FOUND; + } + + switch (jobstate) { + case ORTE_JOB_STATE_FAILED_TO_START: + failed_start(jobdat, exit_code); + break; + case ORTE_JOB_STATE_RUNNING: + /* update all local child states */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_RUNNING); + break; + case ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED: + /* update all procs in job */ + update_local_children(jobdat, jobstate, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED); + /* order all local procs for this job to be killed */ + killprocs(jobdat->jobid, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + case ORTE_JOB_STATE_COMM_FAILED: + /* kill all local procs */ + killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* tell the caller we can't recover */ + return ORTE_ERR_UNRECOVERABLE; + break; + case ORTE_JOB_STATE_HEARTBEAT_FAILED: + /* let the HNP handle this */ + return ORTE_SUCCESS; + break; + + default: + break; + } + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the job info */ + if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { + ORTE_ERROR_LOG(rc); + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + /* if this was a failed comm, then see if it was to our + * lifeline + */ + if (ORTE_PROC_STATE_COMM_FAILED == state) { + mask = ORTE_NS_CMP_ALL; + + /* if it is our own connection, ignore it */ + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, ORTE_PROC_MY_NAME, proc)) { + return ORTE_SUCCESS; + } + /* see if this was a lifeline */ + if (ORTE_SUCCESS != orte_routed.route_lost(proc)) { + /* kill our children */ + 
killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD, ORTE_EPOCH_WILDCARD); + /* terminate - our routed children will see + * us leave and automatically die + */ + orte_quit(); + } + /* purge the oob */ + orte_rml.purge(proc); + /* was it a daemon that failed? */ + if (proc->jobid == ORTE_PROC_MY_NAME->jobid) { + /* if all my routes are gone, then terminate ourselves */ + if (0 == orte_routed.num_routes() && + 0 == opal_list_get_size(&orte_local_children)) { + orte_quit(); + } + } + + record_dead_process(proc); + + /* if not, then indicate we can continue */ + return ORTE_SUCCESS; + } + + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? */ + if (jobdat->jobid == proc->jobid) { + break; + } + } + if (NULL == jobdat) { + /* must already be complete */ + return ORTE_SUCCESS; + } + + /* if there are no local procs for this job, we can + * ignore this call + */ + if (0 == jobdat->num_local_procs) { + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil got state %s for proc %s pid %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_proc_state_to_str(state), + ORTE_NAME_PRINT(proc), pid)); + + /*** UPDATE COMMAND FOR A SPECIFIC PROCESS ***/ + if (ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == state) { + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + child->exit_code = exit_code; + /* Decrement the number of local procs */ + jobdat->num_local_procs--; + /* kill 
this proc */ + killprocs(proc->jobid, proc->vpid, proc->epoch); + } + app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + if( jobdat->enable_recovery && child->restarts < app->max_restarts ) { + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc), child->restarts)); + rc = orte_odls.restart_proc(child); + } + return rc; + } + } + } + + if (ORTE_PROC_STATE_TERM_NON_ZERO == state) { + if (orte_abort_non_zero_exit) { + /* treat this as an abnormal + * termination - no recovery allowed + */ + goto REPORT_ABORT; + } + /* treat this as normal termination */ + goto REPORT_STATE; + } + + if (ORTE_PROC_STATE_TERMINATED < state) { + if( jobdat->enable_recovery ) { + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s RECOVERY ENABLED", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + /* see if this child has reached its local restart limit */ + app = (orte_app_context_t*)opal_pointer_array_get_item(&jobdat->apps, child->app_idx); + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s CHECKING RESTARTS %d VS MAX %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + child->restarts, app->max_restarts)); + if (child->restarts < app->max_restarts ) { + /* attempt to restart it locally */ + child->restarts++; + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil restarting proc %s for the %d time", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), child->restarts)); + if (ORTE_SUCCESS != (rc = orte_odls.restart_proc(child))) { + /* reset the child's 
state as restart_proc would + * have cleared it + */ + child->state = state; + ORTE_ERROR_LOG(rc); + goto REPORT_ABORT; + } + return ORTE_SUCCESS; + } + } + } + } + +REPORT_ABORT: + /* if the job hasn't completed and the state is abnormally + * terminated, then we need to alert the HNP right away + */ + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack only the data for this proc - have to start with the jobid + * so the receiver can unpack it correctly + */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* find this proc in the local children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + child->exit_code = exit_code; + } + /* now pack the child's info */ + if (ORTE_SUCCESS != (rc = pack_state_for_proc(&alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* remove the child from our local list as it is no longer alive */ + opal_list_remove_item(&orte_local_children, &child->super); + /* Decrement the number of local procs */ + jobdat->num_local_procs--; + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil reporting proc %s aborted to HNP (local procs = %d)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(child->name), + jobdat->num_local_procs)); + + /* release the child object */ + OBJ_RELEASE(child); + /* done with loop */ + break; + } + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + 
ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + return rc; + } + + REPORT_STATE: + /* find this proc in the local children so we can update its state */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + mask = ORTE_NS_CMP_ALL; + if (OPAL_EQUAL == orte_util_compare_name_fields(mask, child->name, proc)) { + if (ORTE_PROC_STATE_UNTERMINATED > child->state) { + child->state = state; + if (0 < pid) { + child->pid = pid; + } + child->exit_code = exit_code; + } + /* done with loop */ + break; + } + } + + if (ORTE_PROC_STATE_REGISTERED == state) { + /* see if everyone in this job has registered */ + if (all_children_registered(proc->jobid)) { + /* once everyone registers, send their contact info to + * the HNP so it is available to debuggers and anyone + * else that needs it + */ + + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil: sending contact info to HNP", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack init routes command */ + cmd = ORTE_PLM_INIT_ROUTES_CMD; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &proc->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack all the local child vpids and epochs */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == proc->jobid) { + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->vpid, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &child->name->epoch, 1, ORTE_EPOCH))) { + 
ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + } + } + /* pack an invalid marker */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* add in contact info for all procs in the job */ + if (ORTE_SUCCESS != (rc = pack_child_contact_info(proc->jobid, &alert))) { + ORTE_ERROR_LOG(rc); + OBJ_DESTRUCT(&alert); + return rc; + } + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + } + return rc; + } + + /* only other state is terminated - see if anyone is left alive */ + if (!any_live_children(proc->jobid)) { + /* lookup the local jobdat for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + + /* is this the specified job? */ + if (jobdat->jobid == proc->jobid) { + break; + } + } + if (NULL == jobdat) { + /* race condition - may not have been formed yet */ + return ORTE_SUCCESS; + } + + OBJ_CONSTRUCT(&alert, opal_buffer_t); + /* pack update state command */ + cmd = ORTE_PLM_UPDATE_PROC_STATE; + if (ORTE_SUCCESS != (rc = opal_dss.pack(&alert, &cmd, 1, ORTE_PLM_CMD))) { + ORTE_ERROR_LOG(rc); + goto FINAL_CLEANUP; + } + /* pack the data for the job */ + if (ORTE_SUCCESS != (rc = pack_state_update(&alert, jobdat))) { + ORTE_ERROR_LOG(rc); + } + +FINAL_CLEANUP: + OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output, + "%s errmgr:ortedresil reporting all procs in %s terminated", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobdat->jobid))); + + /* remove all of this job's children from the global list - do not lock + * the thread as we are already locked + */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = 
(orte_odls_child_t*)item; + next = opal_list_get_next(item); + + if (jobdat->jobid == child->name->jobid) { + opal_list_remove_item(&orte_local_children, &child->super); + OBJ_RELEASE(child); + } + } + + /* ensure the job's local session directory tree is removed */ + orte_session_dir_cleanup(jobdat->jobid); + + /* remove this job from our local job data since it is complete */ + opal_list_remove_item(&orte_local_jobdata, &jobdat->super); + OBJ_RELEASE(jobdat); + + /* send it */ + if (0 > (rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &alert, ORTE_RML_TAG_PLM, 0))) { + ORTE_ERROR_LOG(rc); + } else { + rc = ORTE_SUCCESS; + } + OBJ_DESTRUCT(&alert); + + /* indicate that the job is complete */ + return rc; + } + return ORTE_SUCCESS; +} + +static int predicted_fault(opal_list_t *proc_list, + opal_list_t *node_list, + opal_list_t *suggested_map) +{ + return ORTE_ERR_NOT_IMPLEMENTED; +} + +static int suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + return ORTE_ERR_NOT_IMPLEMENTED; +} + +static int ft_event(int state) +{ + return ORTE_SUCCESS; +} + +static int post_startup(void) { + return ORTE_SUCCESS; +} + +static int pre_shutdown(void) { + return ORTE_SUCCESS; +} + +static int mark_processes_as_dead(opal_pointer_array_t *dead_procs) { + int i; + orte_process_name_t *name_item; + opal_list_item_t *item; + orte_odls_child_t *child; + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "ORTED %s marking procs as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + for (i = 0; i < opal_pointer_array_get_size(dead_procs); i++) { + if (NULL == (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_procs, i))) { + opal_output(0, "NULL found in dead process list."); + continue; + } + + if (name_item->epoch < orte_util_lookup_epoch(name_item)) { + continue; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "ORTED %s marking %s as dead", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name_item))); + + /* 
Increment the epoch */
+        orte_util_set_proc_state(name_item, ORTE_PROC_STATE_TERMINATED);
+        orte_util_set_epoch(name_item, name_item->epoch + 1);
+
+        /* Remove the dead process from my list of children if applicable */
+        for (item = opal_list_get_first(&orte_local_children);
+             item != opal_list_get_end(&orte_local_children);
+             item = opal_list_get_next(item)) {
+            child = (orte_odls_child_t *) item;
+
+            if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_JOBID | ORTE_NS_CMP_VPID,
+                                                            child->name, name_item)) {
+                opal_list_remove_item(&orte_local_children, item);
+                OBJ_RELEASE(item);
+                break;
+            }
+        }
+
+        /* Remove the route from the routing layer */
+        orte_routed.delete_route(name_item);
+    }
+
+    /* Update the routing module */
+    orte_routed.update_routing_tree(ORTE_PROC_MY_NAME->jobid);
+
+    if (NULL != fault_cbfunc) {
+        (*fault_cbfunc)(dead_procs);
+    }
+
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Handle a failure notification (from the HNP): the buffer holds a count
+ * followed by that many orte_process_name_t entries.  Update local
+ * bookkeeping via mark_processes_as_dead() and forward the failure list
+ * to the local application processes.  Returns ORTE_SUCCESS or the first
+ * error encountered.
+ */
+static int failure_notification(orte_process_name_t *sender, opal_buffer_t *buffer) {
+    opal_pointer_array_t *dead_names;
+    orte_std_cntr_t n;
+    int ret = ORTE_SUCCESS, num_failed;
+    int32_t i;
+    orte_process_name_t *name_item;
+
+    dead_names = OBJ_NEW(opal_pointer_array_t);
+
+    n = 1;
+    /* Get the number of failed procs (packed as ORTE_VPID by the sender) */
+    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &num_failed, &n, ORTE_VPID))) {
+        ORTE_ERROR_LOG(ret);
+        OBJ_RELEASE(dead_names);
+        return ret;
+    }
+
+    for (i = 0; i < num_failed; i++) {
+        /* Unpack the buffer to get the dead process' name. */
+        n = 1;
+
+        name_item = (orte_process_name_t *) malloc(sizeof(orte_process_name_t));
+
+        if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, name_item, &n, ORTE_NAME))) {
+            ORTE_ERROR_LOG(ret);
+            free(name_item);
+            OBJ_RELEASE(dead_names);
+            return ret;
+        }
+
+        if (orte_debug_daemons_flag) {
+            opal_output(0, "%s errmgr:ortedresil ORTED received process %s failed from %s",
+                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                        ORTE_NAME_PRINT(name_item),
+                        ORTE_NAME_PRINT(sender));
+        }
+
+        /* There shouldn't be an issue of receiving this message multiple
+         * times but it doesn't hurt to double check.  Compare the epoch
+         * reported in the message against the one we already know for this
+         * process; if ours is newer, this failure was already handled.
+         * (Was previously reading the uninitialized local 'proc'.)
+         */
+        if (name_item->epoch < orte_util_lookup_epoch(name_item)) {
+            opal_output(1, "Received from proc %s local epoch %d", ORTE_NAME_PRINT(name_item), orte_util_lookup_epoch(name_item));
+            free(name_item);
+            continue;
+        }
+
+        opal_pointer_array_add(dead_names, name_item);
+    }
+
+    /* Tell the errmgr so it can handle changing the epoch, routes, etc. */
+    mark_processes_as_dead(dead_names);
+
+    /* Tell the applications' ORTE layers that there is a failure. */
+    if (ORTE_SUCCESS != (ret = send_to_local_applications(dead_names))) {
+        ORTE_ERROR_LOG(ret);
+    }
+
+    /* Release the name list: free the entries malloc'ed above (entries
+     * skipped by the epoch check were already freed, leaving NULL slots).
+     */
+    for (i = 0; i < num_failed; i++) {
+        if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) {
+            free(name_item);
+        }
+    }
+    OBJ_RELEASE(dead_names);
+
+    return ret;
+}
+
+/*****************
+ * Local Functions
+ *****************/
+static bool any_live_children(orte_jobid_t job)
+{
+    opal_list_item_t *item;
+    orte_odls_child_t *child;
+
+    /* the thread is locked elsewhere - don't try to do it again here */
+
+    for (item = opal_list_get_first(&orte_local_children);
+         item != opal_list_get_end(&orte_local_children);
+         item = opal_list_get_next(item)) {
+        child = (orte_odls_child_t*)item;
+
+        /* is this child part of the specified job? 
*/ + if ((job == child->name->jobid || ORTE_JOBID_WILDCARD == job) && + child->alive) { + return true; + } + } + + /* if we get here, then nobody is left alive from that job */ + return false; + +} + +static int pack_state_for_proc(opal_buffer_t *alert, orte_odls_child_t *child) +{ + int rc; + + /* pack the child's vpid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &(child->name->vpid), 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the pid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->pid, 1, OPAL_PID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* if we are timing things, pack the time the proc was launched */ + if (orte_timing) { + int64_t tmp; + tmp = child->starttime.tv_sec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + tmp = child->starttime.tv_usec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + /* pack its state */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->state, 1, ORTE_PROC_STATE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack its exit code */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &child->exit_code, 1, ORTE_EXIT_CODE))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} + +static int pack_state_update(opal_buffer_t *alert, orte_odls_job_t *jobdat) +{ + int rc; + opal_list_item_t *item, *next; + orte_odls_child_t *child; + orte_vpid_t null=ORTE_VPID_INVALID; + + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &jobdat->jobid, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* if we are timing things, pack the time the launch msg for this job was recvd */ + if (orte_timing) { + int64_t tmp; + tmp = jobdat->launch_msg_recvd.tv_sec; + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + tmp = jobdat->launch_msg_recvd.tv_usec; + if 
(ORTE_SUCCESS != (rc = opal_dss.pack(alert, &tmp, 1, OPAL_INT64))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = next) { + child = (orte_odls_child_t*)item; + next = opal_list_get_next(item); + /* if this child is part of the job... */ + if (child->name->jobid == jobdat->jobid) { + if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + /* flag that this job is complete so the receiver can know */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &null, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + + return ORTE_SUCCESS; +} + +static bool all_children_registered(orte_jobid_t job) +{ + opal_list_item_t *item; + orte_odls_child_t *child; + + /* the thread is locked elsewhere - don't try to do it again here */ + + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + + /* is this child part of the specified job? */ + if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + /* if this child has terminated, we consider it as having + * registered for the purposes of this function. If it never + * did register, then we will send a NULL rml_uri back to + * the HNP, which will then know that the proc did not register. 
+ * If other procs did register, then the HNP can declare an + * abnormal termination + */ + if (ORTE_PROC_STATE_UNTERMINATED < child->state) { + /* this proc has terminated somehow - consider it + * as registered for now + */ + continue; + } + /* if this child is *not* registered yet, return false */ + if (!child->init_recvd) { + return false; + } + /* if this child has registered a finalize, return false */ + if (child->fini_recvd) { + return false; + } + } + } + + /* if we get here, then everyone in the job is currently registered */ + return true; + +} + +static int pack_child_contact_info(orte_jobid_t job, opal_buffer_t *buf) +{ + opal_list_item_t *item; + orte_odls_child_t *child; + int rc; + + /* the thread is locked elsewhere - don't try to do it again here */ + + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + + /* is this child part of the specified job? */ + if (OPAL_EQUAL == opal_dss.compare(&child->name->jobid, &job, ORTE_JOBID)) { + /* pack the child's vpid - must be done in case rml_uri is NULL */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->vpid), 1, ORTE_VPID))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* Pack the child's epoch. 
*/ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &(child->name->epoch), 1, ORTE_EPOCH))) { + ORTE_ERROR_LOG(rc); + return rc; + } + /* pack the contact info */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &child->rml_uri, 1, OPAL_STRING))) { + ORTE_ERROR_LOG(rc); + return rc; + } + } + } + + return ORTE_SUCCESS; + +} + +static void failed_start(orte_odls_job_t *jobdat, orte_exit_code_t exit_code) +{ + opal_list_item_t *item; + orte_odls_child_t *child; + + /* set the state */ + jobdat->state = ORTE_JOB_STATE_FAILED_TO_START; + + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (child->name->jobid == jobdat->jobid) { + if (ORTE_PROC_STATE_LAUNCHED > child->state || + ORTE_PROC_STATE_FAILED_TO_START == child->state) { + /* this proc never launched - flag that the iof + * is complete or else we will hang waiting for + * pipes to close that were never opened + */ + child->iof_complete = true; + /* ditto for waitpid */ + child->waitpid_recvd = true; + } + } + } + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base.output, + "%s errmgr:hnp: job %s reported incomplete start", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jobdat->jobid))); + return; +} + +static void update_local_children(orte_odls_job_t *jobdat, orte_job_state_t jobstate, orte_proc_state_t state) +{ + opal_list_item_t *item; + orte_odls_child_t *child; + + /* update job state */ + jobdat->state = jobstate; + /* update children */ + for (item = opal_list_get_first(&orte_local_children); + item != opal_list_get_end(&orte_local_children); + item = opal_list_get_next(item)) { + child = (orte_odls_child_t*)item; + if (jobdat->jobid == child->name->jobid) { + child->state = state; + } + } +} + +static void killprocs(orte_jobid_t job, orte_vpid_t vpid, orte_epoch_t epoch) +{ + opal_pointer_array_t cmd; + orte_proc_t proc; + int rc; + + /* stop local sensors for this job */ + 
if (ORTE_VPID_WILDCARD == vpid) {
+        orte_sensor.stop(job);
+    }
+
+    if (ORTE_JOBID_WILDCARD == job && ORTE_VPID_WILDCARD == vpid && ORTE_EPOCH_WILDCARD == epoch) {
+        if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(NULL))) {
+            ORTE_ERROR_LOG(rc);
+        }
+        return;
+    }
+
+    OBJ_CONSTRUCT(&cmd, opal_pointer_array_t);
+    OBJ_CONSTRUCT(&proc, orte_proc_t);
+    proc.name.jobid = job;
+    proc.name.vpid = vpid;
+    proc.name.epoch = epoch;
+    opal_pointer_array_add(&cmd, &proc);
+    if (ORTE_SUCCESS != (rc = orte_odls.kill_local_procs(&cmd))) {
+        ORTE_ERROR_LOG(rc);
+    }
+    OBJ_DESTRUCT(&cmd);
+    OBJ_DESTRUCT(&proc);
+}
+
+/*
+ * Record a locally detected process failure: update local state via
+ * mark_processes_as_dead() and notify the HNP.  The caller retains
+ * ownership of 'proc'.  Returns ORTE_SUCCESS or the first packing
+ * error encountered.
+ */
+static int record_dead_process(orte_process_name_t *proc) {
+    opal_pointer_array_t *dead_name;
+    opal_buffer_t *buffer;
+    orte_daemon_cmd_flag_t command;
+    int rc = ORTE_SUCCESS;
+    int num_failed;
+
+    if (orte_odls_base_default_check_finished(proc)) {
+        return rc;
+    }
+
+    dead_name = OBJ_NEW(opal_pointer_array_t);
+
+    opal_pointer_array_add(dead_name, proc);
+
+    /* Mark the process as dead */
+    mark_processes_as_dead(dead_name);
+
+    /* Send a message to the HNP */
+    buffer = OBJ_NEW(opal_buffer_t);
+    command = ORTE_PROCESS_FAILED_NOTIFICATION;
+
+    num_failed = 1;
+
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) {
+        ORTE_ERROR_LOG(rc);
+    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &num_failed, 1, ORTE_VPID))) {
+        ORTE_ERROR_LOG(rc);
+    } else if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, proc, 1, ORTE_NAME))) {
+        ORTE_ERROR_LOG(rc);
+    } else {
+        /* only send if the message packed cleanly - the HNP cannot
+         * parse a partially packed buffer
+         */
+        orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_DAEMON, 0);
+    }
+
+    OBJ_RELEASE(buffer);
+    OBJ_RELEASE(dead_name);
+
+    return rc;
+}
+
+int send_to_local_applications(opal_pointer_array_t *dead_names) {
+    opal_buffer_t *buf;
+    int ret;
+    orte_process_name_t *name_item;
+    int size, i;
+
+    OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
+                         "%s Sending failure to local applications.",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
+
+    buf = OBJ_NEW(opal_buffer_t);
+
+    size = 
opal_pointer_array_get_size(dead_names); + + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, &size, 1, ORTE_VPID))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + for (i = 0; i < size; i++) { + if (NULL != (name_item = (orte_process_name_t *) opal_pointer_array_get_item(dead_names, i))) { + if (ORTE_SUCCESS != (ret = opal_dss.pack(buf, name_item, 1, ORTE_NAME))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + } + } + + if (ORTE_SUCCESS != (ret = orte_odls.deliver_message(ORTE_JOBID_WILDCARD, buf, ORTE_RML_TAG_EPOCH_CHANGE))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(buf); + return ret; + } + + OBJ_RELEASE(buf); + + return ORTE_SUCCESS; +} + diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h new file mode 100644 index 0000000000..6d9cefa6c8 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/** + * @file + * + */ + +#ifndef MCA_ERRMGR_ORTEDRESIL_EXPORT_H +#define MCA_ERRMGR_ORTEDRESIL_EXPORT_H + +#include "orte_config.h" + +#include "orte/mca/errmgr/errmgr.h" + +BEGIN_C_DECLS + +/* + * Local Component structures + */ + +ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_ortedresil_component; + +ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_ortedresil_module; + +END_C_DECLS + +#endif /* MCA_ERRMGR_ORTEDRESIL_EXPORT_H */ diff --git a/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c b/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c new file mode 100644 index 0000000000..a3ece3f2f1 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/errmgr_ortedresil_component.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "orte_config.h" +#include "opal/util/output.h" + +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" +#include "errmgr_ortedresil.h" + +/* + * Public string for version number + */ +const char *orte_errmgr_ortedresil_component_version_string = + "ORTE ERRMGR ortedresil MCA component version " ORTE_VERSION; + +/* + * Local functionality + */ +static int errmgr_ortedresil_open(void); +static int errmgr_ortedresil_close(void); +static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority); + +/* + * Instantiate the public struct with all of our public information + * and pointer to our public functions in it + */ +orte_errmgr_base_component_t mca_errmgr_ortedresil_component = +{ + /* Handle the general mca_component_t struct containing + * meta information about the component itortedresil + */ + { + ORTE_ERRMGR_BASE_VERSION_3_0_0, + /* Component name and version */ + "ortedresil", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open and close functions */ + errmgr_ortedresil_open, + errmgr_ortedresil_close, + errmgr_ortedresil_component_query + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } +}; + +static int errmgr_ortedresil_open(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_ortedresil_close(void) +{ + return ORTE_SUCCESS; +} + +static int errmgr_ortedresil_component_query(mca_base_module_t **module, int *priority) +{ + if (ORTE_PROC_IS_DAEMON) { + /* keep our priority low so that other modules are higher + * and will run before us + */ + *priority = 0; + *module = (mca_base_module_t *)&orte_errmgr_ortedresil_module; + return ORTE_SUCCESS; + } + + *priority = -1; + *module = NULL; + return ORTE_ERROR; +} + diff --git a/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt b/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt new file 
mode 100644 index 0000000000..c6d43f1f77 --- /dev/null +++ b/orte/mca/errmgr/ortedresil/help-orte-errmgr-orted.txt @@ -0,0 +1,14 @@ + -*- text -*- +# +# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English general help file for ORTE RecoS IGNORE framework. +# diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 124a097f69..befe43f3db 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -240,10 +240,12 @@ int orte_ess_base_app_setup(void) } /* Execute the post-startup errmgr code */ - if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { - ORTE_ERROR_LOG(ret); - error = "orte_errmgr.post_startup"; - goto error; + if (NULL != orte_errmgr.post_startup) { + if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { + ORTE_ERROR_LOG(ret); + error = "orte_errmgr.post_startup"; + goto error; + } } /* if we are an ORTE app - and not an MPI app - then @@ -278,7 +280,9 @@ error: int orte_ess_base_app_finalize(void) { - orte_errmgr.pre_shutdown(); + if (NULL != orte_errmgr.pre_shutdown) { + orte_errmgr.pre_shutdown(); + } orte_notifier_base_close(); diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 2a08182c21..1325cc818b 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -505,10 +505,12 @@ int orte_ess_base_orted_setup(char **hosts) orte_sensor.start(ORTE_PROC_MY_NAME->jobid); /* Execute the post-startup errmgr code */ - if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { - ORTE_ERROR_LOG(ret); - error = "orte_errmgr.post_startup"; - goto error; + if (NULL != orte_errmgr.post_startup) { + if (ORTE_SUCCESS != (ret = orte_errmgr.post_startup())) { + ORTE_ERROR_LOG(ret); + error = 
"orte_errmgr.post_startup"; + goto error; + } } return ORTE_SUCCESS; @@ -523,7 +525,9 @@ int orte_ess_base_orted_setup(char **hosts) int orte_ess_base_orted_finalize(void) { - orte_errmgr.pre_shutdown(); + if (NULL != orte_errmgr.pre_shutdown) { + orte_errmgr.pre_shutdown(); + } /* stop the local sensors */ orte_sensor.stop(ORTE_PROC_MY_NAME->jobid);