diff --git a/NEWS b/NEWS index 142ccd8045..24b84ae5ea 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana +Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. Copyright (c) 2004-2006 The University of Tennessee and The University @@ -29,6 +29,9 @@ version 1.0. Trunk (not on release branches yet) ----------------------------------- +- ErrMgr framework redesigned to better support fault tolerance development + activities. See the following RFC for details: + http://www.open-mpi.org/community/lists/devel/2010/03/7589.php - Add pkg-config(1) configuration files for ompi, ompi-c, ompi-cxx, ompi-f77, ompi-f90. See the README for more details. diff --git a/orte/mca/errmgr/base/base.h b/orte/mca/errmgr/base/base.h index 3ff4fe50d6..50683a7280 100644 --- a/orte/mca/errmgr/base/base.h +++ b/orte/mca/errmgr/base/base.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2005 The University of Tennessee and The University @@ -33,32 +33,37 @@ #include "orte/mca/errmgr/errmgr.h" -/* - * Global functions for MCA overall collective open and close - */ BEGIN_C_DECLS /* - * Internal definitions - */ -/* - * function definitions + * MCA Framework functions */ ORTE_DECLSPEC int orte_errmgr_base_open(void); ORTE_DECLSPEC int orte_errmgr_base_select(void); ORTE_DECLSPEC int orte_errmgr_base_close(void); -/* - * globals that might be needed +/** + * Composite Stack states */ +#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */ +#define ORTE_ERRMGR_STACK_STATE_STABLIZED 0x01 /* Stabalized the runtime */ +#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */ +#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */ +#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */ -extern bool orte_errmgr_base_selected; -extern bool orte_errmgr_initialized; +/** + * Output and component variables + */ ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available; -ORTE_DECLSPEC extern mca_errmgr_base_component_t orte_errmgr_base_selected_component; +ORTE_DECLSPEC extern int orte_errmgr_base_output; +ORTE_DECLSPEC extern bool orte_errmgr_base_shutting_down; +ORTE_DECLSPEC extern bool orte_errmgr_base_enable_recovery; + +extern opal_pointer_array_t orte_errmgr_base_modules; +extern bool orte_errmgr_initialized; /* - * external API functions will be documented in the mca/errmgr/errmgr.h file + * Additional External API function declared in errmgr.h */ END_C_DECLS diff --git a/orte/mca/errmgr/base/errmgr_base_close.c b/orte/mca/errmgr/base/errmgr_base_close.c index 78734838ea..101d54c93b 100644 --- a/orte/mca/errmgr/base/errmgr_base_close.c +++ b/orte/mca/errmgr/base/errmgr_base_close.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The 
Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -32,17 +32,31 @@ int orte_errmgr_base_close(void) { + orte_errmgr_base_module_t *module = NULL; + int i; + OPAL_TRACE(5); - + + /* Close all selected components */ + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_errmgr_finalize ) { + module->internal_errmgr_finalize(); + } + } + /* Close all remaining available components (may be one if this is a OMPI RTE program, or [possibly] multiple if this is ompi_info) */ - mca_base_components_close(orte_errmgr_base_output, - &orte_errmgr_base_components_available, NULL); - + &orte_errmgr_base_components_available, + NULL); + + OBJ_DESTRUCT(&orte_errmgr_base_modules); + orte_errmgr_initialized = false; - /* All done */ - return ORTE_SUCCESS; } diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index 0c8e427be0..c3520669ee 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2005 The University of Tennessee and The University @@ -24,20 +24,37 @@ #include #endif #include +#include + +#include "orte/runtime/orte_globals.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_locks.h" #include "opal/util/trace.h" #include "opal/util/output.h" - -#include "orte/runtime/orte_globals.h" #include "orte/util/name_fns.h" #include "orte/util/session_dir.h" + +#include "orte/mca/plm/plm.h" +#include "orte/mca/routed/routed.h" #include "orte/mca/ess/ess.h" #include "orte/mca/odls/odls.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" +/* + * Local Function Declaration + */ +static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state); + +/* + * Public interfaces + */ void orte_errmgr_base_log(int error_code, char *filename, int line) { OPAL_TRACE(1); @@ -52,17 +69,363 @@ void orte_errmgr_base_log(int error_code, char *filename, int line) ORTE_ERROR_NAME(error_code), filename, line); } -void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code) +int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code) { - return; + int rc; + orte_job_t *jdata; + orte_proc_t *proc; + int i; + orte_proc_state_t state = ORTE_PROC_STATE_ABORTED; + int stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + orte_errmgr_base_module_t *module = NULL; + + if( ORTE_PROC_IS_APP ) { + return ORTE_SUCCESS; + } + + stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT; + + /******************************** + * Stabalize the runtime + ********************************/ + if( !orte_errmgr_base_shutting_down ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:proc_aborted() %s) " + "------- %s fault reported! Process %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (name->jobid == ORTE_PROC_MY_HNP->jobid ? 
"Daemon" : "App. Process"), + ORTE_NAME_PRINT(name))); + } + + /* get the job data object for this process */ + if (NULL == (jdata = orte_get_job_data_object(name->jobid))) { + /* nothing we can do - abort things */ + goto PROCESS; + } + + /* if the proc was terminated by cmd, ignore it */ + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) { + /* nothing we can do */ + goto PROCESS; + } + + if( !orte_errmgr_base_shutting_down ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:proc_aborted() %s) " + "------- %s fault reported! Process %s, state (0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (name->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process"), + ORTE_NAME_PRINT(name), + proc->state )); + } + + if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { + /* don't do anything or else we can enter an infinite loop */ + return ORTE_SUCCESS; + } + + if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) { + goto PROCESS; + } + + /******************************** + * Call the active modules + ********************************/ + if( orte_errmgr_base_enable_recovery && !orte_errmgr_base_shutting_down) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:proc_aborted() %s) " + "------- Attempting recovery... 
(%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + + stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED; + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_process_fault ) { + module->internal_process_fault(jdata, name, state, &stack_state); + } + } + } + + /******************************** + * If the active modules still need us to abort, then do so + ********************************/ + if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:proc_aborted() %s) " + "------- Successfully recovered from process %s fault! Continuing...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name))); + return ORTE_SUCCESS; + } + + PROCESS: + if( !orte_errmgr_base_shutting_down ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:proc_aborted() %s) " + "------- Not able to recover from process %s fault! 
Aborting...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name))); + } + + /* if we are already in progress, then ignore this call */ + if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, + "%s errmgr:base: abort in progress, ignoring proc %s aborted with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name), exit_code)); + + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, + "%s errmgr:base: proc %s aborted with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name), exit_code)); + + orte_job_term_ordered = true; + + /* if the proc is a daemon, then we are abnormally terminating */ + if (ORTE_PROC_MY_NAME->jobid == name->jobid) { + orte_abnormal_term_ordered = true; + } + + /* indicate that all jobs other than the one containing this + * proc have been ordered to abort - this is necessary to avoid + * duplicate ordering of "abort". + * + * NOTE: be sure to not include the 0 job data location as this + * contains the daemons! 
+ */ + for (i=1; i < orte_job_data->size; i++) { + /* the array may have holes in it as we are recovering + * jobids as they complete, so check every slot by index + */ + if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, i))) { + continue; + } + if (ORTE_JOB_STATE_ABORTED != jdata->state && + ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state && + ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) { + jdata->state = ORTE_JOB_STATE_ABORT_ORDERED; + } + } + + /* tell the plm to terminate all jobs */ + if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { + ORTE_ERROR_LOG(rc); + } + + /* set the exit status, just in case whomever called us failed + * to do so - it can only be done once, so we are protected + * from overwriting it + */ + ORTE_UPDATE_EXIT_STATUS(exit_code); + + /* just return - let the daemons report back so we can properly + * know when to actually exit + */ + + return ORTE_SUCCESS; } -void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code) +int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code) { - return; + int rc; + orte_job_t *jdata; + orte_proc_state_t state = ORTE_PROC_STATE_FAILED_TO_START; + int stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + + if( ORTE_PROC_IS_APP ) { + return ORTE_SUCCESS; + } + + stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT; + + /******************************** + * Stabalize the runtime + ********************************/ + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:incomplete_start() %s) " + "------- Incomplete start of job %s!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job))); + + /* get the job data object for this process */ + if (NULL == (jdata = orte_get_job_data_object(job))) { + /* nothing we can do - abort things */ + goto PROCESS; + } + + if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, NULL, state)) { + goto PROCESS; + } + + /******************************** + * 
Call the active modules + * JJH: Currently, if we cannot launch the job, then we should just abort. + * JJH: Add job launch recovery logic... + ********************************/ +#if 0 + if( orte_errmgr_base_enable_recovery ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:incomplete_start() %s) " + "------- Attempting recovery... (%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED; + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_process_fault ) { + module->internal_process_fault(jdata, NULL, state, &stack_state); + } + } + } +#endif + + /******************************** + * If the active modules still need us to abort, then do so + ********************************/ + if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:incomplete_start() %s) " + "------- Successfully recovered from incomplete start of job %s! Continuing...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job) )); + return ORTE_SUCCESS; + } + + PROCESS: + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:incomplete_start() %s) " + "------- Not able to recover from incomplete start of job %s! 
Aborting...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job) )); + + /* if we are already in progress, then ignore this call */ + if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, + "%s errmgr:base: abort in progress, ignoring incomplete start on job %s with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, + "%s errmgr:base: job %s reported incomplete start with status %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job), exit_code)); + + orte_job_term_ordered = true; + + /* tell the plm to terminate all jobs */ + if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { + ORTE_ERROR_LOG(rc); + } + + /* set the exit status, just in case whomever called us failed + * to do so - it can only be done once, so we are protected + * from overwriting it + */ + ORTE_UPDATE_EXIT_STATUS(exit_code); + + /* just return - let the daemons report back so we can properly + * know when to actually exit + */ + + return ORTE_SUCCESS; } -void orte_errmgr_base_error_abort(int error_code, char *fmt, ...) 
+int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code) +{ + orte_job_t *jdata = NULL; + orte_proc_state_t state = ORTE_PROC_STATE_COMM_FAILED; + int stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + orte_errmgr_base_module_t *module = NULL; + int i; + + stack_state = ORTE_ERRMGR_STACK_STATE_NONE; + stack_state |= ORTE_ERRMGR_STACK_STATE_JOB_ABORT; + + /******************************** + * Stabalize the runtime + ********************************/ + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:comm_failed() %s) " + "------- Communication to Process %s failed!", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name) )); + + /* get the job data object for this process */ + if (NULL == (jdata = orte_get_job_data_object(name->jobid))) { + /* nothing we can do - abort things */ + goto PROCESS; + } + + if( ORTE_SUCCESS != orte_errmgr_base_stabalize_runtime(jdata, name, state)) { + goto PROCESS; + } + + /******************************** + * Call the active modules + ********************************/ + if( orte_errmgr_base_enable_recovery ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:comm_failed() %s) " + "------- Attempting recovery... 
(%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + + stack_state |= ORTE_ERRMGR_STACK_STATE_STABLIZED; + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_process_fault ) { + module->internal_process_fault(jdata, name, state, &stack_state); + } + } + } + + /******************************** + * If the active modules still need us to abort, then do so + ********************************/ + if( !(ORTE_ERRMGR_STACK_STATE_JOB_ABORT & (stack_state)) ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:comm_failed() %s) " + "------- Successfully recovered from communication fault with process %s! Continuing...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name) )); + return ORTE_SUCCESS; + } + + PROCESS: + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:comm_failed() %s) " + "------- Not able to recover from communication fault with process %s! Aborting...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(name) )); + + /* + * Default action is to abort + */ + ORTE_UPDATE_EXIT_STATUS(exit_code); + orte_abnormal_term_ordered = true; + orte_trigger_event(&orte_exit); + + return ORTE_SUCCESS; +} + +int orte_errmgr_base_abort(int error_code, char *fmt, ...) { va_list arglist; @@ -89,12 +452,191 @@ void orte_errmgr_base_error_abort(int error_code, char *fmt, ...) 
/* abnormal exit */ orte_ess.abort(error_code, false); + + return ORTE_SUCCESS; } -int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job, - orte_job_state_t state, - orte_err_cb_fn_t cbfunc, - void *cbdata) +int orte_errmgr_base_predicted_fault(char ***proc_list, + char ***node_list, + char ***suggested_nodes) { - return ORTE_ERR_NOT_AVAILABLE; + orte_errmgr_base_module_t *module = NULL; + int i; + + /* + * If the user did not ask for recovery, then do not process recovery events + */ + if( !orte_errmgr_base_enable_recovery ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:predicted_fault() %s) " + "------- Recovery currently disabled! Skipping...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:predicted_fault() %s) " + "------- Notifying components... (%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_predicted_fault ) { + module->internal_predicted_fault(proc_list, node_list, suggested_nodes); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list) +{ + orte_errmgr_base_module_t *module = NULL; + int i; + + /* + * If the user did not ask for recovery, then do not process recovery events + */ + if( !orte_errmgr_base_enable_recovery ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:suggest_map_targets() %s) " + "------- Recovery currently disabled! Skipping...", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME) )); + return ORTE_SUCCESS; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:suggest_map_targets() %s) " + "------- Notifying components... 
(%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_suggest_map_targets ) { + module->internal_suggest_map_targets(proc, oldnode, node_list); + } + } + + return ORTE_SUCCESS; +} + +int orte_errmgr_base_ft_event(int state) +{ + orte_errmgr_base_module_t *module = NULL; + int i; + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:ft_event() %s) " + "------- Notifying components... (%3d active components)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + orte_errmgr_base_modules.size)); + + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == module ) { + continue; + } + if( NULL != module->internal_ft_event ) { + module->internal_ft_event(state); + } + } + + return ORTE_SUCCESS; +} + +/* + * Local functions + */ +static int orte_errmgr_base_stabalize_runtime(orte_job_t *jdata, + orte_process_name_t *proc, + orte_proc_state_t state) +{ + orte_proc_t *loc_proc, *child_proc; + orte_std_cntr_t i_proc; + int32_t i; + + /* + * orterun is trying to shutdown, so just let it + */ + if( orte_errmgr_base_shutting_down ) { + return ORTE_SUCCESS; + } + + /* + * orte_errmgr_base_incomplete_start() will pass a NULL since all processes + * are effected by this fault. + * JJH: Since we do not handle the recovery from such errors yet, just + * skip processing, and go to the abort sequence. 
+ */ + if( NULL == proc ) { + return ORTE_SUCCESS; + } + + /* + * Set the process state in the job data structure + */ + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (loc_proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + + if( loc_proc->name.vpid != proc->vpid) { + continue; + } + + loc_proc->state = state; + + break; + } + /* + * If this is a part of the control plane (HNP/orted) + */ + if( proc->jobid == ORTE_PROC_MY_NAME->jobid ) { + /* + * Remove the route to this process + */ + orte_routed.delete_route(proc); + + /* + * If the aborted daemon had active processes on its node, then we should + * make sure to signal that all the children are gone. loc_proc is only valid if the search loop above stopped early (i < size). + */ + if( i < jdata->procs->size && NULL != loc_proc->node && loc_proc->node->num_procs > 0 ) { + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:stabalize_runtime() %s) " + "------- Daemon lost with the following processes", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + + for(i_proc = 0; i_proc < opal_pointer_array_get_size(loc_proc->node->procs); ++i_proc) { + child_proc = (orte_proc_t*)opal_pointer_array_get_item(loc_proc->node->procs, i_proc); + if( NULL == child_proc ) { + continue; + } + + OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base_output, + "errmgr:base:stabalize_runtime() %s) " + "\t %s [0x%x]", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&child_proc->name), + child_proc->state)); + + if( child_proc->last_errmgr_state < child_proc->state ) { + child_proc->last_errmgr_state = child_proc->state; + orte_errmgr_base_proc_aborted(&child_proc->name, -1); + } + } + } + } + + return ORTE_SUCCESS; } diff --git a/orte/mca/errmgr/base/errmgr_base_open.c b/orte/mca/errmgr/base/errmgr_base_open.c index f32634aae0..976f0d7493 100644 --- a/orte/mca/errmgr/base/errmgr_base_open.c +++ b/orte/mca/errmgr/base/errmgr_base_open.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University 
Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -20,50 +20,54 @@ #include "orte_config.h" #include "orte/constants.h" +#ifdef HAVE_STRING_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +#ifdef HAVE_SYS_TYPES_H +#include +#endif + #include "opal/mca/mca.h" #include "opal/mca/base/base.h" #include "opal/mca/base/mca_base_param.h" + +#include "opal/util/opal_environ.h" +#include "opal/util/output.h" #include "opal/util/trace.h" #include "opal/util/output.h" - #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" - -/* - * The following file was created by configure. It contains extern - * statements and the definition of an array of pointers to each - * component's public mca_base_component_t struct. - */ - #include "orte/mca/errmgr/base/static-components.h" /* - * globals + * Globals */ - -/* - * Global variables - */ -int orte_errmgr_base_output = -1; -/* - * define a default module that all application procs - * can use without having to open the framework. 
The - * decision on whether or not to open the framework is - * made in orte_init - */ -orte_errmgr_base_module_t orte_errmgr = { - orte_errmgr_base_proc_aborted_not_avail, - orte_errmgr_base_incomplete_start_not_avail, - orte_errmgr_base_register_cb_not_avail, - orte_errmgr_base_error_abort -}; - -bool orte_errmgr_base_selected = false; -opal_list_t orte_errmgr_base_components_available; -mca_errmgr_base_component_t orte_errmgr_base_selected_component; +int orte_errmgr_base_output = -1; +bool orte_errmgr_base_enable_recovery = false; +bool orte_errmgr_base_shutting_down = false; bool orte_errmgr_initialized = false; +opal_list_t orte_errmgr_base_components_available; + +/* Public module provides a wrapper around previous functions */ +orte_errmgr_base_module_t orte_errmgr = { + orte_errmgr_base_proc_aborted, + orte_errmgr_base_incomplete_start, + orte_errmgr_base_comm_failed, + orte_errmgr_base_abort, + + /* Internal Interfaces */ + NULL, /* internal_errmgr_init */ + NULL, /* internal_errmgr_finalize */ + NULL, /* internal_predicted_fault */ + NULL, /* internal_process_fault */ + NULL, /* internal_suggest_map_targets */ + NULL /* internal_ft_event */ +}; /** * Function for finding and opening either all MCA components, or the one @@ -71,25 +75,46 @@ bool orte_errmgr_initialized = false; */ int orte_errmgr_base_open(void) { - OPAL_TRACE(5); - - if (!orte_errmgr_initialized) { /* ensure we only do this once */ - - orte_errmgr_base_output = opal_output_open(NULL); + int value; - /* Open up all available components */ - - if (ORTE_SUCCESS != - mca_base_components_open("errmgr", orte_errmgr_base_output, - mca_errmgr_base_static_components, - &orte_errmgr_base_components_available, true)) { - return ORTE_ERROR; - } - - orte_errmgr_initialized = true; + OPAL_TRACE(5); + + /* Only pass this way once */ + if( orte_errmgr_initialized ) { + return ORTE_SUCCESS; + } + + OBJ_CONSTRUCT(&orte_errmgr_base_modules, opal_pointer_array_t); + + orte_errmgr_base_output = 
opal_output_open(NULL); + + mca_base_param_reg_int_name("errmgr", + "base_enable_recovery", + "If the ErrMgr recovery components should be enabled." + " [Default = disabled]", + false, false, + 0, &value); + orte_errmgr_base_enable_recovery = OPAL_INT_TO_BOOL(value); + + /* + * A flag to indicate that orterun is shutting down, so skip the recovery + * logic. + */ + orte_errmgr_base_shutting_down = false; + + /* + * Open up all available components + */ + if (ORTE_SUCCESS != + mca_base_components_open("errmgr", + orte_errmgr_base_output, + mca_errmgr_base_static_components, + &orte_errmgr_base_components_available, + true)) { + return ORTE_ERROR; } - /* All done */ + orte_errmgr_initialized = true; return ORTE_SUCCESS; } diff --git a/orte/mca/errmgr/base/errmgr_base_select.c b/orte/mca/errmgr/base/errmgr_base_select.c index e1e5e23588..2c9de6e35f 100644 --- a/orte/mca/errmgr/base/errmgr_base_select.c +++ b/orte/mca/errmgr/base/errmgr_base_select.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -18,38 +18,163 @@ #include "orte_config.h" +#include "orte/constants.h" + +#ifdef HAVE_STRING_H +#include +#endif #include "opal/mca/mca.h" #include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" +#include "opal/util/output.h" #include "orte/mca/errmgr/base/base.h" #include "orte/mca/errmgr/base/errmgr_private.h" -/** - * Function for selecting one component from all those that are - * available. 
+/* + * List of composite modules, ordered by priority */ +opal_pointer_array_t orte_errmgr_base_modules; + +struct orte_errmgr_base_select_module_t { + mca_base_component_t *component; + mca_base_module_t *module; + int priority; +}; +typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t; + int orte_errmgr_base_select(void) { - mca_errmgr_base_component_t *best_component = NULL; - orte_errmgr_base_module_t *best_module = NULL; + int exit_status = OPAL_SUCCESS; + mca_base_component_list_item_t *cli = NULL; + mca_base_component_t *component = NULL; + mca_base_module_t *module = NULL; + opal_list_item_t *item = NULL; + int priority = 0, i, j, low_i; + orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL; + opal_pointer_array_t tmp_array; + orte_errmgr_base_module_t *i_module = NULL; /* - * Select the best component + * If the user does not want the recovery features, then do not select any. */ - if( OPAL_SUCCESS != mca_base_select("errmgr", orte_errmgr_base_output, - &orte_errmgr_base_components_available, - (mca_base_module_t **) &best_module, - (mca_base_component_t **) &best_component) ) { - /* This will only happen if no component was selected */ - return ORTE_ERR_NOT_FOUND; + if( !orte_errmgr_base_enable_recovery ) { + goto INIT; } - /* Save the winner */ - orte_errmgr = *best_module; - orte_errmgr_base_selected_component = *best_component; - orte_errmgr_base_selected = true; + OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t); - return ORTE_SUCCESS; + opal_output_verbose(10, orte_errmgr_base_output, + "errmgr:base:select: Auto-selecting components"); + + /* + * Traverse the list of available components. + * For each call their 'query' functions to determine relative priority. 
+ */ + for (item = opal_list_get_first(&orte_errmgr_base_components_available); + item != opal_list_get_end(&orte_errmgr_base_components_available); + item = opal_list_get_next(item) ) { + cli = (mca_base_component_list_item_t *) item; + component = (mca_base_component_t *) cli->cli_component; + + /* + * If there is a query function then use it. + */ + if (NULL == component->mca_query_component) { + opal_output_verbose(5, orte_errmgr_base_output, + "errmgr:base:select Skipping component [%s]. It does not implement a query function", + component->mca_component_name ); + continue; + } + + /* + * Query this component for the module and priority + */ + opal_output_verbose(5, orte_errmgr_base_output, + "errmgr:base:select Querying component [%s]", + component->mca_component_name); + + component->mca_query_component(&module, &priority); + + /* + * If no module was returned or negative priority, then skip component + */ + if (NULL == module || priority < 0) { + opal_output_verbose(5, orte_errmgr_base_output, + "errmgr:base:select Skipping component [%s]. 
Query failed to return a module", + component->mca_component_name ); + continue; + } + + /* + * Append them to the temporary list, we will sort later + */ + opal_output_verbose(5, orte_errmgr_base_output, + "errmgr:base:select Query of component [%s] set priority to %d", + component->mca_component_name, priority); + tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t)); + tmp_module->component = component; + tmp_module->module = module; + tmp_module->priority = priority; + + opal_pointer_array_add(&tmp_array, (void*)tmp_module); + } + + /* + * Sort the list by decending priority + */ + priority = 0; + for(j = 0; j < tmp_array.size; ++j) { + tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j); + if( NULL == tmp_module_sw ) { + continue; + } + + low_i = -1; + priority = tmp_module_sw->priority; + + for(i = 0; i < tmp_array.size; ++i) { + tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i); + if( NULL == tmp_module ) { + continue; + } + if( tmp_module->priority > priority ) { + low_i = i; + priority = tmp_module->priority; + } + } + + if( low_i >= 0 ) { + tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i); + opal_pointer_array_set_item(&tmp_array, low_i, NULL); + j--; /* Try this entry again, if it is not the lowest */ + } else { + tmp_module = tmp_module_sw; + opal_pointer_array_set_item(&tmp_array, j, NULL); + } + opal_output_verbose(5, orte_errmgr_base_output, + "errmgr:base:select Add module with priority [%s] %d", + tmp_module->component->mca_component_name, tmp_module->priority); + opal_pointer_array_add(&orte_errmgr_base_modules, (void*)(tmp_module->module)); + free(tmp_module); + } + OBJ_DESTRUCT(&tmp_array); + + INIT: + /* + * Initialize each of the Errmgr Modules + */ + for(i = 0; i < orte_errmgr_base_modules.size; ++i) { + i_module = 
(orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base_modules, i); + if( NULL == i_module ) { + continue; + } + if( NULL != i_module->internal_errmgr_init ) { + i_module->internal_errmgr_init(); + } + } + + return exit_status; } diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 88b075369d..16b10aff10 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -34,7 +34,6 @@ #include "orte/mca/errmgr/errmgr.h" - /* * Functions for use solely within the ERRMGR framework */ @@ -48,29 +47,29 @@ typedef uint8_t orte_errmgr_cmd_flag_t; #define ORTE_ERRMGR_ABORT_PROCS_REQUEST_CMD 0x01 #define ORTE_ERRMGR_REGISTER_CALLBACK_CMD 0x02 -/* provide access to verbose output channel */ -ORTE_DECLSPEC extern int orte_errmgr_base_output; - - /* * Base functions */ +ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line); -ORTE_DECLSPEC void orte_errmgr_base_log(int error_code, char *filename, int line); - -ORTE_DECLSPEC void orte_errmgr_base_proc_aborted_not_avail(orte_process_name_t *name, int exit_code); - -ORTE_DECLSPEC void orte_errmgr_base_incomplete_start_not_avail(orte_jobid_t job, int exit_code); - -ORTE_DECLSPEC void orte_errmgr_base_error_abort(int error_code, char *fmt, ...) 
__opal_attribute_format__(__printf__, 2, 3) __opal_attribute_noreturn__; - -ORTE_DECLSPEC int orte_errmgr_base_register_cb_not_avail(orte_jobid_t job, - orte_job_state_t state, - orte_err_cb_fn_t cbfunc, - void *cbdata); +ORTE_DECLSPEC int orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code); +ORTE_DECLSPEC int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code); +ORTE_DECLSPEC int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code); +ORTE_DECLSPEC int orte_errmgr_base_abort(int error_code, char *fmt, ...) +# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR + __opal_attribute_format__(__printf__, 2, 3) +# endif + ; +ORTE_DECLSPEC int orte_recos_base_predicted_fault(char ***proc_list, + char ***node_list, + char ***suggested_nodes); +ORTE_DECLSPEC int orte_recos_base_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +ORTE_DECLSPEC int orte_recos_base_ft_event(int state); /* - * external API functions will be documented in the mca/errmgr/errmgr.h file + * Additional External API function declared in errmgr.h */ END_C_DECLS diff --git a/orte/mca/errmgr/default/.windows b/orte/mca/errmgr/default/.windows deleted file mode 100644 index 7a934e8f29..0000000000 --- a/orte/mca/errmgr/default/.windows +++ /dev/null @@ -1,12 +0,0 @@ -# -# Copyright (c) 2008-2010 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module -mca_link_libraries=libopen-rte diff --git a/orte/mca/errmgr/default/Makefile.am b/orte/mca/errmgr/default/Makefile.am deleted file mode 100644 index 80c0a588fe..0000000000 --- a/orte/mca/errmgr/default/Makefile.am +++ /dev/null @@ -1,45 +0,0 @@ -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. 
-# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -EXTRA_DIST = .windows - -sources = \ - errmgr_default.h \ - errmgr_default_component.c \ - errmgr_default.c - -# Make the output library in this directory, and name it either -# mca__.la (for DSO builds) or libmca__.la -# (for static builds). - -if OMPI_BUILD_errmgr_default_DSO -component_noinst = -component_install = mca_errmgr_default.la -else -component_noinst = libmca_errmgr_default.la -component_install = -endif - -mcacomponentdir = $(pkglibdir) -mcacomponent_LTLIBRARIES = $(component_install) -mca_errmgr_default_la_SOURCES = $(sources) -mca_errmgr_default_la_LDFLAGS = -module -avoid-version - -noinst_LTLIBRARIES = $(component_noinst) -libmca_errmgr_default_la_SOURCES =$(sources) -libmca_errmgr_default_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/default/configure.params b/orte/mca/errmgr/default/configure.params deleted file mode 100644 index 3513f8d956..0000000000 --- a/orte/mca/errmgr/default/configure.params +++ /dev/null @@ -1,24 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2007 Los Alamos National Security, LLC. 
All rights -# reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# Specific to this module - -PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/errmgr/default/errmgr_default.c b/orte/mca/errmgr/default/errmgr_default.c deleted file mode 100644 index 7754cca05c..0000000000 --- a/orte/mca/errmgr/default/errmgr_default.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - - -#include "orte_config.h" -#include "orte/constants.h" - -#include -#include - -#include "opal/util/trace.h" - -#include "orte/runtime/orte_globals.h" -#include "orte/runtime/orte_wait.h" -#include "orte/runtime/orte_locks.h" -#include "orte/mca/plm/plm.h" -#include "orte/util/name_fns.h" - -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "errmgr_default.h" - -/* - * This function gets called by the PLM when an orted notifies us - * that a process has aborted - * Various components will follow their own strategy for dealing with - * this situation. For this component, we call the provided - * err_cbfunc if they requested notification on proc aborted. - * Otherwise, we simply kill the job. 
- */ -void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code) -{ - int rc; - orte_job_t *jdata; - orte_proc_t *proc; - int i; - - /* get the job data object for this process */ - if (NULL == (jdata = orte_get_job_data_object(name->jobid))) { - /* nothing we can do - abort things */ - goto PROCESS; - } - - /* if the proc was terminated by cmd, ignore it */ - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, name->vpid))) { - /* nothing we can do */ - goto PROCESS; - } - if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { - /* don't do anything or else we can enter an infinite loop */ - return; - } - - if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_ABORTED & jdata->err_cbstates)) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:default: proc %s aborted with status %d - calling cbfunc", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name), exit_code)); - jdata->err_cbfunc(name, ORTE_PROC_STATE_ABORTED, jdata->err_cbdata); - return; - } - -PROCESS: - /* if we are already in progress, then ignore this call */ - if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:default: abort in progress, ignoring proc %s aborted with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name), exit_code)); - - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:default: proc %s aborted with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_NAME_PRINT(name), exit_code)); - - orte_job_term_ordered = true; - - /* if the proc is a daemon, then we are abnormally terminating */ - if (ORTE_PROC_MY_NAME->jobid == name->jobid) { - orte_abnormal_term_ordered = true; - } - - /* indicate that all jobs other than the one containing this - * proc have been ordered to abort - this is necessary to avoid - * duplicate ordering of "abort". 
- * - * NOTE: be sure to not include the 0 job data location as this - * contains the daemons! - */ - for (i=1; i < orte_job_data->size; i++) { - /* the array may have holes in it as we are recovering - * jobids as they complete, so check everything - */ - if (NULL == (jdata = orte_get_job_data_object(name->jobid))) { - continue; - } - if (ORTE_JOB_STATE_ABORTED != jdata->state && - ORTE_JOB_STATE_ABORTED_BY_SIG != jdata->state && - ORTE_JOB_STATE_ABORTED_WO_SYNC != jdata->state) { - jdata->state = ORTE_JOB_STATE_ABORT_ORDERED; - } - } - - /* tell the plm to terminate all jobs */ - if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { - ORTE_ERROR_LOG(rc); - } - - /* set the exit status, just in case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it - */ - ORTE_UPDATE_EXIT_STATUS(exit_code); - - /* just return - let the daemons report back so we can properly - * know when to actually exit - */ -} - -/* - * This function gets called by the PLM when an orted notifies us that - * a job failed to start. - * Various components will follow their own strategy for dealing with - * this situation. For this component, we simply kill the job. 
- */ -void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code) -{ - int rc; - orte_job_t *jdata; - orte_process_name_t name; - - /* get the job data object for this process */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - /* nothing we can do - abort things */ - goto PROCESS; - } - - if (NULL != jdata->err_cbfunc && (ORTE_PROC_STATE_FAILED_TO_START & jdata->err_cbstates)) { - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:cm: job %s reported incomplete start with status %d - calling cbfunc", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - name.jobid = job; - name.vpid = ORTE_VPID_WILDCARD; - jdata->err_cbfunc(&name, ORTE_PROC_STATE_FAILED_TO_START, jdata->err_cbdata); - return; - } - -PROCESS: - /* if we are already in progress, then ignore this call */ - if (!opal_atomic_trylock(&orte_abort_inprogress_lock)) { /* returns 1 if already locked */ - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:default: abort in progress, ignoring incomplete start on job %s with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - return; - } - - OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_output, - "%s errmgr:default: job %s reported incomplete start with status %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(job), exit_code)); - - orte_job_term_ordered = true; - - /* tell the plm to terminate all jobs */ - if (ORTE_SUCCESS != (rc = orte_plm.terminate_job(ORTE_JOBID_WILDCARD))) { - ORTE_ERROR_LOG(rc); - } - - /* set the exit status, just in case whomever called us failed - * to do so - it can only be done once, so we are protected - * from overwriting it - */ - ORTE_UPDATE_EXIT_STATUS(exit_code); - - /* just return - let the daemons report back so we can properly - * know when to actually exit - */ -} - -/* - * Register a callback function upon a change to a specified job state. 
- */ -int orte_errmgr_default_register_callback(orte_jobid_t job, - orte_proc_state_t state, - orte_err_cb_fn_t cbfunc, - void *cbdata) -{ - orte_job_t *jdata; - - /* get the job data object for this process */ - if (NULL == (jdata = orte_get_job_data_object(job))) { - /* nothing we can do - abort things */ - return ORTE_ERR_NOT_FOUND; - } - - /* update the error callback data */ - jdata->err_cbfunc = cbfunc; - jdata->err_cbstates = state; - jdata->err_cbdata = cbdata; - return ORTE_SUCCESS; -} diff --git a/orte/mca/errmgr/default/errmgr_default.h b/orte/mca/errmgr/default/errmgr_default.h deleted file mode 100644 index 200c39bab3..0000000000 --- a/orte/mca/errmgr/default/errmgr_default.h +++ /dev/null @@ -1,57 +0,0 @@ -/* -*- C -*- - * - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - */ -#ifndef ORTE_ERRMGR_HNP_H -#define ORTE_ERRMGR_HNP_H - - -#include "orte_config.h" -#include "orte/types.h" - -#include "orte/mca/plm/plm_types.h" -#include "orte/runtime/orte_globals.h" -#include "orte/mca/errmgr/errmgr.h" - -BEGIN_C_DECLS - -/* - * Module open / close - */ -int orte_errmgr_default_component_open(void); -int orte_errmgr_default_component_close(void); -int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority); - - -/* - * Component API functions - */ -void orte_errmgr_default_proc_aborted(orte_process_name_t *name, int exit_code); - -void orte_errmgr_default_incomplete_start(orte_jobid_t job, int exit_code); - -int orte_errmgr_default_register_callback(orte_jobid_t job, - orte_job_state_t state, - orte_err_cb_fn_t cbfunc, - void *cbdata); - -ORTE_MODULE_DECLSPEC extern mca_errmgr_base_component_t mca_errmgr_default_component; - -END_C_DECLS - -#endif diff --git a/orte/mca/errmgr/default/errmgr_default_component.c b/orte/mca/errmgr/default/errmgr_default_component.c deleted file mode 100644 index 31e8d59d47..0000000000 --- a/orte/mca/errmgr/default/errmgr_default_component.c +++ /dev/null @@ -1,108 +0,0 @@ -/* -*- C -*- - * - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/** @file: - * - * The Open MPI General Purpose Registry - Proxy component - * - */ - -/* - * includes - */ -#include "orte_config.h" -#include "orte/constants.h" - -#include "opal/mca/mca.h" -#include "opal/mca/base/base.h" -#include "opal/mca/base/mca_base_param.h" - - -#include "orte/mca/errmgr/errmgr.h" -#include "orte/mca/errmgr/base/base.h" -#include "orte/mca/errmgr/base/errmgr_private.h" -#include "orte/util/proc_info.h" - -#include "errmgr_default.h" - - -/* - * Struct of function pointers that need to be initialized - */ -mca_errmgr_base_component_t mca_errmgr_default_component = { - { - ORTE_ERRMGR_BASE_VERSION_2_0_0, - - "default", /* MCA component name */ - ORTE_MAJOR_VERSION, /* MCA component major version */ - ORTE_MINOR_VERSION, /* MCA component minor version */ - ORTE_RELEASE_VERSION, /* MCA component release version */ - orte_errmgr_default_component_open, /* component open */ - orte_errmgr_default_component_close, /* component close */ - orte_errmgr_default_component_query /* component query */ - }, - { - /* The component is checkpoint ready */ - MCA_BASE_METADATA_PARAM_CHECKPOINT - } -}; - -/* - * setup the function pointers for the module - */ -orte_errmgr_base_module_t orte_errmgr_default = { - orte_errmgr_default_proc_aborted, - orte_errmgr_default_incomplete_start, - orte_errmgr_default_register_callback, - orte_errmgr_base_error_abort -}; - - -/* - * Open the component - */ -int orte_errmgr_default_component_open(void) -{ - return ORTE_SUCCESS; -} - -/* - * Close the component - */ -int orte_errmgr_default_component_close(void) -{ - return ORTE_SUCCESS; -} - -int orte_errmgr_default_component_query(mca_base_module_t **module, int *priority) -{ - /* If we are an HNP or a CM, then pick us! */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_CM) { - /* Return a module (choose an arbitrary, positive priority -- - it's only relevant compared to other components). 
*/ - - *priority = 100; - *module = (mca_base_module_t *)&orte_errmgr_default; - return ORTE_SUCCESS; - } - - /* otherwise, don't take me! */ - *module = NULL; - return ORTE_ERROR; - -} diff --git a/orte/mca/errmgr/errmgr.h b/orte/mca/errmgr/errmgr.h index 9b662e17fb..bee3a62cc6 100644 --- a/orte/mca/errmgr/errmgr.h +++ b/orte/mca/errmgr/errmgr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -18,7 +18,38 @@ */ /** @file: * - * The Open RTE Error Manager + * The Open RTE Error and Recovery Manager (ErrMgr) + * + * This framework is a composite framework in which multiple components + * are often active at the same time and may work on a single external call + * to the interface functions. + * + * This framework allows the user to compose a job recovery policy from multiple + * individual components. Each component will operate on the function call if it + * has a registered function. If no component registers a function then the base + * functionality/policy is used. + * + * For example, consider the 3 components on the left (C1, C2, C3), and the + * API function calls across the top: + * | Priority | Fn1 | Fn2 | Fn3 | Fn4 | + * -----+----------+------+------+------+------+ + * base | --- | act0 | --- | --- | act6 | + * C1 | 10 | act1 | --- | act2 | --- | + * C2 | 20 | --- | act3 | --- | --- | + * C3 | 30 | act4 | act5 | --- | --- | + * -----+----------+------+------+------+------+ + * A call to Fn1 will result in: + * act4, act1 + * A call to Fn2 will result in: + * act5, act3 + * A call to Fn3 will result in: + * act2 + * A call to Fn4 will result in: + * act6 + * + * Notice that when the base function is overridden it is not called. 
The base + * function is only called when the function has not been overridden by a + * component. * */ @@ -34,6 +65,10 @@ #include "orte/types.h" #include "opal/mca/mca.h" +#include "opal/mca/base/base.h" + +#include "opal/class/opal_object.h" +#include "opal/util/output.h" #include "opal/util/error.h" #include "orte/runtime/orte_globals.h" @@ -54,16 +89,98 @@ BEGIN_C_DECLS orte_errmgr_base_log(n, __FILE__, __LINE__) /** - * This is not part of any - * module so it can be used at any time! + * This is not part of any module so it can be used at any time! */ ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, int line); +/** + * Module initialization function. + * Public interface. Will be call in each of the active composite components + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_module_init_fn_t) + (void); + +/** + * Module finalization function. + * Public interface. Will be call in each of the active composite components + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_module_finalize_fn_t) + (void); /* - * Component functions - all MUST be provided! + * Internal Composite Interfaces */ +/** + * Predicted process/node failure notification + * Composite interface. Called in priority order. + * + * @param[in] proc_list List of processes (or NULL if none) + * @param[in] node_list List of nodes (or NULL if none) + * @param[in] suggested_nodes List of suggested nodes to use on recovery (or NULL if none) + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_predicted_fault_fn_t) + (char ***proc_list, char ***node_list, char ***suggested_nodes); + +/** + * Actual process failure notification + * Composite interface. 
Called in priority order. + * + * @param[in] proc_name Name of the failed processes + * @param[in] state State of the failed process + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_process_fault_fn_t) + (orte_job_t *jdata, orte_process_name_t *proec_name, orte_proc_state_t state, int *stack_state); + +/** + * Suggest a node to map a restarting process onto + * Composite interface. Called in priority order. + * + * @param[in] proc Process that is being mapped + * @param[in] oldnode Previous node where this process resided + * @param[in|out] node_list List of nodes to select from + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_suggest_map_targets_fn_t) + (orte_proc_t *proc, orte_node_t *oldnode, opal_list_t *node_list); + +/** + * Handle fault tolerance updates + * + * @param[in] state Fault tolerance state update + * + * @retval ORTE_SUCCESS The operation completed successfully + * @retval ORTE_ERROR An unspecifed error occurred + */ +typedef int (*orte_errmgr_base_ft_event_fn_t)(int state); + + +/* + * External API Functions - Implemented in errmgr/base/errmgr_base_fns.c + */ + +ORTE_DECLSPEC int orte_errmgr_base_predicted_fault(char ***proc_list, + char ***node_list, + char ***suggested_nodes); +ORTE_DECLSPEC int orte_errmgr_base_suggest_map_targets(orte_proc_t *proc, + orte_node_t *oldnode, + opal_list_t *node_list); +ORTE_DECLSPEC int orte_errmgr_base_ft_event(int state); + /** * Alert - process aborted @@ -79,7 +196,8 @@ ORTE_DECLSPEC extern void orte_errmgr_base_log(int error_code, char *filename, i * @retval ORTE_SUCCESS Whatever action that was taken was successful * @retval ORTE_ERROR Appropriate error code */ -typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code); +ORTE_DECLSPEC extern int 
orte_errmgr_base_proc_aborted(orte_process_name_t *name, int exit_code); +typedef int (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *name, int exit_code); /** * Alert - incomplete start of a job @@ -101,28 +219,8 @@ typedef void (*orte_errmgr_base_module_proc_aborted_fn_t)(orte_process_name_t *n * @retval ORTE_SUCCESS Whatever action that was taken was successful * @retval ORTE_ERROR Appropriate error code */ -typedef void (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code); - -/* - * Register a job with the error manager - * When a job is launched, this function is called so the error manager can register - * subscriptions on the job segment so that the error manager will be notified when - * problems occur - i.e., when process status entries change to abnormal termination - * values. Process status entries are changed by the appropriate state monitor - * and/or the process launcher, depending upon the stage at which the problem occurs. - * - * Monitoring of the job begins once the job has reached the "executing" stage. Prior - * to that time, failure of processes to start are the responsibility of the respective - * process launcher - which is expected to call the error manager via the "incomplete - * start" interface to report any problems prior to the job beginning "execution". - * - * NOTE: ONLY HNPs are allowed to register for trigger reports. All other components - * MUST do nothing but return ORTE_SUCCESS. 
- */ -typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job, - orte_proc_state_t state, - orte_err_cb_fn_t cbfunc, - void *cbdata); +ORTE_DECLSPEC extern int orte_errmgr_base_incomplete_start(orte_jobid_t job, int exit_code); +typedef int (*orte_errmgr_base_module_incomplete_start_fn_t)(orte_jobid_t job, int exit_code); /** * Alert - self aborting @@ -131,48 +229,85 @@ typedef int (*orte_errmgr_base_module_register_cb_fn_t)(orte_jobid_t job, * itself, and then exit - it takes no other actions. The intent here is to provide * a last-ditch exit procedure that attempts to clean up a little. */ -typedef void (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) __opal_attribute_noreturn__ +ORTE_DECLSPEC extern int orte_errmgr_base_abort(int error_code, char *fmt, ...) +# if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR + __opal_attribute_format__(__printf__, 2, 3) +# endif + ; +typedef int (*orte_errmgr_base_module_abort_fn_t)(int error_code, char *fmt, ...) # if OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR __opal_attribute_format__(__printf__, 2, 3) # endif ; +/** + * If the communication link failed to a peer. + * This gives us a chance to recover from this error, or abort. 
+ */ +ORTE_DECLSPEC extern int orte_errmgr_base_comm_failed(orte_process_name_t *name, int exit_code); +typedef int (*orte_errmgr_base_module_comm_failed_fn_t)(orte_process_name_t *name, + int exit_code); + /* - * + * Module Structure */ struct orte_errmgr_base_module_2_3_0_t { + /* ---- Previous Interfaces (Always call base) -- */ orte_errmgr_base_module_proc_aborted_fn_t proc_aborted; orte_errmgr_base_module_incomplete_start_fn_t incomplete_start; - orte_errmgr_base_module_register_cb_fn_t register_callback; + orte_errmgr_base_module_comm_failed_fn_t comm_failed; orte_errmgr_base_module_abort_fn_t abort; + + /* -------------- Internal Composite Interfaces -- */ + /** Initialization Function */ + orte_errmgr_base_module_init_fn_t internal_errmgr_init; + /** Finalization Function */ + orte_errmgr_base_module_finalize_fn_t internal_errmgr_finalize; + + /** Predicted process/node failure notification */ + orte_errmgr_base_predicted_fault_fn_t internal_predicted_fault; + /** Actual process failure notification */ + orte_errmgr_base_process_fault_fn_t internal_process_fault; + /** Suggest a node to map a restarting process onto */ + orte_errmgr_base_suggest_map_targets_fn_t internal_suggest_map_targets; + + /** Handle any FT Notifications */ + orte_errmgr_base_ft_event_fn_t internal_ft_event; }; typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t; typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t; /* - * ERRMGR Component - * the standard component data structure + * ErrMgr Component */ -struct mca_errmgr_base_component_2_0_0_t { +struct orte_errmgr_base_component_3_0_0_t { + /** MCA base component */ mca_base_component_t base_version; + /** MCA base data */ mca_base_component_data_t base_data; + + /** Verbosity Level */ + int verbose; + /** Output Handle for opal_output */ + int output_handle; + /** Default Priority */ + int priority; }; -typedef struct mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_2_0_0_t; 
-typedef mca_errmgr_base_component_2_0_0_t mca_errmgr_base_component_t; - +typedef struct orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_3_0_0_t; +typedef orte_errmgr_base_component_3_0_0_t orte_errmgr_base_component_t; +/* + * Global structure for accessing previous error manager functions + */ +ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* * Macro for use in components that are of type errmgr */ -#define ORTE_ERRMGR_BASE_VERSION_2_0_0 \ +#define ORTE_ERRMGR_BASE_VERSION_3_0_0 \ MCA_BASE_VERSION_2_0_0, \ - "errmgr", 2, 0, 0 - -/* Global structure for accessing error manager functions - */ -ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr; /* holds selected module's function pointers */ + "errmgr", 3, 0, 0 END_C_DECLS diff --git a/orte/mca/errmgr/orcm/Makefile.am b/orte/mca/errmgr/orcm/Makefile.am new file mode 100644 index 0000000000..0f841db255 --- /dev/null +++ b/orte/mca/errmgr/orcm/Makefile.am @@ -0,0 +1,37 @@ +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-orte-errmgr-orcm.txt + +sources = \ + errmgr_orcm.h \ + errmgr_orcm_component.c \ + errmgr_orcm_module.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if OMPI_BUILD_errmgr_orcm_DSO +component_noinst = +component_install = mca_errmgr_orcm.la +else +component_noinst = libmca_errmgr_orcm.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_errmgr_orcm_la_SOURCES = $(sources) +mca_errmgr_orcm_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_errmgr_orcm_la_SOURCES = $(sources) +libmca_errmgr_orcm_la_LDFLAGS = -module -avoid-version diff --git a/orte/mca/errmgr/orcm/configure.m4 b/orte/mca/errmgr/orcm/configure.m4 new file mode 100644 index 0000000000..9545607d94 --- /dev/null +++ b/orte/mca/errmgr/orcm/configure.m4 @@ -0,0 +1,19 @@ +# -*- shell-script -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_errmgr_orcm_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_errmgr_orcm_CONFIG],[ + # If we don't want FT, don't compile this component + AS_IF([test "$ompi_want_ft" = "1"], + [$1], + [$2]) +])dnl diff --git a/orte/mca/errmgr/orcm/configure.params b/orte/mca/errmgr/orcm/configure.params new file mode 100644 index 0000000000..f9bc6702dc --- /dev/null +++ b/orte/mca/errmgr/orcm/configure.params @@ -0,0 +1,13 @@ +# -*- shell-script -*- +# +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +PARAM_INIT_FILE=errmgr_orcm_component.c +PARAM_CONFIG_FILES="Makefile" diff --git a/orte/mca/errmgr/orcm/errmgr_orcm.h b/orte/mca/errmgr/orcm/errmgr_orcm.h new file mode 100644 index 0000000000..1905ec95d8 --- /dev/null +++ b/orte/mca/errmgr/orcm/errmgr_orcm.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. 
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+/**
+ * @file
+ *
+ */
+
+#ifndef MCA_ERRMGR_ORCM_EXPORT_H
+#define MCA_ERRMGR_ORCM_EXPORT_H
+
+#include "orte_config.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+
+BEGIN_C_DECLS
+
+/*
+ * Local Component structures
+ */
+
+ORTE_MODULE_DECLSPEC extern orte_errmgr_base_component_t mca_errmgr_orcm_component;
+
+ORTE_DECLSPEC extern orte_errmgr_base_module_t orte_errmgr_orcm_module;
+
+END_C_DECLS
+
+#endif /* MCA_ERRMGR_ORCM_EXPORT_H */
diff --git a/orte/mca/errmgr/orcm/errmgr_orcm_component.c b/orte/mca/errmgr/orcm/errmgr_orcm_component.c
new file mode 100644
index 0000000000..a0d70466b8
--- /dev/null
+++ b/orte/mca/errmgr/orcm/errmgr_orcm_component.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+#include "opal/util/output.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/errmgr/base/base.h"
+#include "errmgr_orcm.h"
+
+/*
+ * Public string for version number
+ */
+const char *orte_errmgr_orcm_component_version_string =
+    "ORTE ERRMGR orcm MCA component version " ORTE_VERSION;
+
+/*
+ * Local functionality
+ */
+static int errmgr_orcm_open(void);
+static int errmgr_orcm_close(void);
+static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority);
+
+/*
+ * Instantiate the public struct with all of our public information
+ * and pointer to our public functions in it
+ */
+orte_errmgr_base_component_t mca_errmgr_orcm_component =
+{
+    /* Handle the general mca_component_t struct containing
+     * meta information about the component itself
+     */
+    {
+        ORTE_ERRMGR_BASE_VERSION_3_0_0,
+        /* Component name and version */
+        "orcm",
+        ORTE_MAJOR_VERSION,
+        ORTE_MINOR_VERSION,
+        ORTE_RELEASE_VERSION,
+
+        /* Component open and close functions */
+        errmgr_orcm_open,
+        errmgr_orcm_close,
+        errmgr_orcm_component_query
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    /* Verbosity level */
+    0,
+    /* opal_output handler */
+    -1,
+    /* Default priority */
+    1
+};
+
+/* Component open hook: nothing to set up, always reports ORTE_SUCCESS. */
+static int errmgr_orcm_open(void)
+{
+    return ORTE_SUCCESS;
+}
+
+/* Component close hook: nothing to release, always reports ORTE_SUCCESS. */
+static int errmgr_orcm_close(void)
+{
+    return ORTE_SUCCESS;
+}
+
+/*
+ * Component query hook: offers the orcm module at priority 1000 when
+ * mca_base_is_component_required() reports it as required through
+ * is_required; otherwise declines with ORTE_ERROR, priority 0, NULL module.
+ */
+static int errmgr_orcm_component_query(mca_base_module_t **module, int *priority)
+{
+    /*
+     * This component is selected only when requested - and if so, then
+     * it MUST be used exclusively
+     */
+    bool is_required = false;
+
+    mca_base_is_component_required(&orte_errmgr_base_components_available,
+                                   &mca_errmgr_orcm_component.base_version,
+                                   true,
+                                   &is_required);
+
+    if( !is_required ) {
+        *priority = 0;
+        *module = NULL;
+        return ORTE_ERROR;
+    }
+
+    *priority = 1000;
+    *module = (mca_base_module_t *)&orte_errmgr_orcm_module;
+    return ORTE_SUCCESS;
+}
+
diff --git a/orte/mca/errmgr/orcm/errmgr_orcm_module.c b/orte/mca/errmgr/orcm/errmgr_orcm_module.c
new file mode 100644
index 0000000000..dce878f3eb
--- /dev/null
+++ b/orte/mca/errmgr/orcm/errmgr_orcm_module.c
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2009-2010 The Trustees of Indiana University.
+ * All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "orte_config.h"
+
+#include <sys/types.h> /* NOTE(review): header name missing from this line in the patch; sys/types.h assumed - confirm */
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif /* HAVE_UNISTD_H */
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#include "opal/util/output.h"
+#include "opal/util/opal_environ.h"
+#include "opal/util/basename.h"
+#include "opal/util/argv.h"
+#include "opal/mca/mca.h"
+#include "opal/mca/base/base.h"
+#include "opal/mca/base/mca_base_param.h"
+#include "opal/mca/crs/crs.h"
+#include "opal/mca/crs/base/base.h"
+
+#include "orte/util/name_fns.h"
+#include "orte/util/proc_info.h"
+#include "orte/runtime/orte_globals.h"
+#include "opal/dss/dss.h"
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/rml/rml.h"
+#include "orte/mca/rml/rml_types.h"
+#include "orte/mca/plm/plm.h"
+#include "orte/mca/plm/base/base.h"
+#include "orte/mca/plm/base/plm_private.h"
+#include "orte/mca/filem/filem.h"
+#include "orte/mca/grpcomm/grpcomm.h"
+#include "orte/runtime/orte_wait.h"
+#include "orte/mca/rmaps/rmaps_types.h"
+#include "orte/mca/routed/routed.h"
+#include "orte/mca/snapc/snapc.h"
+#include "orte/mca/snapc/base/base.h"
+
+#include "orte/mca/errmgr/errmgr.h"
+#include "orte/mca/errmgr/base/base.h"
+
+#include "errmgr_orcm.h"
+
+
+/*
+ * Module functions: Global
+ */
+static int init(void);
+static int finalize(void);
+
+static int predicted_fault(char ***proc_list,
+                           char ***node_list,
+                           char ***suggested_nodes);
+
+static int process_fault(orte_job_t *jdata,
+                         orte_process_name_t *proc_name,
+                         orte_proc_state_t state,
+                         int *stack_state);
+
+static int suggest_map_targets(orte_proc_t *proc,
+                               orte_node_t *oldnode,
+                               opal_list_t *node_list);
+
+static int ft_event(int state);
+
+
+
+/******************
+ * ORCM module
+ ******************/
+orte_errmgr_base_module_t orte_errmgr_orcm_module = {
+    NULL, /* proc_aborted (old interface) */
+    NULL, /* incomplete_start (old interface) */
+    NULL, /* comm_failed (old interface) */
+    NULL, /* abort (old interface) */
+    init,
+    finalize,
+    predicted_fault,
+    process_fault,
+    suggest_map_targets,
+    ft_event
+};
+
+/************************
+ * API Definitions
+ ************************/
+/* Module init hook: there is no module state to set up. */
+static int init(void)
+{
+    return ORTE_SUCCESS;
+}
+
+/* Module finalize hook: there is no module state to tear down. */
+static int finalize(void)
+{
+    return ORTE_SUCCESS;
+}
+
+/* Fault prediction is not implemented by this module. */
+static int predicted_fault(char ***proc_list,
+                           char ***node_list,
+                           char ***suggested_nodes)
+{
+    return ORTE_ERR_NOT_IMPLEMENTED;
+}
+
+/*
+ * Handle a reported process fault by restarting whatever was lost.
+ *
+ * jdata       - job data object in which the failed proc is looked up
+ * proc        - name of the failed process
+ * state       - proc state reported with the fault
+ * stack_state - in/out composite stack state; its JOB_ABORT bit is toggled
+ *
+ * An application proc is respawned through orte_plm.spawn() unless it
+ * failed to start, was killed by command, or has used up
+ * jdata->max_restarts.  When a daemon fails, every job that had procs on
+ * the daemon's node is reset and respawned.
+ *
+ * Returns ORTE_SUCCESS when the fault was dealt with (restarted, or
+ * deliberately left alone), ORTE_ERR_NOT_FOUND when the proc object is
+ * missing, and ORTE_ERROR when a lookup or a respawn fails.
+ */
+static int process_fault(orte_job_t *jdata,
+                         orte_process_name_t *proc,
+                         orte_proc_state_t state,
+                         int *stack_state)
+{
+    orte_job_t *jnew;
+    orte_proc_t *pdata;
+    orte_app_context_t *app=NULL;
+    const char *app_name;
+    orte_node_t *node, *newnode;
+    orte_proc_t *daemon, *nodeproc;
+    opal_value_array_t jobs;
+    bool found;
+    int rc = ORTE_SUCCESS;
+    int i;
+    size_t j;
+
+    /* NOTE(review): XOR flips the JOB_ABORT bit, so it is only cleared if
+     * the caller set it before calling us - confirm that is the contract
+     */
+    *stack_state ^= ORTE_ERRMGR_STACK_STATE_JOB_ABORT;
+
+    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
+                         "errmgr:orcm:process_fault() "
+                         "------- %s fault reported! proc %s (0x%x)",
+                         (proc->jobid == ORTE_PROC_MY_NAME->jobid ? "Daemon" : "App. Process"),
+                         ORTE_NAME_PRINT(proc),
+                         state ));
+    /* get the app - just for output purposes in case of error */
+    app = opal_pointer_array_get_item(jdata->apps, 0);
+    /* slot 0 may be empty - never dereference a NULL app in the messages */
+    app_name = (NULL != app) ? app->app : "(unknown)";
+
+    /* Remove the route to this process since it is dead */
+    orte_routed.delete_route(proc);
+
+    /**** NON-DAEMON PROC FAILED ****/
+    if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
+        /* if the proc failed to start or we killed it by cmd,
+         * don't attempt to restart it as this can lead to an
+         * infinite loop
+         */
+        if (ORTE_PROC_STATE_FAILED_TO_START == state) {
+            opal_output(0, "APPLICATION %s FAILED TO START", app_name);
+            return ORTE_SUCCESS;
+        }
+
+        /* if the proc was terminated by cmd, then do nothing */
+        if (ORTE_PROC_STATE_KILLED_BY_CMD == state) {
+            opal_output(0, "APPLICATION %s KILLED BY COMMAND", app_name);
+            return ORTE_SUCCESS;
+        }
+
+        /* get the proc_t object for this process */
+        pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
+        if (NULL == pdata) {
+            opal_output(0, "Data for proc %s could not be found", ORTE_NAME_PRINT(proc));
+            return ORTE_ERR_NOT_FOUND;
+        }
+        /* proc just died - save the node where this proc was located */
+        node = pdata->node;
+        /* increment restarts */
+        pdata->restarts++;
+        /* have we exceeded #restarts? */
+        if (jdata->max_restarts < pdata->restarts) {
+            opal_output(0, "Max restarts for proc %s of app %s has been exceeded - process will not be restarted",
+                        ORTE_NAME_PRINT(proc), app_name);
+            return ORTE_SUCCESS;
+        }
+        /* reset the job params for restart */
+        orte_plm_base_reset_job(jdata);
+
+        /* restart the job - the spawn function will remap and
+         * launch the replacement proc(s)
+         */
+        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
+                             "%s RESTARTING APP: %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_NAME_PRINT(proc)));
+
+        if (ORTE_SUCCESS != orte_plm.spawn(jdata)) {
+            opal_output(0, "FAILED TO RESTART APP %s", app_name);
+            orte_trigger_event(&orte_exit);
+            return ORTE_ERROR;
+        }
+        /* get the new node */
+        newnode = pdata->node;
+        /* report what we did */
+        opal_output(0, "Proc %s:%s aborted on node %s and was restarted on node %s\n\n",
+                    app_name, ORTE_NAME_PRINT(proc), node->name, newnode->name);
+
+
+        return ORTE_SUCCESS;
+    }
+
+    /**** DAEMON FAILED ****/
+    /* the branch above always returns, so anything that gets here is a
+     * failed daemon (same jobid as ours) and is treated differently
+     */
+    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
+                         "%s Daemon %s failed",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         ORTE_VPID_PRINT(proc->vpid)));
+    /* need to relaunch all the apps that were on
+     * the node where this daemon was running as
+     * they either died along with the node, or will
+     * have self-terminated when the daemon died
+     */
+    if (NULL == (daemon = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
+        /* nothing we can do - abort things */
+        opal_output(0, "FAILED TO GET DAEMON OBJECT");
+        return ORTE_ERROR;
+    }
+    /* flag the daemon state to indicate it terminated - this will
+     * cause the daemon to be restarted IF required for starting
+     * procs on that node
+     */
+    daemon->state = ORTE_PROC_STATE_ABORTED;
+    /* identify the node where the daemon was running */
+    node = daemon->node;
+    /* release the contact info, if not already done */
+    if (NULL != daemon->rml_uri) {
+        free(daemon->rml_uri);
+        daemon->rml_uri = NULL;
+    }
+    /* setup to track the jobs on this node */
+    OBJ_CONSTRUCT(&jobs, opal_value_array_t);
+    opal_value_array_init(&jobs, sizeof(orte_jobid_t));
+    /* cycle through the node's procs */
+    for (i=0; i < node->procs->size; i++) {
+        if (NULL == (nodeproc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
+            continue;
+        }
+        /* set the proc to abnormally terminated */
+        nodeproc->state = ORTE_PROC_STATE_ABORTED;
+        /* increment restarts */
+        nodeproc->restarts++;
+        /* check if this proc's jobid is already in array */
+        found = false;
+        for (j=0; j < opal_value_array_get_size(&jobs); j++) {
+            if (nodeproc->name.jobid == OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)) {
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            /* add it */
+            opal_value_array_append_item(&jobs, &nodeproc->name.jobid);
+        }
+    }
+    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
+                         "%s RESTARTING APPS FROM NODE: %s",
+                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                         node->name));
+    for (j=0; j < opal_value_array_get_size(&jobs); j++) {
+        if (NULL == (jnew = orte_get_job_data_object(OPAL_VALUE_ARRAY_GET_ITEM(&jobs, orte_jobid_t, j)))) {
+            /* nothing we can do - abort things */
+            opal_output(0, "FAILED TO GET JOB OBJECT TO BE RESTARTED");
+            rc = ORTE_ERROR;
+            goto cleanup;
+        }
+        /* reset the job params for restart */
+        orte_plm_base_reset_job(jnew);
+        /* restart the job - the spawn function will remap and
+         * launch the replacement proc(s)
+         */
+        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_output,
+                             "%s RESTARTING JOB %s",
+                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
+                             ORTE_JOBID_PRINT(jnew->jobid)));
+        if (ORTE_SUCCESS != orte_plm.spawn(jnew)) {
+            opal_output(0, "FAILED TO RESTART APPS FROM NODE: %s", node->name);
+            rc = ORTE_ERROR;
+            goto cleanup;
+        }
+    }
+    opal_output(0, "Daemon %s on node %s aborted - procs were restarted elsewhere\n\n",
+                ORTE_NAME_PRINT(proc), node->name);
+
+ cleanup:
+    /* every exit taken after OBJ_CONSTRUCT releases the job-id array */
+    OBJ_DESTRUCT(&jobs);
+    return rc;
+}
+
+/* Map-target suggestions are not implemented by this module. */
+static int suggest_map_targets(orte_proc_t *proc,
+                               orte_node_t *oldnode,
+                               opal_list_t *node_list)
+{
+    return ORTE_ERR_NOT_IMPLEMENTED;
+}
+
+/* Fault-tolerance event hook: takes no action for any state. */
+static int ft_event(int state)
+{
+    return ORTE_SUCCESS;
+}
+
+/*****************
+ * Local Functions
+ *****************/
diff --git a/orte/mca/errmgr/orcm/help-orte-errmgr-orcm.txt b/orte/mca/errmgr/orcm/help-orte-errmgr-orcm.txt
new file mode 100644
index 0000000000..c6d43f1f77
--- /dev/null
+++ b/orte/mca/errmgr/orcm/help-orte-errmgr-orcm.txt
@@ -0,0 +1,14 @@
+ -*- text -*-
+#
+# Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana
+# University Research and Technology
+# Corporation. All rights reserved.
+#
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+# This is the US/English general help file for the ORTE ErrMgr orcm component.
+#
diff --git a/orte/mca/ess/cm/ess_cm_module.c b/orte/mca/ess/cm/ess_cm_module.c
index 0baf0acf98..3f2f469c51 100644
--- a/orte/mca/ess/cm/ess_cm_module.c
+++ b/orte/mca/ess/cm/ess_cm_module.c
@@ -3,6 +3,9 @@
  * Copyright (c) 2009 The University of Tennessee and The University
  * of Tennessee Research Foundation. All rights
  * reserved.
+ * Copyright (c) 2010 The Trustees of Indiana University and Indiana
+ * University Research and Technology
+ * Corporation. All rights reserved.
* * $COPYRIGHT$ * @@ -72,7 +75,6 @@ orte_ess_base_module_t orte_ess_cm_module = { proc_get_node_rank, update_pidmap, update_nidmap, - orte_ess_base_query_sys_info, NULL /* ft_event */ }; diff --git a/orte/mca/ess/env/ess_env_module.c b/orte/mca/ess/env/ess_env_module.c index 6babafea2e..342f27dee3 100644 --- a/orte/mca/ess/env/ess_env_module.c +++ b/orte/mca/ess/env/ess_env_module.c @@ -447,6 +447,10 @@ static int rte_ft_event(int state) } /******** Continue Recovery ********/ else if (OPAL_CRS_CONTINUE == state ) { + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, + "ess:env ft_event(%2d) - %s is Continuing", + state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* * Notify RML -> OOB */ @@ -476,6 +480,10 @@ static int rte_ft_event(int state) } /******** Restart Recovery ********/ else if (OPAL_CRS_RESTART == state ) { + OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, + "ess:env ft_event(%2d) - %s is Restarting", + state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); + /* * This should follow the ess init() function */ @@ -583,6 +591,13 @@ static int rte_ft_event(int state) goto cleanup; } + /* if one was provided, build my nidmap */ + if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { + ORTE_ERROR_LOG(ret); + exit_status = ret; + goto cleanup; + } + /* * Notify SnapC */ @@ -592,12 +607,6 @@ static int rte_ft_event(int state) goto cleanup; } - /* if one was provided, build my nidmap */ - if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { - ORTE_ERROR_LOG(ret); - exit_status = ret; - goto cleanup; - } } else if (OPAL_CRS_TERM == state ) { /* Nothing */ diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index f75c250566..af8058c1f5 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -1873,7 +1873,37 @@ int mca_oob_tcp_set_addr(const orte_process_name_t* name, const char* uri) struct sockaddr_storage inaddr; mca_oob_tcp_addr_t* addr = NULL; mca_oob_tcp_peer_t* peer = NULL; + 
opal_list_item_t *item; int rc; + + if (NULL == uri) { + /* purge the hash table entry for this proc */ + OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + /* get the peer object */ + opal_hash_table_get_value_uint64(&mca_oob_tcp_component.tcp_peers, + orte_util_hash_name(name), + (void**)&peer); + if (NULL != peer) { + OPAL_THREAD_LOCK(&peer->peer_lock); + /* flag the state as closed */ + peer->peer_state = MCA_OOB_TCP_CLOSED; + /* clear any pending sends */ + while (NULL != (item = opal_list_remove_first(&peer->peer_send_queue))) { + OBJ_RELEASE(item); + } + peer->peer_send_msg = NULL; + /* clear any pending recvs */ + peer->peer_recv_msg = NULL; + OPAL_THREAD_UNLOCK(&peer->peer_lock); + } + /* delete the entry from the hash table */ + opal_hash_table_set_value_uint64(&mca_oob_tcp_component.tcp_peer_names, + orte_util_hash_name(name), NULL); + /* all done */ + OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); + return ORTE_SUCCESS; + } + if((rc = mca_oob_tcp_parse_uri(uri, (struct sockaddr*) &inaddr)) != ORTE_SUCCESS) { return rc; } diff --git a/orte/mca/oob/tcp/oob_tcp_peer.c b/orte/mca/oob/tcp/oob_tcp_peer.c index 993982adeb..466fc1ddf7 100644 --- a/orte/mca/oob/tcp/oob_tcp_peer.c +++ b/orte/mca/oob/tcp/oob_tcp_peer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2006 The University of Tennessee and The University @@ -532,7 +532,9 @@ static void mca_oob_tcp_peer_complete_connect(mca_oob_tcp_peer_t* peer, int sd) so_error); } mca_oob_tcp_peer_shutdown(peer); - opal_evtimer_add(&peer->peer_timer_event, &tv); + if( MCA_OOB_TCP_FAILED != peer->peer_state ) { + opal_evtimer_add(&peer->peer_timer_event, &tv); + } return; } else if(so_error != 0) { /* No need to worry about the return code here - we return regardless @@ -595,6 +597,8 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) peer->peer_state); } + mca_oob_tcp_peer_shutdown(peer); + /* inform the routed framework that we have lost a connection so * it can decide if this is important, what to do about it, etc. */ @@ -606,8 +610,6 @@ void mca_oob_tcp_peer_close(mca_oob_tcp_peer_t* peer) OPAL_THREAD_UNLOCK(&peer->peer_lock); orte_errmgr.abort(1, NULL); } - - mca_oob_tcp_peer_shutdown(peer); } void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) @@ -646,18 +648,6 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) not likely to suddenly become successful, so abort the whole thing */ peer->peer_state = MCA_OOB_TCP_FAILED; - - /* since we cannot communicate, and the system obviously needed - * to do so, let's abort so we don't just hang here - */ - if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { - /* just wake us up */ - ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); - orte_abnormal_term_ordered = true; - orte_trigger_event(&orte_exit); - } else { - orte_errmgr.abort(1, NULL); - } } if (peer->peer_sd >= 0) { @@ -669,7 +659,9 @@ void mca_oob_tcp_peer_shutdown(mca_oob_tcp_peer_t* peer) } opal_event_del(&peer->peer_timer_event); - peer->peer_state = MCA_OOB_TCP_CLOSED; + if( MCA_OOB_TCP_FAILED != peer->peer_state ) { + peer->peer_state = MCA_OOB_TCP_CLOSED; + } } /* diff --git a/orte/mca/plm/base/plm_base_launch_support.c b/orte/mca/plm/base/plm_base_launch_support.c index bb555cc4fa..a494300812 100644 --- 
a/orte/mca/plm/base/plm_base_launch_support.c +++ b/orte/mca/plm/base/plm_base_launch_support.c @@ -47,7 +47,7 @@ #include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/odls/odls.h" #if OPAL_ENABLE_FT_CR == 1 -#include "orte/mca/snapc/snapc.h" +#include "orte/mca/snapc/base/base.h" #endif #include "orte/mca/filem/filem.h" #include "orte/mca/filem/base/base.h" @@ -217,12 +217,15 @@ int orte_plm_base_setup_job(orte_job_t *jdata) ***/ #if OPAL_ENABLE_FT_CR == 1 + /* JJH: Would it be useful to let the errmgr know what we are doing here? */ /* * Notify the Global SnapC component regarding new job */ - if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) { - /* Silent Failure :/ JJH */ - ORTE_ERROR_LOG(rc); + if (ORTE_JOB_STATE_RESTART != jdata->state) { + if( ORTE_SUCCESS != (rc = orte_snapc.setup_job(jdata->jobid) ) ) { + /* Silent Failure :/ JJH */ + ORTE_ERROR_LOG(rc); + } } #endif @@ -1388,7 +1391,8 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) * an error unless it was specifically commanded */ if (jdata->state < ORTE_JOB_STATE_TERMINATED || - jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP) { + jdata->controls & ORTE_JOB_CONTROL_CONTINUOUS_OP || + jdata->controls & ORTE_JOB_CONTROL_RECOVERABLE) { for (i=0; i < jdata->procs->size; i++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { /* the proc array may no longer be left justified, so @@ -1396,6 +1400,10 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) */ continue; } + + /* + * Determine how the process state affects the job state + */ if (ORTE_PROC_STATE_FAILED_TO_START == proc->state) { jdata->state = ORTE_JOB_STATE_FAILED_TO_START; if (!jdata->abort) { @@ -1406,7 +1414,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } - break; } else if (ORTE_PROC_STATE_ABORTED == proc->state) { jdata->state = ORTE_JOB_STATE_ABORTED; if (!jdata->abort) { @@ -1417,7 +1424,6 
@@ void orte_plm_base_check_job_completed(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } - break; } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state) { jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG; if (!jdata->abort) { @@ -1428,7 +1434,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) jdata->abort = true; ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } - break; } else if (ORTE_PROC_STATE_TERM_WO_SYNC == proc->state) { jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC; if (!jdata->abort) { @@ -1445,7 +1450,6 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) */ ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); } - break; } else if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) { /* we ordered this proc to die, so it isn't an abnormal termination * and we don't flag it as such - just check the remaining jobs to @@ -1471,6 +1475,30 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) ORTE_UPDATE_EXIT_STATUS(proc->exit_code); } } + + /* + * Call the errmgr for this process, if necessary + */ + if (ORTE_PROC_STATE_ABORTED == proc->state || + ORTE_PROC_STATE_ABORTED_BY_SIG == proc->state || + ORTE_PROC_STATE_TERM_WO_SYNC == proc->state || + ORTE_PROC_STATE_KILLED_BY_CMD == proc->state ) { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed " + "Declared job %s %s by proc %s with code %d (0x%x vs 0x%x)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(jdata->jobid), + (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ? 
+ "killed by cmd" : "aborted"), + ORTE_NAME_PRINT(&(proc->name)), + proc->exit_code, + proc->last_errmgr_state, proc->state)); + /* Only report escalations in the fault state */ + if( proc->last_errmgr_state < proc->state ) { + proc->last_errmgr_state = proc->state; + orte_errmgr.proc_aborted(&(proc->name), proc->exit_code); + } + } } } @@ -1490,21 +1518,16 @@ void orte_plm_base_check_job_completed(orte_job_t *jdata) orte_errmgr.incomplete_start(jdata->jobid, jdata->aborted_proc->exit_code); } goto CHECK_ALL_JOBS; - } else if (ORTE_JOB_STATE_ABORTED == jdata->state || - ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state || - ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state) { - OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, - "%s plm:base:check_job_completed declared job %s aborted by proc %s with code %d", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), - ORTE_JOBID_PRINT(jdata->jobid), - (NULL == jdata->aborted_proc) ? "unknown" : ORTE_NAME_PRINT(&(jdata->aborted_proc->name)), - (NULL == jdata->aborted_proc) ? 
ORTE_ERROR_DEFAULT_EXIT_CODE : jdata->aborted_proc->exit_code)); - /* report this to the errmgr */ + } else if (ORTE_JOB_STATE_ABORTED == jdata->state || + ORTE_JOB_STATE_ABORTED_BY_SIG == jdata->state || + ORTE_JOB_STATE_ABORTED_WO_SYNC == jdata->state || + ORTE_JOB_STATE_KILLED_BY_CMD == jdata->state ) { + /* report this to the errmgr + * (if we know which process caused this, then it was reported above) + */ if (NULL == jdata->aborted_proc) { /* we don't know who caused us to abort */ orte_errmgr.proc_aborted(ORTE_NAME_INVALID, ORTE_ERROR_DEFAULT_EXIT_CODE); - } else { - orte_errmgr.proc_aborted(&(jdata->aborted_proc->name), jdata->aborted_proc->exit_code); } goto CHECK_ALL_JOBS; } else if (jdata->num_terminated >= jdata->num_procs) { @@ -1521,7 +1544,9 @@ CHECK_ALL_JOBS: /* if this job is a continuously operating one, then don't do * anything further - just return here */ - if (NULL != jdata && ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls) { + if (NULL != jdata && + (ORTE_JOB_CONTROL_CONTINUOUS_OP & jdata->controls || + ORTE_JOB_CONTROL_RECOVERABLE & jdata->controls) ) { goto CHECK_ALIVE; } @@ -1634,6 +1659,13 @@ CHECK_ALIVE: ORTE_JOBID_PRINT(job->jobid))); one_still_alive = true; } + else { + OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, + "%s plm:base:check_job_completed job %s is terminated (%d vs %d [0x%x])", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_JOBID_PRINT(job->jobid), + job->num_terminated, job->num_procs, jdata->state )); + } } /* if a job is still alive, we just return */ if (one_still_alive) { diff --git a/orte/mca/plm/base/plm_base_rsh_support.c b/orte/mca/plm/base/plm_base_rsh_support.c index 0b21b2cc2a..2bcdae8853 100644 --- a/orte/mca/plm/base/plm_base_rsh_support.c +++ b/orte/mca/plm/base/plm_base_rsh_support.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. 
All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -1343,7 +1343,9 @@ void orte_plm_base_reset_job(orte_job_t *jdata) int n, i, j; orte_proc_t *proc, *proc_from_node; orte_node_t *node_from_map, *node; - + orte_odls_job_t *jobdat = NULL; + opal_list_item_t *item = NULL; + /* set the state to restart */ jdata->state = ORTE_JOB_STATE_RESTART; /* cycle through the procs */ @@ -1354,6 +1356,7 @@ void orte_plm_base_reset_job(orte_job_t *jdata) if (ORTE_PROC_STATE_TERMINATED < proc->state) { /* this proc abnormally terminated */ proc->state = ORTE_PROC_STATE_RESTART; + proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF; proc->pid = 0; /* remove the proc from the node upon which it was mapped * @@ -1394,7 +1397,13 @@ void orte_plm_base_reset_job(orte_job_t *jdata) } } /* adjust job accounting */ - jdata->num_terminated--; + if( jdata->num_terminated > 0 ) { + jdata->num_terminated--; + } + else { + OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output, + "plm:base:reset_job() WARNING: Prevented num_terminated from becoming < 0!")); + } } } /* clear the info on who aborted */ @@ -1406,6 +1415,18 @@ void orte_plm_base_reset_job(orte_job_t *jdata) /* since every daemon will be reporting status for every proc, reset these to zero */ jdata->num_launched = 0; jdata->num_reported = 0; + + /* Clean up the orte_odls_job_t structure for this job */ + jobdat = NULL; + for (item = opal_list_get_first(&orte_local_jobdata); + item != opal_list_get_end(&orte_local_jobdata); + item = opal_list_get_next(item)) { + jobdat = (orte_odls_job_t*)item; + if (jobdat->jobid == jdata->jobid) { + jobdat->num_participating = -1; + } + } + /* since we are restarting the failed proc, reset the exit status */ ORTE_RESET_EXIT_STATUS(); } diff --git a/orte/mca/plm/plm_types.h b/orte/mca/plm/plm_types.h index 3892b35405..869a4bd1b2 100644 --- a/orte/mca/plm/plm_types.h +++ b/orte/mca/plm/plm_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana 
University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2008 The University of Tennessee and The University @@ -58,6 +58,7 @@ typedef uint16_t orte_proc_state_t; #define ORTE_PROC_STATE_ABORTED_BY_SIG 0x0400 /* process aborted by signal */ #define ORTE_PROC_STATE_TERM_WO_SYNC 0x0800 /* process exit'd w/o required sync */ #define ORTE_PROC_STATE_KILLED_BY_CMD 0x1000 /* process was killed by ORTE cmd */ +#define ORTE_PROC_STATE_COMM_FAILED 0x2000 /* process communication has failed */ /* diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 3dab425ecc..cf15a31453 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2007 The University of Tennessee and The University @@ -323,8 +323,16 @@ static void orte_plm_rsh_wait_daemon(pid_t pid, int status, void* cbdata) daemon->state = ORTE_PROC_STATE_FAILED_TO_START; /* increment the #daemons terminated so we will exit properly */ jdata->num_terminated++; +#if 0 /* report that the daemon has failed so we can exit */ orte_plm_base_launch_failed(ORTE_PROC_MY_NAME->jobid, pid, status, ORTE_JOB_STATE_FAILED_TO_START); +#else + /* JJH: Look into a better way of doing this. If we let the daemon + * know, then it kills the job when we are trying to restart.. */ + opal_output(0, "%s daemon %s failed. 
SKIPPING orte_plm_base_launch_failed()", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&daemon->name)); +#endif } } diff --git a/orte/mca/rmaps/base/rmaps_base_support_fns.c b/orte/mca/rmaps/base/rmaps_base_support_fns.c index 4e410a8c8d..0f745dafaa 100644 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2008 The University of Tennessee and The University @@ -19,6 +19,10 @@ #include "orte_config.h" #include "orte/constants.h" +#include +#ifdef HAVE_UNISTD_H +#include +#endif /* HAVE_UNISTD_H */ #include #include "opal/util/if.h" @@ -649,7 +653,7 @@ int orte_rmaps_base_define_daemons(orte_job_map_t *map) orte_job_t *daemons; int i; int rc; - + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, "%s rmaps:base:define_daemons", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); @@ -708,7 +712,55 @@ int orte_rmaps_base_define_daemons(orte_job_map_t *map) if (ORTE_VPID_INVALID == map->daemon_vpid_start) { map->daemon_vpid_start = proc->name.vpid; } - } else { + } + /* + * If we are launching on a node where there used to be a daemon, but + * it had previously failed, try to relaunch it. (Daemon Recovery) Do + * this ONLY if there are procs mapped to that daemon! + */ + else if(node->daemon->state > ORTE_PROC_STATE_UNTERMINATED ) { + /* If no processes are to be launched on this node, then exclude it */ + if( 0 >= node->num_procs ) { + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, + "%s rmaps:base:define_daemons Skipping the Recovery of daemon %s [0x%x] Launched: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&node->daemon->name), + node->daemon->state, + (node->daemon_launched ? 
"T" : "F") + )); + /* since this daemon exists but is not needed, then flag it + * as "launched" to avoid relaunching it for no reason + */ + node->daemon_launched = true; + continue; + } + + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, + "%s rmaps:base:define_daemons RECOVERING daemon %s [0x%x] Launched: %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&node->daemon->name), + node->daemon->state, + (node->daemon_launched ? "T" : "F") + )); + + /* flag that the daemon is no longer launched */ + node->daemon_launched = false; + + /* set the state to indicate launch is in progress */ + node->daemon->state = ORTE_PROC_STATE_RESTART; + + free(node->daemon->rml_uri); + node->daemon->rml_uri = NULL; + + OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, + "%s rmaps:base:define_daemons add new daemon %s (Recovering old daemon)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&node->daemon->name))); + + /* track number of daemons to be launched */ + ++map->num_new_daemons; + } + else { /* this daemon was previously defined - flag it */ node->daemon_launched = true; OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base.rmaps_output, diff --git a/orte/mca/rmaps/resilient/rmaps_resilient.c b/orte/mca/rmaps/resilient/rmaps_resilient.c index 0b3039c655..0d4e426849 100644 --- a/orte/mca/rmaps/resilient/rmaps_resilient.c +++ b/orte/mca/rmaps/resilient/rmaps_resilient.c @@ -1,5 +1,8 @@ /* * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
* * $COPYRIGHT$ * @@ -116,7 +119,7 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) int rc; float avgload, minload; orte_node_t *node, *nd=NULL, *oldnode; - orte_rmaps_res_ftgrp_t *ftgrp, *target; + orte_rmaps_res_ftgrp_t *ftgrp, *target = NULL; orte_vpid_t totprocs, lowprocs, num_assigned; FILE *fp; char *ftinput; @@ -195,6 +198,11 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) oldnode = proc->node; /* point to the app */ app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx); + if( NULL == app ) { + ORTE_ERROR_LOG(ORTE_ERROR); + goto error; + } + OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base.rmaps_output, "%s rmaps:resilient: proc %s from node %s is to be restarted", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), @@ -257,18 +265,39 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) * and -host options */ if (NULL == target) { - nd = oldnode; /* put it back where it was if nothing else is found */ - totprocs = 1000000; + nd = NULL; + + /* + * Get a list of all nodes + */ OBJ_CONSTRUCT(&node_list, opal_list_t); map = jdata->map; - if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app, map->policy))) { + if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, + &num_slots, + app, + map->policy))) { ORTE_ERROR_LOG(rc); - return rc; + goto error; } + + /* Ask the ErrMgr components if they have a suggestion for this process */ + orte_errmgr_base_suggest_map_targets(proc, proc->node, &node_list); + + nd = (orte_node_t*)opal_list_get_first(&node_list); + if( NULL == nd ) { + ORTE_ERROR_LOG(ORTE_ERROR); + goto error; + } + + /* + * Look though the list for the least loaded machine. 
+ */ + nd = oldnode; /* Put it back where it was if nothing else is found */ + totprocs = 1000000; /* find the lightest loaded node while deconstructing the list */ while (NULL != (item = opal_list_remove_first(&node_list))) { node = (orte_node_t*)item; - if (node->num_procs < totprocs) { + if( node->num_procs < totprocs) { nd = node; totprocs = node->num_procs; } @@ -280,9 +309,18 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) "%s rmaps:resilient: no avail fault groups found - placing proc on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nd->name)); - /* put proc on the found node */ - if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, nd, jdata->map->cpus_per_rank, proc->app_idx, - NULL, jdata->map->oversubscribe, false, &proc))) { + + /* + * Put the process on the found node (add it if not already in the map) + */ + if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, + nd, + jdata->map->cpus_per_rank, + proc->app_idx, + NULL, + jdata->map->oversubscribe, + false, + &proc))) { /** if the code is ORTE_ERR_NODE_FULLY_USED, then we know this * really isn't an error */ @@ -291,12 +329,15 @@ static int orte_rmaps_resilient_map(orte_job_t *jdata) goto error; } } + /* flag the proc state as non-launched so we'll know to launch it */ proc->state = ORTE_PROC_STATE_INIT; + /* update the node and local ranks so static ports can * be properly selected if active */ orte_rmaps_base_update_local_ranks(jdata, oldnode, nd, proc); + continue; } /* if we did find a target, re-map the proc to the lightest loaded diff --git a/orte/mca/rml/ftrm/rml_ftrm.h b/orte/mca/rml/ftrm/rml_ftrm.h index 7de13cbfc2..e02d981e43 100644 --- a/orte/mca/rml/ftrm/rml_ftrm.h +++ b/orte/mca/rml/ftrm/rml_ftrm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2006 The University of Tennessee and The University @@ -163,6 +163,8 @@ BEGIN_C_DECLS */ int orte_rml_ftrm_ft_event(int state); + int orte_rml_ftrm_purge(orte_process_name_t *peer); + END_C_DECLS #endif diff --git a/orte/mca/rml/ftrm/rml_ftrm_component.c b/orte/mca/rml/ftrm/rml_ftrm_component.c index 6af04e872b..a38cf3abb7 100644 --- a/orte/mca/rml/ftrm/rml_ftrm_component.c +++ b/orte/mca/rml/ftrm/rml_ftrm_component.c @@ -81,7 +81,9 @@ orte_rml_module_t orte_rml_ftrm_module = { orte_rml_ftrm_add_exception_handler, orte_rml_ftrm_del_exception_handler, - orte_rml_ftrm_ft_event + orte_rml_ftrm_ft_event, + + orte_rml_ftrm_purge }; int rml_ftrm_output_handle; diff --git a/orte/mca/rml/ftrm/rml_ftrm_module.c b/orte/mca/rml/ftrm/rml_ftrm_module.c index bdcbe5e597..a20aa2d107 100644 --- a/orte/mca/rml/ftrm/rml_ftrm_module.c +++ b/orte/mca/rml/ftrm/rml_ftrm_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2005 The University of Tennessee and The University @@ -454,3 +454,18 @@ int orte_rml_ftrm_ft_event(int state) return ORTE_SUCCESS; } +int orte_rml_ftrm_purge(orte_process_name_t *peer) +{ + int ret; + + opal_output_verbose(20, rml_ftrm_output_handle, + "orte_rml_ftrm: purge()"); + + if( NULL != orte_rml_ftrm_wrapped_module.purge ) { + if( ORTE_SUCCESS != (ret = orte_rml_ftrm_wrapped_module.purge(peer) ) ) { + return ret; + } + } + + return ORTE_SUCCESS; +} diff --git a/orte/mca/rml/oob/rml_oob.h b/orte/mca/rml/oob/rml_oob.h index c81af7be91..4665b95adf 100644 --- a/orte/mca/rml/oob/rml_oob.h +++ b/orte/mca/rml/oob/rml_oob.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. * Copyright (c) 2004-2006 The University of Tennessee and The University @@ -179,6 +179,8 @@ void orte_rml_oob_exception_callback(const orte_process_name_t *peer, orte_rml_exception_t exception); +int orte_rml_oob_purge(orte_process_name_t *peer); + END_C_DECLS #endif diff --git a/orte/mca/rml/oob/rml_oob_component.c b/orte/mca/rml/oob/rml_oob_component.c index 12d8c7fea2..acb717557f 100644 --- a/orte/mca/rml/oob/rml_oob_component.c +++ b/orte/mca/rml/oob/rml_oob_component.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
* Copyright (c) 2004-2005 The University of Tennessee and The University @@ -100,10 +100,14 @@ orte_rml_oob_module_t orte_rml_oob_module = { orte_rml_oob_add_exception, orte_rml_oob_del_exception, - orte_rml_oob_ft_event + orte_rml_oob_ft_event, + + orte_rml_oob_purge } }; +/* Local variables */ +static bool init_done = false; static int rml_oob_open(void) @@ -134,6 +138,11 @@ rml_oob_close(void) static orte_rml_module_t* rml_oob_init(int* priority) { + if (init_done) { + *priority = 1; + return &orte_rml_oob_module.super; + } + if (mca_oob_base_init() != ORTE_SUCCESS) return NULL; *priority = 1; @@ -155,7 +164,8 @@ rml_oob_init(int* priority) orte_rml_oob_module.active_oob = &mca_oob; orte_rml_oob_module.active_oob->oob_exception_callback = orte_rml_oob_exception_callback; - + + init_done = true; return &orte_rml_oob_module.super; } diff --git a/orte/mca/rml/oob/rml_oob_contact.c b/orte/mca/rml/oob/rml_oob_contact.c index 63303c7aa4..bb7e8e017a 100644 --- a/orte/mca/rml/oob/rml_oob_contact.c +++ b/orte/mca/rml/oob/rml_oob_contact.c @@ -1,4 +1,7 @@ /* + * Copyright (c) 2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -12,6 +15,7 @@ #include "orte/mca/errmgr/errmgr.h" #include "orte/mca/rml/base/rml_contact.h" +#include "orte/mca/routed/routed.h" #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" @@ -75,3 +79,37 @@ orte_rml_oob_get_new_name(orte_process_name_t *name) return orte_rml_oob_module.active_oob->oob_get_new_name(name); } + +int +orte_rml_oob_purge(orte_process_name_t *peer) +{ + opal_list_item_t *item, *next; + orte_rml_oob_queued_msg_t *qmsg; + orte_rml_oob_msg_header_t *hdr; + orte_process_name_t step; + + /* clear the oob contact info and pending messages */ + orte_rml_oob_module.active_oob->oob_set_addr(peer, NULL); + + /* clear our message queue */ + OPAL_THREAD_LOCK(&orte_rml_oob_module.queued_lock); + item = opal_list_get_first(&orte_rml_oob_module.queued_routing_messages); + while (item != opal_list_get_end(&orte_rml_oob_module.queued_routing_messages)) { + next = opal_list_get_next(item); + qmsg = (orte_rml_oob_queued_msg_t*)item; + hdr = (orte_rml_oob_msg_header_t*) qmsg->payload[0].iov_base; + step = orte_routed.get_route(&hdr->destination); + if (peer->jobid == hdr->destination.jobid && + peer->vpid == hdr->destination.vpid) { + opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item); + OBJ_RELEASE(item); + } else if (step.jobid == hdr->destination.jobid && + step.vpid == hdr->destination.vpid) { + opal_list_remove_item(&orte_rml_oob_module.queued_routing_messages, item); + OBJ_RELEASE(item); + } + item = next; + } + OPAL_THREAD_UNLOCK(&orte_rml_oob_module.queued_lock); + return ORTE_SUCCESS; +} diff --git a/orte/mca/rml/rml.h b/orte/mca/rml/rml.h index af5a87eb0d..a7014e22fc 100644 --- a/orte/mca/rml/rml.h +++ b/orte/mca/rml/rml.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. 
All rights reserved. * Copyright (c) 2004-2005 The University of Tennessee and The University @@ -575,6 +575,12 @@ typedef int (*orte_rml_module_exception_fn_t)(orte_rml_exception_callback_t cbfu */ typedef int (*orte_rml_module_ft_event_fn_t)(int state); +/** + * Purge the RML/OOB of contact info and pending messages + * to/from a specified process. Used when a process aborts + * and is to be restarted + */ +typedef int (*orte_rml_module_purge_fn_t)(struct orte_process_name_t *peer); /* ******************************************************************** */ @@ -629,6 +635,9 @@ struct orte_rml_module_t { /** Fault tolerance handler */ orte_rml_module_ft_event_fn_t ft_event; + + /** Purge information */ + orte_rml_module_purge_fn_t purge; }; /** Convienence typedef */ typedef struct orte_rml_module_t orte_rml_module_t; diff --git a/orte/mca/routed/base/routed_base_components.c b/orte/mca/routed/base/routed_base_components.c index 6f0a2d405e..fd94430fc9 100644 --- a/orte/mca/routed/base/routed_base_components.c +++ b/orte/mca/routed/base/routed_base_components.c @@ -2,7 +2,7 @@ * Copyright (c) 2007 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2004-2008 The Trustees of Indiana University. + * Copyright (c) 2004-2010 The Trustees of Indiana University. * All rights reserved. 
* $COPYRIGHT$ * @@ -58,12 +58,19 @@ opal_list_t orte_routed_base_components; static orte_routed_component_t *active_component = NULL; static bool component_open_called = false; +static bool opened = false; +static bool selected = false; int orte_routed_base_open(void) { int ret; + if (opened) { + return ORTE_SUCCESS; + } + opened = true; + /* setup the output stream */ orte_routed_base_output = opal_output_open(NULL); @@ -88,6 +95,11 @@ orte_routed_base_select(void) orte_routed_component_t *best_component = NULL; orte_routed_module_t *best_module = NULL; + if (selected) { + return ORTE_SUCCESS; + } + selected = true; + /* * Select the best component */ @@ -134,6 +146,9 @@ orte_routed_base_close(void) OBJ_DESTRUCT(&orte_routed_base_components); + opened = false; + selected = false; + return ORTE_SUCCESS; } diff --git a/orte/mca/routed/cm/routed_cm.c b/orte/mca/routed/cm/routed_cm.c index 4ddfb2beca..14d5efb4df 100644 --- a/orte/mca/routed/cm/routed_cm.c +++ b/orte/mca/routed/cm/routed_cm.c @@ -1,6 +1,9 @@ /* * Copyright (c) 2007 Los Alamos National Security, LLC. * All rights reserved. + * Copyright (c) 2009-2010 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -184,6 +187,8 @@ static int delete_route(orte_process_name_t *proc) * the routing tree */ + /* remove any entries in the RML for this process */ + rc = orte_rml.purge(proc); return ORTE_SUCCESS; } @@ -279,6 +284,9 @@ static orte_process_name_t get_route(orte_process_name_t *target) { orte_process_name_t *ret, daemon; int rc; + int32_t i; + orte_job_t *jdata; + orte_proc_t *proc; if (target->jobid == ORTE_JOBID_INVALID || target->vpid == ORTE_VPID_INVALID) { @@ -342,7 +350,37 @@ static orte_process_name_t get_route(orte_process_name_t *target) } else { /* otherwise, if I am the HNP, send to the daemon */ if (ORTE_PROC_IS_HNP) { - ret = &daemon; + /* + * Check to make sure the daemon is active, if not then return an INVALID name + * JJH: There should be a faster way to do this check, but for now just iterate... + */ + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + ret = ORTE_NAME_INVALID; + goto found; + } + + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + if( proc->name.vpid != daemon.vpid) { + continue; + } + + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s routed_cm_get: Checking process %15s state 0x%x", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(proc->name)), + proc->state)); + + if( proc->state <= ORTE_PROC_STATE_UNTERMINATED ) { + ret = &daemon; + } else { + ret = ORTE_NAME_INVALID; + } + goto found; + } } else { /* send to the HNP for routing */ ret = ORTE_PROC_MY_HNP; @@ -727,7 +765,9 @@ static int update_routing_tree(void) static orte_vpid_t get_routing_tree(opal_list_t *children) { orte_routed_tree_t *nm; - orte_vpid_t i; + int32_t i; + orte_job_t *jdata; + orte_proc_t *proc; /* if I am anything other than a daemon or the HNP, this * is a meaningless command as I am not allowed to route @@ -741,12 +781,41 @@ static 
orte_vpid_t get_routing_tree(opal_list_t *children) return ORTE_PROC_MY_HNP->vpid; } - /* for the HNP, the cm routing tree is direct to all known daemons */ + /* for the HNP, the cm routing tree is direct to all known alive daemons */ if (NULL != children) { - for (i=1; i < orte_process_info.num_procs; i++) { - nm = OBJ_NEW(orte_routed_tree_t); - nm->vpid = i; - opal_list_append(children, &nm->super); + if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + + for(i = 0; i < jdata->procs->size; ++i) { + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + continue; + } + if( proc->name.vpid == 0) { + continue; + } + + if( proc->state <= ORTE_PROC_STATE_UNTERMINATED && + NULL != proc->rml_uri ) { + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s get_routing_tree: Adding process %15s state 0x%x", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(proc->name)), + proc->state)); + + nm = OBJ_NEW(orte_routed_tree_t); + nm->vpid = proc->name.vpid; + opal_bitmap_clear_all_bits(&nm->relatives); + opal_list_append(children, &nm->super); + } + else { + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s get_routing_tree: Skipped process %15s state 0x%x (non functional daemon)", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&(proc->name)), + proc->state)); + } } } diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 9d6b1c7881..6f00edc581 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -781,6 +781,7 @@ static void orte_proc_construct(orte_proc_t* proc) proc->pid = 0; proc->local_rank = ORTE_LOCAL_RANK_INVALID; proc->node_rank = ORTE_NODE_RANK_INVALID; + proc->last_errmgr_state = ORTE_PROC_STATE_UNDEF; proc->state = ORTE_PROC_STATE_UNDEF; proc->app_idx = 0; proc->slot_list = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 
14a0b3b09e..90f1785dde 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -285,6 +285,7 @@ typedef uint8_t orte_job_controls_t; #define ORTE_JOB_CONTROL_DO_NOT_MONITOR 0x10 #define ORTE_JOB_CONTROL_FORWARD_COMM 0x20 #define ORTE_JOB_CONTROL_CONTINUOUS_OP 0x40 +#define ORTE_JOB_CONTROL_RECOVERABLE 0x80 #define ORTE_MAPPING_POLICY OPAL_UINT16 /* put the rank assignment method in the upper 8 bits */ @@ -419,6 +420,8 @@ struct orte_proc_t { * know which static IP port to use */ orte_node_rank_t node_rank; + /* Last state used to trigger the errmgr for this proc */ + orte_proc_state_t last_errmgr_state; /* process state */ orte_proc_state_t state; /* exit code */ diff --git a/orte/test/system/orte_spin.c b/orte/test/system/orte_spin.c index d00385ea0b..e4ce3ed495 100644 --- a/orte/test/system/orte_spin.c +++ b/orte/test/system/orte_spin.c @@ -5,9 +5,12 @@ * A program that just spins - provides mechanism for testing user-driven * abnormal program termination */ +#include "opal_config.h" #include +#include "opal/runtime/opal_progress.h" + #include "orte/runtime/runtime.h" int main(int argc, char* argv[]) @@ -22,7 +25,15 @@ int main(int argc, char* argv[]) while (1) { i++; pi = i / 3.14159256; - if (i > 100) i = 0; + if (i > 100) { + /* need to progress so we can + * wake up if our daemon goes + * away! 
+ */ + opal_progress(); + /* reset the counter so we loop */ + i = 0; + } } orte_finalize(); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 8fd52c7c1d..80cde818dc 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -81,6 +81,7 @@ #include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/base/rml_contact.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/errmgr/base/base.h" #include "orte/mca/grpcomm/grpcomm.h" #include "orte/runtime/runtime.h" @@ -1127,6 +1128,15 @@ static void abort_exit_callback(int fd, short ign, void *arg) !orte_never_launched) { /* if the debuggers were run, clean up */ orte_debugger_finalize(); + + /* + * Turn off the errmgr recovery functionality, if it was enabled. + * This keeps the errmgr from trying to recover from the shutdown + * procedure. + */ + orte_errmgr_base_enable_recovery = false; + orte_errmgr_base_shutting_down = true; + /* terminate the orteds - they will automatically kill * their local procs */