From 8f45fcb4294a8daf848372cbbeab1f9337ac7fb5 Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Wed, 26 Jan 2011 14:56:35 +0000 Subject: [PATCH] More fixes for the C/R support. Fixes a couple of bugs with the migration and autor features. The C/R functionality should be fully working now. * Fix the checkpoint-restart-checkpoint case which would previously reject the checkpoint of the newly restarted process. Re-enabling checkpointing once the application has fully restarted fixes this issue (make sure to set is_app_checkpointable to true on restart confirmation). * In the case of an invalid checkpoint, do not try to access the SStore datastore as it will be using a dummy handler and will return NULL strings. mpirun was segfaulting in the error case because it was trying to convert the seq_num from a string to an integer. * Make sure to initialize the timer event in the Automatic Recovery section of the HNP errmgr, per the libevent update. The missing initialization caused a segfault when attempting to recover a failed process. * If ompi-checkpoint loses connection to the HNP/mpirun, the TCP socket will fail and call the ErrMgr update_state function. This commit adds a dummy function {{{orte_errmgr_base_update_state()}}} that will prevent the ompi-checkpoint command from segfaulting in this error scenario. This commit was SVN r24306. --- orte/mca/errmgr/base/errmgr_base_fns.c | 14 +++++++++ orte/mca/errmgr/base/errmgr_private.h | 7 +++++ orte/mca/errmgr/hnp/errmgr_hnp_autor.c | 5 +++ orte/mca/snapc/base/snapc_base_fns.c | 42 ++++++++++++++++++++----- orte/mca/snapc/full/snapc_full_app.c | 2 +- orte/mca/snapc/full/snapc_full_global.c | 1 + 6 files changed, 63 insertions(+), 8 deletions(-) diff --git a/orte/mca/errmgr/base/errmgr_base_fns.c b/orte/mca/errmgr/base/errmgr_base_fns.c index d7e2974125..fd0eeddf43 100644 --- a/orte/mca/errmgr/base/errmgr_base_fns.c +++ b/orte/mca/errmgr/base/errmgr_base_fns.c @@ -237,6 +237,20 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...) 
/* No way to reach here */ } +int orte_errmgr_base_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code) +{ + /* + * This is a stub function that is only meant to be called by tools, + * so it will always return success. + */ + return ORTE_SUCCESS; +} + /******************** * Utility functions ********************/ diff --git a/orte/mca/errmgr/base/errmgr_private.h b/orte/mca/errmgr/base/errmgr_private.h index 45dd3d1bc5..5077d02eed 100644 --- a/orte/mca/errmgr/base/errmgr_private.h +++ b/orte/mca/errmgr/base/errmgr_private.h @@ -64,5 +64,12 @@ ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...) __opal_attribute_format__(__printf__, 2, 3) __opal_attribute_noreturn__; +ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job, + orte_job_state_t jobstate, + orte_process_name_t *proc_name, + orte_proc_state_t state, + pid_t pid, + orte_exit_code_t exit_code); + END_C_DECLS #endif diff --git a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c index 5d33655024..bc8f2cab39 100644 --- a/orte/mca/errmgr/hnp/errmgr_hnp_autor.c +++ b/orte/mca/errmgr/hnp/errmgr_hnp_autor.c @@ -171,6 +171,10 @@ int orte_errmgr_hnp_autor_global_module_init(void) current_global_jobid = ORTE_JOBID_INVALID; current_global_jobdata = NULL; + if( NULL == autor_timer_event ) { + autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL); + } + ERRMGR_AUTOR_CLEAR_TIMERS(); return ORTE_SUCCESS; @@ -187,6 +191,7 @@ int orte_errmgr_hnp_autor_global_module_finalize(void) } if( NULL != autor_timer_event ) { free(autor_timer_event); + autor_timer_event = NULL; } current_global_jobid = ORTE_JOBID_INVALID; diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 0c5c19ca71..e8f80db8f9 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ 
b/orte/mca/snapc/base/snapc_base_fns.c @@ -712,14 +712,42 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, ORTE_SNAPC_CKPT_STATE_ESTABLISHED == ckpt_status || ORTE_SNAPC_CKPT_STATE_STOPPED == ckpt_status || ORTE_SNAPC_CKPT_STATE_ERROR == ckpt_status ) { - orte_sstore.get_attr(ss_handle, - SSTORE_METADATA_GLOBAL_SNAP_REF, - &global_snapshot_handle); - orte_sstore.get_attr(ss_handle, - SSTORE_METADATA_GLOBAL_SNAP_SEQ, - &tmp_str); - seq_num = atoi(tmp_str); + if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) { + if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle, + SSTORE_METADATA_GLOBAL_SNAP_REF, + &global_snapshot_handle)) ) { + opal_output(orte_snapc_base_output, + "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n", + ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret ); + ORTE_ERROR_LOG(ret); + /* Do not exit here, continue so that we can inform the tool + * that the checkpoint has failed + */ + } + + if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle, + SSTORE_METADATA_GLOBAL_SNAP_SEQ, + &tmp_str)) ) { + opal_output(orte_snapc_base_output, + "%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n", + ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret ); + ORTE_ERROR_LOG(ret); + /* Do not exit here, continue so that we can inform the tool + * that the checkpoint has failed + */ + } + + if( NULL != tmp_str ) { + seq_num = atoi(tmp_str); + } else { + seq_num = -1; + } + } else { + /* Checkpoint Error Case */ + global_snapshot_handle = NULL; + seq_num = -1; + } OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output, "%s) base:ckpt_update_cmd: Sending update command + \n", diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c index 4836545980..a9d2e64c7a 100644 --- a/orte/mca/snapc/full/snapc_full_app.c +++ b/orte/mca/snapc/full/snapc_full_app.c @@ -161,7 +161,7 @@ int app_coord_init() if( 0 == ORTE_PROC_MY_NAME->vpid ) { OPAL_OUTPUT_VERBOSE((3, 
mca_snapc_full_component.super.output_handle, - "app) Shutdown Barrier: Send INIT to HNP...!")); + "app) Startup Barrier: Send INIT to HNP...!")); OBJ_CONSTRUCT(&buffer, opal_buffer_t); diff --git a/orte/mca/snapc/full/snapc_full_global.c b/orte/mca/snapc/full/snapc_full_global.c index 796eddd3f8..46a0a33f5c 100644 --- a/orte/mca/snapc/full/snapc_full_global.c +++ b/orte/mca/snapc/full/snapc_full_global.c @@ -1781,6 +1781,7 @@ static int snapc_full_process_orted_update_cmd(orte_process_name_t* sender, SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RECOVERED); SNAPC_FULL_DISPLAY_RECOVERED_TIMER(); orte_snapc_base_has_recovered = true; + is_app_checkpointable = true; exit_status = ORTE_SUCCESS; goto cleanup;