1
1

More fixes for the C/R support. Fixes a couple of bugs with the migration and autor features. The C/R functionality should be fully working now.

* Fix the checkpoint-restart-checkpoint case, which would previously reject the checkpoint of the newly restarted process. Re-enabling checkpointing once the application has fully restarted fixes this issue (make sure to set is_app_checkpointable to true on restart confirmation).
 * In the case of an invalid checkpoint, do not try to access the SStore datastore, as it will be using a dummy handler and will return NULL strings. mpirun was segfaulting in the error case because it was trying to convert the seq_num from a string to an integer.
 * Make sure to initialize the timer event in the Automatic Recovery section of the HNP errmgr, per the libevent update. This caused a segfault when attempting to recover a failed process.
 * If ompi-checkpoint loses its connection to the HNP/mpirun, the TCP socket will fail and call the ErrMgr update_state function. This commit adds a dummy function {{{orte_errmgr_base_update_state()}}} that prevents the ompi-checkpoint command from segfaulting in this error scenario.

This commit was SVN r24306.
Этот коммит содержится в:
Josh Hursey 2011-01-26 14:56:35 +00:00
родитель 8a3179cdcb
Коммит 8f45fcb429
6 изменённых файлов: 63 добавлений и 8 удалений

Просмотреть файл

@ -237,6 +237,20 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
/* No way to reach here */
}
/**
 * Placeholder ErrMgr update_state handler.
 *
 * Intended only for tools: it performs no work, ignores every argument,
 * and unconditionally reports success.
 *
 * @param job        Ignored.
 * @param jobstate   Ignored.
 * @param proc_name  Ignored.
 * @param state      Ignored.
 * @param pid        Ignored.
 * @param exit_code  Ignored.
 *
 * @return ORTE_SUCCESS, always.
 */
int orte_errmgr_base_update_state(orte_jobid_t job,
                                  orte_job_state_t jobstate,
                                  orte_process_name_t *proc_name,
                                  orte_proc_state_t state,
                                  pid_t pid,
                                  orte_exit_code_t exit_code)
{
    /* Nothing is inspected here; mark the arguments as deliberately unused. */
    (void)job;
    (void)jobstate;
    (void)proc_name;
    (void)state;
    (void)pid;
    (void)exit_code;

    return ORTE_SUCCESS;
}
/********************
* Utility functions
********************/

Просмотреть файл

@ -64,5 +64,12 @@ ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
__opal_attribute_format__(__printf__, 2, 3)
__opal_attribute_noreturn__;
/**
 * Stub update_state handler that is only meant to be called by tools.
 *
 * The base implementation does not act on any of its arguments and
 * always returns ORTE_SUCCESS.
 *
 * @param job        Job identifier (unused by the stub).
 * @param jobstate   Job state (unused by the stub).
 * @param proc_name  Process name (unused by the stub).
 * @param state      Process state (unused by the stub).
 * @param pid        Process id (unused by the stub).
 * @param exit_code  Exit code (unused by the stub).
 *
 * @return ORTE_SUCCESS
 */
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
orte_job_state_t jobstate,
orte_process_name_t *proc_name,
orte_proc_state_t state,
pid_t pid,
orte_exit_code_t exit_code);
END_C_DECLS
#endif

Просмотреть файл

@ -171,6 +171,10 @@ int orte_errmgr_hnp_autor_global_module_init(void)
current_global_jobid = ORTE_JOBID_INVALID;
current_global_jobdata = NULL;
if( NULL == autor_timer_event ) {
autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL);
}
ERRMGR_AUTOR_CLEAR_TIMERS();
return ORTE_SUCCESS;
@ -187,6 +191,7 @@ int orte_errmgr_hnp_autor_global_module_finalize(void)
}
if( NULL != autor_timer_event ) {
free(autor_timer_event);
autor_timer_event = NULL;
}
current_global_jobid = ORTE_JOBID_INVALID;

Просмотреть файл

@ -712,14 +712,42 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
ORTE_SNAPC_CKPT_STATE_ESTABLISHED == ckpt_status ||
ORTE_SNAPC_CKPT_STATE_STOPPED == ckpt_status ||
ORTE_SNAPC_CKPT_STATE_ERROR == ckpt_status ) {
orte_sstore.get_attr(ss_handle,
SSTORE_METADATA_GLOBAL_SNAP_REF,
&global_snapshot_handle);
orte_sstore.get_attr(ss_handle,
if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
SSTORE_METADATA_GLOBAL_SNAP_REF,
&global_snapshot_handle)) ) {
opal_output(orte_snapc_base_output,
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
ORTE_ERROR_LOG(ret);
/* Do not exit here, continue so that we can inform the tool
* that the checkpoint has failed
*/
}
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
SSTORE_METADATA_GLOBAL_SNAP_SEQ,
&tmp_str);
&tmp_str)) ) {
opal_output(orte_snapc_base_output,
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
ORTE_ERROR_LOG(ret);
/* Do not exit here, continue so that we can inform the tool
* that the checkpoint has failed
*/
}
if( NULL != tmp_str ) {
seq_num = atoi(tmp_str);
} else {
seq_num = -1;
}
} else {
/* Checkpoint Error Case */
global_snapshot_handle = NULL;
seq_num = -1;
}
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
"%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",

Просмотреть файл

@ -161,7 +161,7 @@ int app_coord_init()
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
"app) Shutdown Barrier: Send INIT to HNP...!"));
"app) Startup Barrier: Send INIT to HNP...!"));
OBJ_CONSTRUCT(&buffer, opal_buffer_t);

Просмотреть файл

@ -1781,6 +1781,7 @@ static int snapc_full_process_orted_update_cmd(orte_process_name_t* sender,
SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RECOVERED);
SNAPC_FULL_DISPLAY_RECOVERED_TIMER();
orte_snapc_base_has_recovered = true;
is_app_checkpointable = true;
exit_status = ORTE_SUCCESS;
goto cleanup;