More fixes for the C/R support. Fixes a couple bugs with the migration and autor features. The C/R functionality should be fully working now.
* Fix the checkpoint-restart-checkpoint case, which would previously reject the checkpoint of the newly restarted process. Making sure to re-enable checkpointing once the application has fully restarted fixes this issue (make sure to set is_app_checkpointable to true on restart confirmation). * In the case of an invalid checkpoint, do not try to access the SStore datastore, as it will be using a dummy handler and will return NULL strings. mpirun was segfaulting in the error case because it was trying to convert the seq_num from a string to an integer. * Make sure to initialize the timer event in the Automatic Recovery section of the HNP errmgr, per the libevent update. The missing initialization caused a segfault when attempting to recover a failed process. * If ompi-checkpoint loses connection to the HNP/mpirun, the TCP socket will fail and call the ErrMgr update_state function. This commit adds a dummy function {{{orte_errmgr_base_update_state()}}} that will prevent the ompi-checkpoint command from segfaulting in this error scenario. This commit was SVN r24306.
Этот коммит содержится в:
родитель
8a3179cdcb
Коммит
8f45fcb429
@ -237,6 +237,20 @@ void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
/* No way to reach here */
|
||||
}
|
||||
|
||||
int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code)
{
|
||||
/*
|
||||
* This is a stub function that is only meant to be called by tools.
|
||||
* Every argument is ignored and no state is updated, so it always
|
||||
* returns ORTE_SUCCESS.
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
********************/
|
||||
|
@ -64,5 +64,12 @@ ORTE_DECLSPEC void orte_errmgr_base_abort(int error_code, char *fmt, ...)
|
||||
__opal_attribute_format__(__printf__, 2, 3)
|
||||
__opal_attribute_noreturn__;
|
||||
|
||||
ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
orte_job_state_t jobstate,
|
||||
orte_process_name_t *proc_name,
|
||||
orte_proc_state_t state,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -171,6 +171,10 @@ int orte_errmgr_hnp_autor_global_module_init(void)
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
current_global_jobdata = NULL;
|
||||
|
||||
if( NULL == autor_timer_event ) {
|
||||
autor_timer_event = opal_event_evtimer_new(opal_event_base, errmgr_autor_recover_processes, NULL);
|
||||
}
|
||||
|
||||
ERRMGR_AUTOR_CLEAR_TIMERS();
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
@ -187,6 +191,7 @@ int orte_errmgr_hnp_autor_global_module_finalize(void)
|
||||
}
|
||||
if( NULL != autor_timer_event ) {
|
||||
free(autor_timer_event);
|
||||
autor_timer_event = NULL;
|
||||
}
|
||||
|
||||
current_global_jobid = ORTE_JOBID_INVALID;
|
||||
|
@ -712,14 +712,42 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
|
||||
ORTE_SNAPC_CKPT_STATE_ESTABLISHED == ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_STOPPED == ckpt_status ||
|
||||
ORTE_SNAPC_CKPT_STATE_ERROR == ckpt_status ) {
|
||||
orte_sstore.get_attr(ss_handle,
|
||||
SSTORE_METADATA_GLOBAL_SNAP_REF,
|
||||
&global_snapshot_handle);
|
||||
|
||||
orte_sstore.get_attr(ss_handle,
|
||||
SSTORE_METADATA_GLOBAL_SNAP_SEQ,
|
||||
&tmp_str);
|
||||
seq_num = atoi(tmp_str);
|
||||
if( ORTE_SNAPC_CKPT_STATE_ERROR != ckpt_status ) {
|
||||
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
|
||||
SSTORE_METADATA_GLOBAL_SNAP_REF,
|
||||
&global_snapshot_handle)) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
|
||||
ORTE_ERROR_LOG(ret);
|
||||
/* Do not exit here, continue so that we can inform the tool
|
||||
* that the checkpoint has failed
|
||||
*/
|
||||
}
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_sstore.get_attr(ss_handle,
|
||||
SSTORE_METADATA_GLOBAL_SNAP_SEQ,
|
||||
&tmp_str)) ) {
|
||||
opal_output(orte_snapc_base_output,
|
||||
"%s) base:ckpt_update_cmd: Error: SStore get_attr failed (ret = %d)\n",
|
||||
ORTE_SNAPC_COORD_NAME_PRINT(orte_snapc_coord_type), ret );
|
||||
ORTE_ERROR_LOG(ret);
|
||||
/* Do not exit here, continue so that we can inform the tool
|
||||
* that the checkpoint has failed
|
||||
*/
|
||||
}
|
||||
|
||||
if( NULL != tmp_str ) {
|
||||
seq_num = atoi(tmp_str);
|
||||
} else {
|
||||
seq_num = -1;
|
||||
}
|
||||
} else {
|
||||
/* Checkpoint Error Case */
|
||||
global_snapshot_handle = NULL;
|
||||
seq_num = -1;
|
||||
}
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, orte_snapc_base_output,
|
||||
"%s) base:ckpt_update_cmd: Sending update command <status %d> + <ref %s> <seq %d>\n",
|
||||
|
@ -161,7 +161,7 @@ int app_coord_init()
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
"app) Shutdown Barrier: Send INIT to HNP...!"));
|
||||
"app) Startup Barrier: Send INIT to HNP...!"));
|
||||
|
||||
OBJ_CONSTRUCT(&buffer, opal_buffer_t);
|
||||
|
||||
|
@ -1781,6 +1781,7 @@ static int snapc_full_process_orted_update_cmd(orte_process_name_t* sender,
|
||||
SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_RECOVERED);
|
||||
SNAPC_FULL_DISPLAY_RECOVERED_TIMER();
|
||||
orte_snapc_base_has_recovered = true;
|
||||
is_app_checkpointable = true;
|
||||
|
||||
exit_status = ORTE_SUCCESS;
|
||||
goto cleanup;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user