FT: fix compilation using --with-ft (1/5)
Enabling the FT code (configuring with --with-ft) breaks compilation again. This series fixes the resulting compiler errors. As before, it only fixes the compiler errors and gives no guarantee that the result actually supports FT again. This first patch moves orte_cr_continue_like_restart from ORTE into OPAL, renaming it to opal_cr_continue_like_restart. That leaves only three calls from OPAL into ORTE in the FT code, all of them to orte_sstore.set_attr(). As it is not yet 100% clear how to handle these calls, they have been commented out for now.
Этот коммит содержится в:
родитель
a188cb2ff9
Коммит
f45dd069bd
@ -57,7 +57,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -139,7 +139,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
/* Matches OPAL_CRS_RESTART_PRE */
|
||||
if( orte_cr_continue_like_restart && first_continue_pass) {
|
||||
if (opal_cr_continue_like_restart && first_continue_pass) {
|
||||
if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
|
||||
return ret;
|
||||
@ -150,7 +150,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
}
|
||||
}
|
||||
/* Matches OPAL_CRS_RESTART */
|
||||
else if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
else if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
|
@ -3076,7 +3076,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Only finalize the Protocol after the PML has been rebuilt */
|
||||
if( orte_cr_continue_like_restart && first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && first_continue_pass) {
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
|
@ -688,7 +688,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Get a list of processes
|
||||
*/
|
||||
@ -791,7 +791,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
|
@ -817,7 +817,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Get a list of processes
|
||||
*/
|
||||
@ -920,7 +920,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
|
@ -410,7 +410,7 @@ static int ompi_cr_coord_pre_continue(void) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
|
||||
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/* Mimic ompi_cr_coord_pre_restart(); */
|
||||
if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
|
@ -1743,7 +1743,7 @@ int mca_btl_openib_ft_event(int state) {
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Continue must reconstruct the routes (including modex), since we
|
||||
* have to tear down the devices completely. */
|
||||
orte_cr_continue_like_restart = true;
|
||||
opal_cr_continue_like_restart = true;
|
||||
|
||||
/*
|
||||
* To keep the node from crashing we need to call ibv_close_device
|
||||
|
@ -1281,13 +1281,16 @@ int mca_btl_sm_ft_event(int state) {
|
||||
* for these old file handles. The restart procedure will make sure
|
||||
* these files get cleaned up appropriately.
|
||||
*/
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current,
|
||||
SSTORE_METADATA_LOCAL_TOUCH,
|
||||
mca_btl_sm_component.sm_seg->shmem_ds.seg_name);
|
||||
*/
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
if( NULL != mca_btl_sm_component.sm_seg ) {
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->shmem_ds.seg_name, false);
|
||||
|
@ -1251,13 +1251,16 @@ int mca_btl_smcuda_ft_event(int state) {
|
||||
* for these old file handles. The restart procedure will make sure
|
||||
* these files get cleaned up appropriately.
|
||||
*/
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current,
|
||||
SSTORE_METADATA_LOCAL_TOUCH,
|
||||
mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
|
||||
*/
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
if( NULL != mca_btl_smcuda_component.sm_seg ) {
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name, false);
|
||||
|
@ -173,12 +173,15 @@ int mca_mpool_sm_ft_event(int state) {
|
||||
asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
||||
opal_process_info.job_session_dir,
|
||||
opal_proc_local_get()->proc_hostname );
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
|
||||
*/
|
||||
free(file_name);
|
||||
file_name = NULL;
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if(orte_cr_continue_like_restart) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/* Find the sm module */
|
||||
self_module = mca_mpool_base_module_lookup("sm");
|
||||
self_sm_module = (mca_mpool_sm_module_t*) self_module;
|
||||
|
@ -131,6 +131,8 @@ int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
||||
|
||||
static bool opal_cr_debug_sigpipe = false;
|
||||
|
||||
bool opal_cr_continue_like_restart = false;
|
||||
|
||||
#if OPAL_ENABLE_FT_THREAD == 1
|
||||
/*****************
|
||||
* Threading Functions and Variables
|
||||
|
@ -91,6 +91,12 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
||||
/* The current state of a checkpoint operation */
|
||||
OPAL_DECLSPEC extern int opal_cr_checkpointing_state;
|
||||
|
||||
/*
|
||||
* If one of the BTLs that shutdown require a full, clean rebuild of the
|
||||
* point-to-point stack on 'continue' as well as 'restart'.
|
||||
*/
|
||||
OPAL_DECLSPEC extern bool opal_cr_continue_like_restart;
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
/* Whether or not C/R Debugging is enabled for this process */
|
||||
OPAL_DECLSPEC extern int MPIR_debug_with_checkpoint;
|
||||
|
2
orte/mca/ess/env/ess_env_module.c
поставляемый
2
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -258,7 +258,7 @@ static int rte_ft_event(int state)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
|
@ -398,7 +398,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
}
|
||||
|
||||
/* Default: use the fast way */
|
||||
orte_cr_continue_like_restart = false;
|
||||
opal_cr_continue_like_restart = false;
|
||||
orte_cr_flush_restart_files = true;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||
@ -480,7 +480,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
* otherwise just continue.
|
||||
*/
|
||||
if( currently_all_migrating ) {
|
||||
orte_cr_continue_like_restart = true;
|
||||
opal_cr_continue_like_restart = true;
|
||||
orte_cr_flush_restart_files = false;
|
||||
}
|
||||
if( !currently_migrating && currently_all_migrating ) {
|
||||
|
@ -75,7 +75,6 @@ static int orte_cr_coord_post_ckpt(void);
|
||||
static int orte_cr_coord_post_restart(void);
|
||||
static int orte_cr_coord_post_continue(void);
|
||||
|
||||
bool orte_cr_continue_like_restart = false;
|
||||
bool orte_cr_flush_restart_files = true;
|
||||
|
||||
/*************
|
||||
@ -137,7 +136,7 @@ int orte_cr_init(void)
|
||||
opal_cr_reg_coord_callback(orte_cr_coord, &prev_coord_callback);
|
||||
|
||||
/* Typically this is not needed. Individual BTLs will set this as needed */
|
||||
orte_cr_continue_like_restart = false;
|
||||
opal_cr_continue_like_restart = false;
|
||||
orte_cr_flush_restart_files = true;
|
||||
|
||||
cleanup:
|
||||
|
@ -50,11 +50,6 @@ BEGIN_C_DECLS
|
||||
ORTE_DECLSPEC int orte_cr_entry_point_init(void);
|
||||
ORTE_DECLSPEC int orte_cr_entry_point_finalize(void);
|
||||
|
||||
/*
|
||||
* If one of the BTLs that shutdown require a full, clean rebuild of the
|
||||
* point-to-point stack on 'continue' as well as 'restart'.
|
||||
*/
|
||||
ORTE_DECLSPEC extern bool orte_cr_continue_like_restart;
|
||||
ORTE_DECLSPEC extern bool orte_cr_flush_restart_files;
|
||||
|
||||
END_C_DECLS
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user