1
1

FT: fix compilation using --with-ft (1/5)

Enabling the FT code breaks compilation (again). This series
tries to fix the compiler errors. As before, it only fixes
the compiler errors, without any guarantee that the result
actually supports FT again.

This first patch moves orte_cr_continue_like_restart from ORTE
to OPAL, renaming it to opal_cr_continue_like_restart. This leaves
only three calls from OPAL to ORTE in the FT code. As it is not yet
100% clear how to handle these calls, the three orte_sstore.set_attr()
calls have been commented out for now.
Этот коммит содержится в:
Adrian Reber 2014-12-22 21:36:51 +01:00
родитель a188cb2ff9
Коммит f45dd069bd
15 изменённых файлов: 34 добавлений и 23 удалений

Просмотреть файл

@ -57,7 +57,7 @@ int mca_bml_r2_ft_event(int state)
first_continue_pass = !first_continue_pass;
/* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
if( orte_cr_continue_like_restart && !first_continue_pass) {
if (opal_cr_continue_like_restart && !first_continue_pass) {
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
@ -139,7 +139,7 @@ int mca_bml_r2_ft_event(int state)
}
else if(OPAL_CRS_CONTINUE == state) {
/* Matches OPAL_CRS_RESTART_PRE */
if( orte_cr_continue_like_restart && first_continue_pass) {
if (opal_cr_continue_like_restart && first_continue_pass) {
if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
return ret;
@ -150,7 +150,7 @@ int mca_bml_r2_ft_event(int state)
}
}
/* Matches OPAL_CRS_RESTART */
else if( orte_cr_continue_like_restart && !first_continue_pass ) {
else if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Barrier to make all processes have been successfully restarted before
* we try to remove some restart only files.

Просмотреть файл

@ -3076,7 +3076,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
first_continue_pass = !first_continue_pass;
/* Only finalize the Protocol after the PML has been rebuilt */
if( orte_cr_continue_like_restart && first_continue_pass ) {
if (opal_cr_continue_like_restart && first_continue_pass) {
goto DONE;
}

Просмотреть файл

@ -688,7 +688,7 @@ int mca_pml_bfo_ft_event( int state )
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
}
if( orte_cr_continue_like_restart && !first_continue_pass ) {
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Get a list of processes
*/
@ -791,7 +791,7 @@ int mca_pml_bfo_ft_event( int state )
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
}
if( orte_cr_continue_like_restart && !first_continue_pass ) {
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.

Просмотреть файл

@ -817,7 +817,7 @@ int mca_pml_ob1_ft_event( int state )
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
}
if( orte_cr_continue_like_restart && !first_continue_pass ) {
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Get a list of processes
*/
@ -920,7 +920,7 @@ int mca_pml_ob1_ft_event( int state )
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
}
if( orte_cr_continue_like_restart && !first_continue_pass ) {
if (opal_cr_continue_like_restart && !first_continue_pass) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.

Просмотреть файл

@ -410,7 +410,7 @@ static int ompi_cr_coord_pre_continue(void) {
opal_output_verbose(10, ompi_cr_output,
"ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
if( orte_cr_continue_like_restart ) {
if (opal_cr_continue_like_restart) {
/* Mimic ompi_cr_coord_pre_restart(); */
if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;

Просмотреть файл

@ -1743,7 +1743,7 @@ int mca_btl_openib_ft_event(int state) {
if(OPAL_CRS_CHECKPOINT == state) {
/* Continue must reconstruct the routes (including modex), since we
* have to tear down the devices completely. */
orte_cr_continue_like_restart = true;
opal_cr_continue_like_restart = true;
/*
* To keep the node from crashing we need to call ibv_close_device

Просмотреть файл

@ -1281,13 +1281,16 @@ int mca_btl_sm_ft_event(int state) {
* for these old file handles. The restart procedure will make sure
* these files get cleaned up appropriately.
*/
/* Disabled to get FT code compiled again
* TODO: FIXIT soon
orte_sstore.set_attr(orte_sstore_handle_current,
SSTORE_METADATA_LOCAL_TOUCH,
mca_btl_sm_component.sm_seg->shmem_ds.seg_name);
*/
}
}
else if(OPAL_CRS_CONTINUE == state) {
if( orte_cr_continue_like_restart ) {
if (opal_cr_continue_like_restart) {
if( NULL != mca_btl_sm_component.sm_seg ) {
/* Add shared memory file */
opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->shmem_ds.seg_name, false);

Просмотреть файл

@ -1251,13 +1251,16 @@ int mca_btl_smcuda_ft_event(int state) {
* for these old file handles. The restart procedure will make sure
* these files get cleaned up appropriately.
*/
/* Disabled to get FT code compiled again
* TODO: FIXIT soon
orte_sstore.set_attr(orte_sstore_handle_current,
SSTORE_METADATA_LOCAL_TOUCH,
mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
*/
}
}
else if(OPAL_CRS_CONTINUE == state) {
if( orte_cr_continue_like_restart ) {
if (opal_cr_continue_like_restart) {
if( NULL != mca_btl_smcuda_component.sm_seg ) {
/* Add shared memory file */
opal_crs_base_cleanup_append(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name, false);

Просмотреть файл

@ -173,12 +173,15 @@ int mca_mpool_sm_ft_event(int state) {
asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
opal_process_info.job_session_dir,
opal_proc_local_get()->proc_hostname );
/* Disabled to get FT code compiled again
* TODO: FIXIT soon
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
*/
free(file_name);
file_name = NULL;
}
else if(OPAL_CRS_CONTINUE == state) {
if(orte_cr_continue_like_restart) {
if (opal_cr_continue_like_restart) {
/* Find the sm module */
self_module = mca_mpool_base_module_lookup("sm");
self_sm_module = (mca_mpool_sm_module_t*) self_module;

Просмотреть файл

@ -131,6 +131,8 @@ int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
static bool opal_cr_debug_sigpipe = false;
bool opal_cr_continue_like_restart = false;
#if OPAL_ENABLE_FT_THREAD == 1
/*****************
* Threading Functions and Variables

Просмотреть файл

@ -91,6 +91,12 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
/* The current state of a checkpoint operation */
OPAL_DECLSPEC extern int opal_cr_checkpointing_state;
/*
* If one of the BTLs that shutdown require a full, clean rebuild of the
* point-to-point stack on 'continue' as well as 'restart'.
*/
OPAL_DECLSPEC extern bool opal_cr_continue_like_restart;
#if OPAL_ENABLE_CRDEBUG == 1
/* Whether or not C/R Debugging is enabled for this process */
OPAL_DECLSPEC extern int MPIR_debug_with_checkpoint;

2
orte/mca/ess/env/ess_env_module.c поставляемый
Просмотреть файл

@ -258,7 +258,7 @@ static int rte_ft_event(int state)
goto cleanup;
}
if( orte_cr_continue_like_restart ) {
if (opal_cr_continue_like_restart) {
/*
* Barrier to make all processes have been successfully restarted before
* we try to remove some restart only files.

Просмотреть файл

@ -398,7 +398,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
}
/* Default: use the fast way */
orte_cr_continue_like_restart = false;
opal_cr_continue_like_restart = false;
orte_cr_flush_restart_files = true;
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
@ -480,7 +480,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
* otherwise just continue.
*/
if( currently_all_migrating ) {
orte_cr_continue_like_restart = true;
opal_cr_continue_like_restart = true;
orte_cr_flush_restart_files = false;
}
if( !currently_migrating && currently_all_migrating ) {

Просмотреть файл

@ -75,7 +75,6 @@ static int orte_cr_coord_post_ckpt(void);
static int orte_cr_coord_post_restart(void);
static int orte_cr_coord_post_continue(void);
bool orte_cr_continue_like_restart = false;
bool orte_cr_flush_restart_files = true;
/*************
@ -137,7 +136,7 @@ int orte_cr_init(void)
opal_cr_reg_coord_callback(orte_cr_coord, &prev_coord_callback);
/* Typically this is not needed. Individual BTLs will set this as needed */
orte_cr_continue_like_restart = false;
opal_cr_continue_like_restart = false;
orte_cr_flush_restart_files = true;
cleanup:

Просмотреть файл

@ -50,11 +50,6 @@ BEGIN_C_DECLS
ORTE_DECLSPEC int orte_cr_entry_point_init(void);
ORTE_DECLSPEC int orte_cr_entry_point_finalize(void);
/*
* If one of the BTLs that shutdown require a full, clean rebuild of the
* point-to-point stack on 'continue' as well as 'restart'.
*/
ORTE_DECLSPEC extern bool orte_cr_continue_like_restart;
ORTE_DECLSPEC extern bool orte_cr_flush_restart_files;
END_C_DECLS