From 88aa45dd529d66af958527df9dbed3012952e3bb Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Thu, 16 Oct 2008 15:09:00 +0000 Subject: [PATCH] Commit to bring online OpenIB, MX, and shared memory support for Open MPI's checkpoint/restart functionality. Some tuning is still needed, but basic functionality is in place. There is still a problem with OpenIB and threads (external to C/R functionality). It has been reported in Ticket #1539 Additionally: * Fix a file cleanup bug in CRS Base. * Fix a possible deadlock in the TCP ft_event function * Add a mca_base_param_deregister() function to MCA base * Add whole process checkpoint timers * Add support for BTL: OpenIB, MX, Shared Memory * Add support Mpool: rdma, sm * Sundry bounds checking an cleanup in some scattered functions This commit was SVN r19756. --- contrib/amca-param-sets/ft-enable-cr | 9 + ompi/mca/bml/r2/bml_r2_ft.c | 173 ++++++++++++++---- ompi/mca/btl/mx/btl_mx.c | 34 +++- ompi/mca/btl/mx/btl_mx_component.c | 4 +- ompi/mca/btl/openib/btl_openib.c | 111 ++++++++++- ompi/mca/btl/openib/btl_openib_component.c | 4 +- .../openib/connect/btl_openib_connect_base.c | 1 + ompi/mca/btl/sm/btl_sm.c | 55 +++++- ompi/mca/btl/sm/btl_sm_component.c | 19 +- ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c | 22 +++ ompi/mca/mpool/rdma/mpool_rdma.h | 8 + ompi/mca/mpool/rdma/mpool_rdma_component.c | 4 +- ompi/mca/mpool/rdma/mpool_rdma_module.c | 22 +++ ompi/mca/mpool/sm/mpool_sm_component.c | 18 +- ompi/mca/mpool/sm/mpool_sm_module.c | 53 +++++- ompi/mca/pml/ob1/pml_ob1.c | 107 ++++++++++- ompi/runtime/ompi_cr.c | 32 +++- ompi/runtime/ompi_cr.h | 6 + opal/mca/base/mca_base_param.c | 15 ++ opal/mca/base/mca_base_param.h | 8 + opal/mca/crs/base/crs_base_fns.c | 2 +- opal/runtime/opal_cr.c | 157 ++++++++++++++++ opal/runtime/opal_cr.h | 60 +++++- orte/mca/oob/tcp/oob_tcp.c | 4 + orte/mca/snapc/full/snapc_full_app.c | 14 ++ 25 files changed, 876 insertions(+), 66 deletions(-) diff --git a/contrib/amca-param-sets/ft-enable-cr 
b/contrib/amca-param-sets/ft-enable-cr index 7e27354503..f1294a1478 100644 --- a/contrib/amca-param-sets/ft-enable-cr +++ b/contrib/amca-param-sets/ft-enable-cr @@ -41,3 +41,12 @@ crcp=bkmrk # Temporary fix to force the event engine to use poll to behave well with BLCR # opal_event_include=poll + +# +# We currently only support the following options to the OpenIB BTL +# Future development will attempt to eliminate many of these restrictions +# +btl_openib_want_fork_support=1 +btl_openib_use_async_event_thread=0 +btl_openib_use_eager_rdma=0 +btl_openib_cpc_include=oob diff --git a/ompi/mca/bml/r2/bml_r2_ft.c b/ompi/mca/bml/r2/bml_r2_ft.c index 09d993e89b..eb11469628 100644 --- a/ompi/mca/bml/r2/bml_r2_ft.c +++ b/ompi/mca/bml/r2/bml_r2_ft.c @@ -43,17 +43,28 @@ int mca_bml_r2_ft_event(int state) { + static bool first_continue_pass = false; ompi_proc_t** procs = NULL; size_t num_procs; size_t btl_idx; int ret, p; int loc_state; + int param_type = -1; + char *param_list = NULL; if(OPAL_CRS_CHECKPOINT == state) { /* Do nothing for now */ } else if(OPAL_CRS_CONTINUE == state) { - /* Since nothing in Checkpoint, we are fine here */ + first_continue_pass = !first_continue_pass; + + /* Since nothing in Checkpoint, we are fine here (unless required by BTL) */ + if( ompi_cr_continue_like_restart && !first_continue_pass) { + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + return OMPI_ERR_OUT_OF_RESOURCE; + } + } } else if(OPAL_CRS_RESTART_PRE == state ) { /* Nothing here */ @@ -76,56 +87,119 @@ int mca_bml_r2_ft_event(int state) * no longer exist. */ if( OPAL_CRS_RESTART != state ) { - /* Since we only ever call into the BTLs once during the first restart - * pass, just lie to them on this pass for a bit of local clarity. 
- */ - if( OPAL_CRS_RESTART_PRE == state ) { - loc_state = OPAL_CRS_RESTART; + if( OPAL_CRS_CONTINUE == state && !first_continue_pass ) { + ; } else { - loc_state = state; - } - - /* - * Call ft_event in: - * - BTL modules - * - MPool modules - * - * These should be cleaning out stale state, and memory references in - * preparation for being shut down. - */ - for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) { - /* - * Notify Mpool + /* Since we only ever call into the BTLs once during the first restart + * pass, just lie to them on this pass for a bit of local clarity. */ - if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool && - NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) { - opal_output_verbose(10, ompi_cr_output, - "bml:r2: ft_event: Notify the %s MPool.\n", - (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name); - if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) { - continue; - } + if( OPAL_CRS_RESTART_PRE == state ) { + loc_state = OPAL_CRS_RESTART; + } else { + loc_state = state; } /* - * Notify BTL + * Call ft_event in: + * - BTL modules + * - MPool modules + * + * These should be cleaning out stale state, and memory references in + * preparation for being shut down. 
*/ - if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) { - opal_output_verbose(10, ompi_cr_output, - "bml:r2: ft_event: Notify the %s BTL.\n", - (mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name); - if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) { - continue; + for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) { + /* + * Notify Mpool + */ + if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool && + NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) { + opal_output_verbose(10, ompi_cr_output, + "bml:r2: ft_event: Notify the %s MPool.\n", + (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name); + if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) { + continue; + } + } + + /* + * Notify BTL + */ + if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) { + opal_output_verbose(10, ompi_cr_output, + "bml:r2: ft_event: Notify the %s BTL.\n", + (mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name); + if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) { + continue; + } } } - } + } /* OPAL_CRS_CONTINUE == state && !first_continue_pass */ } if(OPAL_CRS_CHECKPOINT == state) { ; } else if(OPAL_CRS_CONTINUE == state) { - ; + /* Matches OPAL_CRS_RESTART_PRE */ + if( ompi_cr_continue_like_restart && first_continue_pass) { + if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) { + opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n"); + return ret; + } + } + /* Matches OPAL_CRS_RESTART */ + else if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Barrier to make all processes have been successfully restarted before + * we try to remove some restart only files. 
+ */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + opal_output_verbose(10, ompi_cr_output, + "bml:r2: ft_event(Restart): Cleanup restart files\n"); + opal_crs_base_cleanup_flush(); + + /* + * Re-open the BTL framework to get the full list of components. + */ + if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) { + opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n"); + return ret; + } + + /* + * Re-select the BTL components/modules + * This will cause the BTL components to discover the available + * network options on this machine, and post proper modex informaiton. + */ + if( OMPI_SUCCESS != (ret = mca_btl_base_select(OMPI_ENABLE_PROGRESS_THREADS, + OMPI_ENABLE_MPI_THREADS) ) ) { + opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n"); + return ret; + } + + /* + * Clear some structures so we can properly repopulate them + */ + mca_bml_r2.btls_added = false; + + for(p = 0; p < (int)num_procs; ++p) { + if( NULL != procs[p]->proc_bml) { + OBJ_RELEASE(procs[p]->proc_bml); + procs[p]->proc_bml = NULL; + } + + OBJ_RELEASE(procs[p]); + } + + if( NULL != procs ) { + free(procs); + procs = NULL; + } + } } else if(OPAL_CRS_RESTART_PRE == state ) { opal_output_verbose(10, ompi_cr_output, @@ -163,12 +237,35 @@ int mca_bml_r2_ft_event(int state) /* * Re-open the BTL framework to get the full list of components. 
+ * - but first clear the MCA value that was there */ + param_type = mca_base_param_find("btl", NULL, NULL); + mca_base_param_lookup_string(param_type, &param_list); + opal_output_verbose(11, ompi_cr_output, + "Restart (Previous BTL MCA): <%s>\n", param_list); + if( NULL != param_list ) { + free(param_list); + param_list = NULL; + } + + /* Deregister the old value, and refresh the file cache to grab any updates */ + mca_base_param_deregister(param_type); + mca_base_param_recache_files(false); + if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) { opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n"); return ret; } - + + param_type = mca_base_param_find("btl", NULL, NULL); + mca_base_param_lookup_string(param_type, &param_list); + opal_output_verbose(11, ompi_cr_output, + "Restart (New BTL MCA): <%s>\n", param_list); + if( NULL != param_list ) { + free(param_list); + param_list = NULL; + } + /* * Re-select the BTL components/modules * This will cause the BTL components to discover the available diff --git a/ompi/mca/btl/mx/btl_mx.c b/ompi/mca/btl/mx/btl_mx.c index ae73caa537..078aee0ac7 100644 --- a/ompi/mca/btl/mx/btl_mx.c +++ b/ompi/mca/btl/mx/btl_mx.c @@ -19,6 +19,10 @@ #include "ompi_config.h" #include "opal/util/if.h" +#if OPAL_ENABLE_FT == 1 +#include "ompi/runtime/ompi_cr.h" +#endif + #include "btl_mx.h" #include "btl_mx_frag.h" #include "btl_mx_proc.h" @@ -616,9 +620,36 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl ) } +#if OPAL_ENABLE_FT == 0 int mca_btl_mx_ft_event(int state) { + return OMPI_SUCCESS; +} +#else +int mca_btl_mx_ft_event(int state) { + mca_btl_mx_module_t* mx_btl; + int i; + if(OPAL_CRS_CHECKPOINT == state) { - ; + /* Continue must reconstruct the routes (including modex), since we + * have to tear down the devices completely. + * We have to do this because the MX driver can be checkpointed, but + * cannot be restarted with BLCR due to an mmap problem.
If we do not + * close MX then BLCR throws the following error in /var/log/messages: + * kernel: do_mmap(, 00002aaab0aac000, 0000000000400000, ...) failed: ffffffffffffffff + * kernel: vmadump: mmap failed: /dev/mx0 + * kernel: blcr: thaw_threads returned error, aborting. -1 + * JJH: It may be possible to, instead of restarting the entire driver, just reconnect endpoints + */ + ompi_cr_continue_like_restart = true; + + for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) { + mx_btl = mca_btl_mx_component.mx_btls[i]; + + if( NULL != mx_btl->mx_endpoint ) { + mx_close_endpoint(mx_btl->mx_endpoint); + mx_btl->mx_endpoint = NULL; + } + } } else if(OPAL_CRS_CONTINUE == state) { ; @@ -635,6 +666,7 @@ int mca_btl_mx_ft_event(int state) { return OMPI_SUCCESS; } +#endif /* OPAL_ENABLE_FT */ mca_btl_mx_module_t mca_btl_mx_module = { { diff --git a/ompi/mca/btl/mx/btl_mx_component.c b/ompi/mca/btl/mx/btl_mx_component.c index 690e8ebeac..f733bc1e9f 100644 --- a/ompi/mca/btl/mx/btl_mx_component.c +++ b/ompi/mca/btl/mx/btl_mx_component.c @@ -58,8 +58,8 @@ mca_btl_mx_component_t mca_btl_mx_component = { mca_btl_mx_component_close /* component close */ }, { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT }, mca_btl_mx_component_init, diff --git a/ompi/mca/btl/openib/btl_openib.c b/ompi/mca/btl/openib/btl_openib.c index e8837ac377..144cb46eb1 100644 --- a/ompi/mca/btl/openib/btl_openib.c +++ b/ompi/mca/btl/openib/btl_openib.c @@ -32,6 +32,11 @@ #include "ompi/mca/pml/pml.h" #include "ompi/mca/btl/btl.h" #include "ompi/mca/btl/base/btl_base_error.h" + +#if OPAL_ENABLE_FT == 1 +#include "ompi/runtime/ompi_cr.h" +#endif + #include "btl_openib.h" #include "btl_openib_frag.h" #include "btl_openib_proc.h" @@ -91,6 +96,10 @@ mca_btl_openib_module_t mca_btl_openib_module = { } }; +#if OPAL_ENABLE_FT == 1 +static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl); 
+#endif + static void show_init_error(const char *file, int line, const char *func, const char *dev) { @@ -936,6 +945,11 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl) openib_btl = (mca_btl_openib_module_t*) btl; + /* Sanity check */ + if( mca_btl_openib_component.ib_num_btls <= 0 ) { + return 0; + } + /* Release all QPs */ for (ep_index=0; ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints); @@ -1185,12 +1199,36 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl, return OMPI_SUCCESS; } +#if OPAL_ENABLE_FT == 0 int mca_btl_openib_ft_event(int state) { + return OMPI_SUCCESS; +} +#else +int mca_btl_openib_ft_event(int state) { + int i; + if(OPAL_CRS_CHECKPOINT == state) { - ; + /* Continue must reconstruct the routes (including modex), since we + * have to tear down the devices completely. */ + ompi_cr_continue_like_restart = true; + + /* + * To keep the node from crashing we need to call ibv_close_device + * before the checkpoint is taken. To do this we need to tear + * everything down, and rebuild it all on continue/restart. :( + */ + + /* Shutdown all modules + * - Do this backwards since the openib_finalize function also loops + * over this variable. 
/**
 * Tear down the resources of one openib BTL module.
 *
 * Called from the OPAL_CRS_CHECKPOINT branch of mca_btl_openib_ft_event()
 * for each module in mca_btl_openib_component.openib_btls. The steps, in
 * order, are:
 *   1. release every endpoint on the module's device that belongs to this
 *      module (plus the extra reference held by an eager RDMA buffer slot),
 *   2. finalize and free the module's CPC (connect) modules,
 *   3. clean pending fragments and destroy the SRQ of every QP that is not
 *      a per-peer QP,
 *   4. drop this module's use of the device, releasing it on last use,
 *   5. reset component-wide bookkeeping.
 *
 * NOTE(review): step 5 zeroes mca_btl_openib_component.ib_num_btls and
 * destructs ib_procs on *every* call. The caller's loop is bounded by
 * ib_num_btls, so after the first call the loop condition is false and any
 * remaining modules are never torn down. Presumably the component-wide
 * resets belong in the caller, after its loop -- confirm and fix both
 * functions together.
 *
 * @param btl  Base module pointer; cast to mca_btl_openib_module_t.
 * @return OMPI_SUCCESS, or OMPI_ERROR if any ibv_destroy_srq() call failed
 *         (teardown still continues past the failure).
 */
static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl) {
    mca_btl_openib_module_t* openib_btl;
    mca_btl_openib_endpoint_t* endpoint;
    int ep_index, i;
    int qp, rc = OMPI_SUCCESS;

    openib_btl = (mca_btl_openib_module_t*) btl;

    /* Release all QPs */
    for(ep_index=0;
        ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
        ep_index++) {
        endpoint=opal_pointer_array_get_item(openib_btl->device->endpoints,
                                             ep_index);
        if(!endpoint) {
            BTL_VERBOSE(("In finalize, got another null endpoint"));
            continue;
        }
        /* The endpoint array is per device; skip endpoints owned by a
         * different BTL module on the same device. */
        if(endpoint->endpoint_btl != openib_btl)
            continue;
        /* Clear any eager RDMA slot that still points at this endpoint and
         * release once for that slot, before the release below. */
        for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
            if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
                openib_btl->device->eager_rdma_buffers[i] = NULL;
                OBJ_RELEASE(endpoint);
            }
        }
        OBJ_RELEASE(endpoint);
    }

    /* Finalize the CPC modules on this openib module */
    for (i = 0; i < openib_btl->num_cpcs; ++i) {
        /* cbm_finalize is optional; the CPC module itself is always freed */
        if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
            openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
        }
        free(openib_btl->cpcs[i]);
    }
    /* NOTE(review): cpcs and num_cpcs are left as-is after this free, so
     * the pointer dangles until the module is rebuilt. */
    free(openib_btl->cpcs);

    /* Release SRQ resources */
    for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        /* Only QPs that are not per-peer are handled here */
        if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                    &openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
            MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
                    &openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
            /* A failed destroy is recorded in rc but does not stop the
             * rest of the teardown. */
            if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
                BTL_VERBOSE(("Failed to close SRQ %d", qp));
                rc = OMPI_ERROR;
            }
            OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
            OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
        }
    }

    /* Release device if there are no more users */
    if(!(--openib_btl->device->btls)) {
        OBJ_RELEASE(openib_btl->device);
    }
    /* Component-wide reset. With ib_num_btls at 0, the sanity check at the
     * top of mca_btl_openib_finalize() returns early. See the NOTE(review)
     * in the header about doing this once per module. */
    mca_btl_openib_component.devices_count = 0;
    mca_btl_openib_component.ib_num_btls = 0;
    OBJ_DESTRUCT(&mca_btl_openib_component.ib_procs);

    BTL_VERBOSE(("Success in closing BTL resources"));

    return rc;
}
mca_btl_sm_send( struct mca_btl_base_module_t* btl, return 0; } +#if OPAL_ENABLE_FT == 0 int mca_btl_sm_ft_event(int state) { + return OMPI_SUCCESS; +} +#else +int mca_btl_sm_ft_event(int state) { + /* Notify mpool */ + if( NULL != mca_btl_sm_component.sm_mpool && + NULL != mca_btl_sm_component.sm_mpool->mpool_ft_event) { + mca_btl_sm_component.sm_mpool->mpool_ft_event(state); + } + if(OPAL_CRS_CHECKPOINT == state) { - ; + if( NULL != mca_btl_sm_component.mmap_file ) { + /* On restart we need the old file names to exist (not necessarily + * contain content) so the CRS component does not fail when searching + * for these old file handles. The restart procedure will make sure + * these files get cleaned up appropriately. + */ + opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, mca_btl_sm_component.mmap_file->map_path); + + /* Record the job session directory */ + opal_crs_base_metadata_write_token(NULL, CRS_METADATA_MKDIR, orte_process_info.job_session_dir); + } } else if(OPAL_CRS_CONTINUE == state) { - ; + if( ompi_cr_continue_like_restart ) { + if( NULL != mca_btl_sm_component.mmap_file ) { + /* Do not Add session directory on continue */ + + /* Add shared memory file */ + opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false); + } + + /* Clear this so we force the module to re-init the sm files */ + mca_btl_sm_component.sm_mpool = NULL; + } } - else if(OPAL_CRS_RESTART == state) { - ; + else if(OPAL_CRS_RESTART == state || + OPAL_CRS_RESTART_PRE == state) { + if( NULL != mca_btl_sm_component.mmap_file ) { + /* Add session directory */ + opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true); + /* Add shared memory file */ + opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false); + } + + /* Clear this so we force the module to re-init the sm files */ + mca_btl_sm_component.sm_mpool = NULL; } else if(OPAL_CRS_TERM == state ) { ; @@ -843,3 +889,4 @@ int mca_btl_sm_ft_event(int state) { return 
OMPI_SUCCESS; } +#endif /* OPAL_ENABLE_FT */ diff --git a/ompi/mca/btl/sm/btl_sm_component.c b/ompi/mca/btl/sm/btl_sm_component.c index bba99d346b..a3875b5920 100644 --- a/ompi/mca/btl/sm/btl_sm_component.c +++ b/ompi/mca/btl/sm/btl_sm_component.c @@ -52,6 +52,11 @@ #include "ompi/mca/mpool/base/base.h" #include "ompi/mca/common/sm/common_sm_mmap.h" #include "ompi/mca/btl/base/btl_base_error.h" + +#if OPAL_ENABLE_FT == 1 +#include "opal/runtime/opal_cr.h" +#endif + #include "btl_sm.h" #include "btl_sm_frag.h" #include "btl_sm_fifo.h" @@ -74,8 +79,8 @@ mca_btl_sm_component_t mca_btl_sm_component = { mca_btl_sm_component_close /* component close */ }, { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT }, mca_btl_sm_component_init, @@ -210,7 +215,17 @@ int mca_btl_sm_component_close(void) * to it are gone - no error checking, since we want all procs * to call this, so that in an abnormal termination scenario, * this file will still get cleaned up */ +#if OPAL_ENABLE_FT == 1 + /* Only unlink the file if we are *not* restarting + * If we are restarting the file will be unlinked at a later time. 
+ */ + if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state && + OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) { + unlink(mca_btl_sm_component.mmap_file->map_path); + } +#else unlink(mca_btl_sm_component.mmap_file->map_path); +#endif OBJ_RELEASE(mca_btl_sm_component.mmap_file); } diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c index a9ec35bf07..c8550fcde1 100644 --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c @@ -42,6 +42,7 @@ #include "ompi/mca/crcp/base/base.h" #include "ompi/class/ompi_free_list.h" +#include "ompi/runtime/ompi_cr.h" #include "crcp_bkmrk.h" #include "crcp_bkmrk_pml.h" @@ -2971,6 +2972,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( ompi_crcp_base_pml_state_t* pml_state) { static int step_to_return_to = 0; + static bool first_continue_pass = false; opal_list_item_t* item = NULL; int exit_status = OMPI_SUCCESS; int ret; @@ -2992,6 +2994,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( goto DONE; } + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0); + START_TIMER(CRCP_TIMER_TOTAL_CKPT); STEP_1: step_to_return_to = 0; @@ -3030,7 +3038,15 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( goto DONE; } + first_continue_pass = !first_continue_pass; + + /* Only finalize the Protocol after the PML has been rebuilt */ + if( ompi_cr_continue_like_restart && first_continue_pass ) { + goto DONE; + } + START_TIMER(CRCP_TIMER_TOTAL_CONT); + /* * Finish the coord protocol */ @@ -3045,6 +3061,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event( DISPLAY_ALL_TIMERS(state); clear_timers(); + + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2); } /***************************** * Restart from a checkpoint diff 
--git a/ompi/mca/mpool/rdma/mpool_rdma.h b/ompi/mca/mpool/rdma/mpool_rdma.h index bf4925b3f6..80ab4697da 100644 --- a/ompi/mca/mpool/rdma/mpool_rdma.h +++ b/ompi/mca/mpool/rdma/mpool_rdma.h @@ -122,6 +122,14 @@ int mca_mpool_rdma_release_memory(mca_mpool_base_module_t* mpool, void *base, * finalize mpool */ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool); + +/** + * Fault Tolerance Event Notification Function + * @param state Checkpoint Stae + * @return OMPI_SUCCESS or failure status + */ +int mca_mpool_rdma_ft_event(int state); + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/ompi/mca/mpool/rdma/mpool_rdma_component.c b/ompi/mca/mpool/rdma/mpool_rdma_component.c index 6b91932936..a39618dbc4 100644 --- a/ompi/mca/mpool/rdma/mpool_rdma_component.c +++ b/ompi/mca/mpool/rdma/mpool_rdma_component.c @@ -56,8 +56,8 @@ mca_mpool_rdma_component_t mca_mpool_rdma_component = { NULL }, { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT }, mca_mpool_rdma_init diff --git a/ompi/mca/mpool/rdma/mpool_rdma_module.c b/ompi/mca/mpool/rdma/mpool_rdma_module.c index d93d41effe..e550e403ed 100644 --- a/ompi/mca/mpool/rdma/mpool_rdma_module.c +++ b/ompi/mca/mpool/rdma/mpool_rdma_module.c @@ -54,6 +54,7 @@ void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool) mpool->super.mpool_deregister = mca_mpool_rdma_deregister; mpool->super.mpool_release_memory = mca_mpool_rdma_release_memory; mpool->super.mpool_finalize = mca_mpool_rdma_finalize; + mpool->super.mpool_ft_event = mca_mpool_rdma_ft_event; mpool->super.rcache = mca_rcache_base_module_create(mca_mpool_rdma_component.rcache_name); mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM; @@ -478,3 +479,24 @@ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool) OBJ_DESTRUCT(&mpool_rdma->reg_list); OPAL_THREAD_UNLOCK(&mpool->rcache->lock); } + +int 
mca_mpool_rdma_ft_event(int state) { + if(OPAL_CRS_CHECKPOINT == state) { + ; + } + else if(OPAL_CRS_CONTINUE == state) { + ; + } + else if(OPAL_CRS_RESTART == state || + OPAL_CRS_RESTART_PRE == state) { + ; + } + else if(OPAL_CRS_TERM == state ) { + ; + } + else { + ; + } + + return OMPI_SUCCESS; +} diff --git a/ompi/mca/mpool/sm/mpool_sm_component.c b/ompi/mca/mpool/sm/mpool_sm_component.c index 41930824c5..2e01f82f76 100644 --- a/ompi/mca/mpool/sm/mpool_sm_component.c +++ b/ompi/mca/mpool/sm/mpool_sm_component.c @@ -38,6 +38,10 @@ #include "ompi/mca/common/sm/common_sm_mmap.h" #include "ompi/proc/proc.h" +#if OPAL_ENABLE_FT == 1 +#include "opal/runtime/opal_cr.h" +#endif + /* * Local functions */ @@ -62,8 +66,8 @@ mca_mpool_sm_component_t mca_mpool_sm_component = { mca_mpool_sm_close }, { - /* The component is not checkpoint ready */ - MCA_BASE_METADATA_PARAM_NONE + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT }, mca_mpool_sm_init @@ -134,7 +138,17 @@ static int mca_mpool_sm_close( void ) { if( NULL != mca_common_sm_mmap ) { if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) { +#if OPAL_ENABLE_FT == 1 + /* Only unlink the file if we are *not* restarting + * If we are restarting the file will be unlinked at a later time. 
+ */ + if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state && + OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) { + unlink( mca_common_sm_mmap->map_path ); + } +#else unlink( mca_common_sm_mmap->map_path ); +#endif } OBJ_RELEASE( mca_common_sm_mmap ); } diff --git a/ompi/mca/mpool/sm/mpool_sm_module.c b/ompi/mca/mpool/sm/mpool_sm_module.c index e394c7cd4f..295c568485 100644 --- a/ompi/mca/mpool/sm/mpool_sm_module.c +++ b/ompi/mca/mpool/sm/mpool_sm_module.c @@ -28,7 +28,12 @@ #include "opal/mca/maffinity/maffinity.h" #include "opal/mca/maffinity/maffinity_types.h" #include "opal/mca/maffinity/base/base.h" +#include "orte/util/proc_info.h" +#if OPAL_ENABLE_FT == 1 +#include "ompi/mca/mpool/base/base.h" +#include "ompi/runtime/ompi_cr.h" +#endif /* * Initializes the mpool module. @@ -116,15 +121,54 @@ void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr, mpool_sm->sm_allocator->alc_free(mpool_sm->sm_allocator, addr); } +#if OPAL_ENABLE_FT == 0 int mca_mpool_sm_ft_event(int state) { + return OMPI_SUCCESS; +} +#else +int mca_mpool_sm_ft_event(int state) { + mca_mpool_base_module_t *self_module = NULL; + char * file_name = NULL; + if(OPAL_CRS_CHECKPOINT == state) { - ; + /* Record the shared memory filename */ + asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s", + orte_process_info.job_session_dir, + orte_process_info.nodename ); + opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, file_name); + free(file_name); + file_name = NULL; } else if(OPAL_CRS_CONTINUE == state) { - ; + if(ompi_cr_continue_like_restart) { + /* Remove self from the list of all modules */ + self_module = mca_mpool_base_module_lookup("sm"); + mca_mpool_base_module_destroy(self_module); + + /* Release the old sm file, if it exists */ + if( NULL != mca_common_sm_mmap ) { + if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) { + /* Add old shared memory file for eventual removal */ + 
opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false); + } + OBJ_RELEASE( mca_common_sm_mmap ); + } + } } - else if(OPAL_CRS_RESTART == state) { - ; + else if(OPAL_CRS_RESTART == state || + OPAL_CRS_RESTART_PRE == state) { + /* Remove self from the list of all modules */ + self_module = mca_mpool_base_module_lookup("sm"); + mca_mpool_base_module_destroy(self_module); + + /* Release the old sm file, if it exists */ + if( NULL != mca_common_sm_mmap ) { + if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) { + /* Add old shared memory file for eventual removal */ + opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false); + } + OBJ_RELEASE( mca_common_sm_mmap ); + } } else if(OPAL_CRS_TERM == state ) { ; @@ -135,3 +179,4 @@ int mca_mpool_sm_ft_event(int state) { return OMPI_SUCCESS; } +#endif /* OPAL_ENABLE_FT */ diff --git a/ompi/mca/pml/ob1/pml_ob1.c b/ompi/mca/pml/ob1/pml_ob1.c index 02f3684697..ef8250b224 100644 --- a/ompi/mca/pml/ob1/pml_ob1.c +++ b/ompi/mca/pml/ob1/pml_ob1.c @@ -547,17 +547,61 @@ void mca_pml_ob1_error_handler( orte_errmgr.abort(-1, NULL); } +#if OPAL_ENABLE_FT == 0 +int mca_pml_ob1_ft_event( int state ) { + return OMPI_SUCCESS; +} +#else int mca_pml_ob1_ft_event( int state ) { + static bool first_continue_pass = false; ompi_proc_t** procs = NULL; size_t num_procs; int ret, p; if(OPAL_CRS_CHECKPOINT == state) { - ; + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1); + orte_grpcomm.barrier(); + } + + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0); } else if(OPAL_CRS_CONTINUE == state) { - ; + first_continue_pass = !first_continue_pass; + + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Get a list of processes + */ + procs = ompi_proc_all(&num_procs); + if(NULL == procs) { + 
return OMPI_ERR_OUT_OF_RESOURCE; + } + + /* + * Refresh the proc structure, and publish our proc info in the modex. + * NOTE: Do *not* call ompi_proc_finalize as there are many places in + * the code that point to indv. procs in this strucutre. For our + * needs here we only need to fix up the modex, bml and pml + * references. + */ + if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) { + opal_output(0, + "pml:ob1: ft_event(Restart): proc_refresh Failed %d", + ret); + free (procs); + return ret; + } + } } else if(OPAL_CRS_RESTART_PRE == state ) { /* Nothing here */ @@ -612,10 +656,64 @@ int mca_pml_ob1_ft_event( int state ) } if(OPAL_CRS_CHECKPOINT == state) { - ; + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1); + + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0); + /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/ + } } else if(OPAL_CRS_CONTINUE == state) { - ; + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); + } + + if( ompi_cr_continue_like_restart && !first_continue_pass ) { + /* + * Exchange the modex information once again. + * BTLs will have republished their modex information. + */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) { + opal_output(0, + "pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d", + ret); + return ret; + } + + /* + * Startup the PML stack now that the modex is running again + * Add the new procs (BTLs redo modex recv's) + */ + if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret); + return ret; + } + + /* Is this barrier necessary ? 
JJH */ + if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) { + opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret); + return ret; + } + + if( NULL != procs ) { + for(p = 0; p < (int)num_procs; ++p) { + OBJ_RELEASE(procs[p]); + } + free(procs); + procs = NULL; + } + } + if( !first_continue_pass ) { + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); + orte_grpcomm.barrier(); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); + } } else if(OPAL_CRS_RESTART_PRE == state ) { /* Nothing here */ @@ -664,6 +762,7 @@ int mca_pml_ob1_ft_event( int state ) return OMPI_SUCCESS; } +#endif /* OPAL_ENABLE_FT */ int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2) { diff --git a/ompi/runtime/ompi_cr.c b/ompi/runtime/ompi_cr.c index e558e18f51..2c11251c28 100644 --- a/ompi/runtime/ompi_cr.c +++ b/ompi/runtime/ompi_cr.c @@ -50,6 +50,7 @@ #include "orte/mca/snapc/base/base.h" #include "orte/runtime/runtime.h" #include "orte/util/show_help.h" +#include "orte/mca/grpcomm/grpcomm.h" #include "ompi/constants.h" #include "ompi/mca/pml/pml.h" @@ -72,6 +73,8 @@ static int ompi_cr_coord_post_ckpt(void); static int ompi_cr_coord_post_restart(void); static int ompi_cr_coord_post_continue(void); +bool ompi_cr_continue_like_restart = false; + /************* * Local vars *************/ @@ -159,6 +162,9 @@ int ompi_cr_init(void) ompi_cr_output = opal_cr_output; } + /* Typically this is not needed. Individual BTLs will set this as needed */ + ompi_cr_continue_like_restart = false; + opal_output_verbose(10, ompi_cr_output, "ompi_cr: init: ompi_cr_init()"); @@ -195,6 +201,9 @@ int ompi_cr_coord(int state) * take action given the state. 
*/ if(OPAL_CRS_CHECKPOINT == state) { + /* Default: use the fast way */ + ompi_cr_continue_like_restart = false; + /* Do Checkpoint Phase work */ ret = ompi_cr_coord_pre_ckpt(); if( ret == OMPI_EXISTS) { @@ -317,6 +326,8 @@ static int ompi_cr_coord_pre_restart(void) { } static int ompi_cr_coord_pre_continue(void) { + int ret, exit_status = OMPI_SUCCESS; + /* * Can not really do much until ORTE is up and running, * so defer action until the post_continue function. @@ -324,7 +335,26 @@ static int ompi_cr_coord_pre_continue(void) { opal_output_verbose(10, ompi_cr_output, "ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()"); - return OMPI_SUCCESS; + if( ompi_cr_continue_like_restart ) { + /* Mimic ompi_cr_coord_pre_restart(); */ + if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) { + exit_status = ret; + goto cleanup; + } + } + else { /* Fast path: the PML is not called here, so only record the timer marks */ + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3); + if( opal_cr_timing_barrier_enabled ) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2); + } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1); + } + + cleanup: + return exit_status; } /************* diff --git a/ompi/runtime/ompi_cr.h b/ompi/runtime/ompi_cr.h index a771e0ec60..388854d4b1 100644 --- a/ompi/runtime/ompi_cr.h +++ b/ompi/runtime/ompi_cr.h @@ -49,6 +49,12 @@ extern "C" { */ OMPI_DECLSPEC extern int ompi_cr_output; + /* + * True when one of the BTLs that shut down requires a full, clean rebuild of the + * point-to-point stack on 'continue' as well as on 'restart'.
+ */ + OMPI_DECLSPEC extern bool ompi_cr_continue_like_restart; + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/opal/mca/base/mca_base_param.c b/opal/mca/base/mca_base_param.c index 56e28177da..40b3e50659 100644 --- a/opal/mca/base/mca_base_param.c +++ b/opal/mca/base/mca_base_param.c @@ -503,6 +503,21 @@ int mca_base_param_set_int(int index, int value) return OPAL_SUCCESS; } +/* + * Deregister a parameter: remove the entry at 'index' from mca_base_params + */ +int mca_base_param_deregister(int index) +{ + size_t size; + + /* Lookup the index and see if it's valid (valid indices are 0 .. size-1) */ + size = opal_value_array_get_size(&mca_base_params); + if (index < 0 || ((size_t) index) >= size) { + return OPAL_ERROR; + } + + return opal_value_array_remove_item(&mca_base_params, index); +} /* * Look up a string MCA parameter. diff --git a/opal/mca/base/mca_base_param.h b/opal/mca/base/mca_base_param.h index 921d29a532..7e90179778 100644 --- a/opal/mca/base/mca_base_param.h +++ b/opal/mca/base/mca_base_param.h @@ -470,6 +470,14 @@ extern "C" { const char *syn_param_name, bool deprecated); + /** + * Deregister a MCA parameter + * + * @param index Index returned from mca_base_param_register_init() + * @retval OPAL_ERROR If index is negative or not a valid parameter index + */ + OPAL_DECLSPEC int mca_base_param_deregister(int index); + /** * Look up an integer MCA parameter.
* diff --git a/opal/mca/crs/base/crs_base_fns.c b/opal/mca/crs/base/crs_base_fns.c index 6c49509608..dbe0691720 100644 --- a/opal/mca/crs/base/crs_base_fns.c +++ b/opal/mca/crs/base/crs_base_fns.c @@ -304,7 +304,7 @@ int opal_crs_base_cleanup_flush(void) argc = opal_argv_count(cleanup_file_argv); for( i = 0; i < argc; ++i) { opal_output_verbose(15, opal_crs_base_output, - "opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_dir_argv[i]); + "opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_file_argv[i]); unlink(cleanup_file_argv[i]); } diff --git a/opal/runtime/opal_cr.c b/opal/runtime/opal_cr.c index a1ffcb7459..de9e2a75ed 100644 --- a/opal/runtime/opal_cr.c +++ b/opal/runtime/opal_cr.c @@ -77,6 +77,14 @@ bool opal_cr_stall_check = false; bool opal_cr_currently_stalled = false; int opal_cr_output; +static double opal_cr_get_time(void); +static void display_indv_timer_core(double diff, char *str); +static double timer_start[OPAL_CR_TIMER_MAX]; +bool opal_cr_timing_barrier_enabled = false; +bool opal_cr_timing_enabled = false; +int opal_cr_timing_my_rank = 0; +int opal_cr_timing_target_rank = 0; + /****************** * Local Functions & Var Decls ******************/ @@ -214,6 +222,28 @@ int opal_cr_init(void ) "opal_cr: init: FT Enabled: %d", val); + mca_base_param_reg_int_name("opal_cr", "enable_timer", + "Enable Checkpoint timer (Default: Disabled)", + false, false, + 0, &val); + opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val); + + mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier", + "Enable Checkpoint timer Barrier (Default: Disabled)", + false, false, + 0, &val); + if( opal_cr_timing_enabled ) { + opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val); + } else { + opal_cr_timing_barrier_enabled = false; + } + + mca_base_param_reg_int_name("opal_cr", "timer_target_rank", + "Target Rank for the timer (Default: 0)", + false, false, + 0, &val); + opal_cr_timing_target_rank = val; + #if OPAL_ENABLE_FT_THREAD == 1 
mca_base_param_reg_int_name("opal_cr", "use_thread", "Use an async thread to checkpoint this program (Default: Disabled)", @@ -505,6 +535,7 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i /* * Take the checkpoint */ + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0); if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state))) { opal_output(opal_cr_output, "opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret); @@ -513,6 +544,8 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i } if(*state == OPAL_CRS_CONTINUE) { + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1); + if(term) { *state = OPAL_CRS_TERM; opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM; @@ -869,3 +902,127 @@ void opal_cr_thread_noop_progress(void) } #endif /* OPAL_ENABLE_FT_THREAD == 1 */ +/* Return the current wall-clock time in seconds as a double */ +static double opal_cr_get_time(void) { + double wtime; + +#if OPAL_TIMER_USEC_NATIVE + wtime = (double)opal_timer_base_get_usec() / 1000000.0; +#else + struct timeval tv; + gettimeofday(&tv, NULL); + wtime = tv.tv_sec; + wtime += (double)tv.tv_usec / 1000000.0; +#endif + + return wtime; +} +/* Stamp timer slot idx with the current time unless it is already set; an out-of-range idx is ignored */ +void opal_cr_set_time(int idx) +{ + if(idx >= 0 && idx < OPAL_CR_TIMER_MAX ) { + if( timer_start[idx] <= 0.0 ) { + timer_start[idx] = opal_cr_get_time(); + } + } +} +/* Reset every timer slot to 0.0, which opal_cr_set_time() treats as unset */ +void opal_cr_clear_timers(void) +{ + int i; + for(i = 0; i < OPAL_CR_TIMER_MAX; ++i) { + timer_start[i] = 0.0; + } +} +/* Print one summary row: label, elapsed time, total (last slot minus ENTRY0), and percent of total */ +static void display_indv_timer_core(double diff, char *str) { + double total = 0; + double perc = 0; + + total = timer_start[OPAL_CR_TIMER_MAX-1] - timer_start[OPAL_CR_TIMER_ENTRY0]; + perc = (total > 0.0) ? (diff/total) * 100 : 0.0; + + opal_output(0, + "opal_cr: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n", + str, + diff, + total, + perc); + return; +} + +void opal_cr_display_all_timers(void) +{ + double diff = 0.0; + char * label = NULL; + + if( opal_cr_timing_target_rank != opal_cr_timing_my_rank ) { + return; + } + + opal_output(0, "OPAL CR Timing: ******************** Summary
Begin\n"); + + /********** Entry into the system **********/ + label = strdup("Start Entry Point"); + if( opal_cr_timing_barrier_enabled ) { + diff = timer_start[OPAL_CR_TIMER_CRCPBR0] - timer_start[OPAL_CR_TIMER_ENTRY0]; + } else { + diff = timer_start[OPAL_CR_TIMER_CRCP0] - timer_start[OPAL_CR_TIMER_ENTRY0]; + } + display_indv_timer_core(diff, label); + free(label); + + /********** CRCP Protocol **********/ + label = strdup("CRCP Protocol"); + if( opal_cr_timing_barrier_enabled ) { + diff = timer_start[OPAL_CR_TIMER_CRCPBR1] - timer_start[OPAL_CR_TIMER_CRCP0]; + } else { + diff = timer_start[OPAL_CR_TIMER_P2P0] - timer_start[OPAL_CR_TIMER_CRCP0]; + } + display_indv_timer_core(diff, label); + free(label); + + /********** P2P Suspend **********/ + label = strdup("P2P Suspend"); + if( opal_cr_timing_barrier_enabled ) { + diff = timer_start[OPAL_CR_TIMER_P2PBR0] - timer_start[OPAL_CR_TIMER_P2P0]; + } else { + diff = timer_start[OPAL_CR_TIMER_CORE0] - timer_start[OPAL_CR_TIMER_P2P0]; + } + display_indv_timer_core(diff, label); + free(label); + + /********** Checkpoint to Disk **********/ + label = strdup("Checkpoint"); + diff = timer_start[OPAL_CR_TIMER_CORE1] - timer_start[OPAL_CR_TIMER_CORE0]; + display_indv_timer_core(diff, label); + free(label); + + /********** P2P Reactivation **********/ + label = strdup("P2P Reactivation"); + if( opal_cr_timing_barrier_enabled ) { + diff = timer_start[OPAL_CR_TIMER_P2PBR2] - timer_start[OPAL_CR_TIMER_CORE1]; + } else { + diff = timer_start[OPAL_CR_TIMER_CRCP1] - timer_start[OPAL_CR_TIMER_CORE1]; + } + display_indv_timer_core(diff, label); + free(label); + + /********** CRCP Protocol Finalize **********/ + label = strdup("CRCP Cleanup"); + if( opal_cr_timing_barrier_enabled ) { + diff = timer_start[OPAL_CR_TIMER_COREBR1] - timer_start[OPAL_CR_TIMER_CRCP1]; + } else { + diff = timer_start[OPAL_CR_TIMER_CORE2] - timer_start[OPAL_CR_TIMER_CRCP1]; + } + display_indv_timer_core(diff, label); + free(label); + + /********** Exit the 
system **********/ + label = strdup("Finish Entry Point"); + diff = timer_start[OPAL_CR_TIMER_ENTRY4] - timer_start[OPAL_CR_TIMER_CORE2]; + display_indv_timer_core(diff, label); + free(label); + + opal_output(0, "OPAL CR Timing: ******************** Summary End\n"); +} diff --git a/opal/runtime/opal_cr.h b/opal/runtime/opal_cr.h index 6f686e64fa..f5f89add99 100644 --- a/opal/runtime/opal_cr.h +++ b/opal/runtime/opal_cr.h @@ -267,7 +267,65 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t; * OPAL Checkpoint Coordination Routine */ OPAL_DECLSPEC int opal_cr_coord(int state); - + + /** + * Checkpoint life-cycle timing + */ + OPAL_DECLSPEC void opal_cr_set_time(int idx); + OPAL_DECLSPEC void opal_cr_display_all_timers(void); + OPAL_DECLSPEC void opal_cr_clear_timers(void); + + OPAL_DECLSPEC extern bool opal_cr_timing_enabled; + OPAL_DECLSPEC extern bool opal_cr_timing_barrier_enabled; + OPAL_DECLSPEC extern int opal_cr_timing_my_rank; + OPAL_DECLSPEC extern int opal_cr_timing_target_rank; + + +#define OPAL_CR_TIMER_ENTRY0 0 +#define OPAL_CR_TIMER_ENTRY1 1 +#define OPAL_CR_TIMER_ENTRY2 2 +#define OPAL_CR_TIMER_CRCPBR0 3 +#define OPAL_CR_TIMER_CRCP0 4 +#define OPAL_CR_TIMER_CRCPBR1 5 +#define OPAL_CR_TIMER_P2P0 6 +#define OPAL_CR_TIMER_P2P1 7 +#define OPAL_CR_TIMER_P2PBR0 8 +#define OPAL_CR_TIMER_CORE0 9 +#define OPAL_CR_TIMER_CORE1 10 +#define OPAL_CR_TIMER_COREBR0 11 +#define OPAL_CR_TIMER_P2P2 12 +#define OPAL_CR_TIMER_P2PBR1 13 +#define OPAL_CR_TIMER_P2P3 14 +#define OPAL_CR_TIMER_P2PBR2 15 +#define OPAL_CR_TIMER_CRCP1 16 +#define OPAL_CR_TIMER_COREBR1 17 +#define OPAL_CR_TIMER_CORE2 18 +#define OPAL_CR_TIMER_ENTRY3 19 +#define OPAL_CR_TIMER_ENTRY4 20 +#define OPAL_CR_TIMER_MAX 21 + + +#define OPAL_CR_CLEAR_TIMERS() \ + { \ + if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \ + opal_cr_clear_timers(); \ + } \ + } + +#define OPAL_CR_SET_TIMER(idx) \ + { \ + if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \ + opal_cr_set_time(idx); \ + } \ + } + +#define 
OPAL_CR_DISPLAY_ALL_TIMERS() \ + { \ + if(OPAL_UNLIKELY(opal_cr_timing_enabled > 0)) { \ + opal_cr_display_all_timers(); \ + } \ + } + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/orte/mca/oob/tcp/oob_tcp.c b/orte/mca/oob/tcp/oob_tcp.c index 9c0e17f2c1..bf76193b7f 100644 --- a/orte/mca/oob/tcp/oob_tcp.c +++ b/orte/mca/oob/tcp/oob_tcp.c @@ -1613,11 +1613,13 @@ int mca_oob_tcp_ft_event(int state) { */ OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); opal_event_disable(); + OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } else if(OPAL_CRS_CONTINUE == state) { /* * Resume event processing */ + OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); opal_event_enable(); OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock); } @@ -1636,6 +1638,8 @@ int mca_oob_tcp_ft_event(int state) { MCA_OOB_TCP_PEER_RETURN(peer); } + OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock); + OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free); OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names); OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers); diff --git a/orte/mca/snapc/full/snapc_full_app.c b/orte/mca/snapc/full/snapc_full_app.c index e7b8accabc..16babfcd17 100644 --- a/orte/mca/snapc/full/snapc_full_app.c +++ b/orte/mca/snapc/full/snapc_full_app.c @@ -173,6 +173,10 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp) goto STAGE_1; } + OPAL_CR_CLEAR_TIMERS(); + opal_cr_timing_my_rank = ORTE_PROC_MY_NAME->vpid; + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY0); + /* * Open communication channels */ @@ -193,6 +197,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp) goto ckpt_cleanup; } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY1); + /* * Begin checkpoint * - Init the checkpoint metadata file @@ -205,6 +211,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp) goto ckpt_cleanup; } + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY2); + OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "App) notify_response: Start 
checkpoint...")); STAGE_1: @@ -255,6 +263,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp) /* * Final Handshake */ + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY3); OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle, "App) notify_response: Waiting for final handshake.")); if( ORTE_SUCCESS != (ret = snapc_full_app_ckpt_handshake_end(cr_state ) ) ) { @@ -282,6 +291,11 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp) opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE; opal_cr_currently_stalled = false; + OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY4); + if(OPAL_CRS_RESTART != cr_state) { + OPAL_CR_DISPLAY_ALL_TIMERS(); + } + return exit_status; }