1
1

Commit to bring online OpenIB, MX, and shared memory support for Open MPI's checkpoint/restart functionality. Some tuning is still needed, but basic functionality is in place.

There is still a problem with OpenIB and threads (external to the C/R functionality). It has been reported in Ticket #1539.

Additionally:
* Fix a file cleanup bug in CRS Base.
* Fix a possible deadlock in the TCP ft_event function
* Add a mca_base_param_deregister() function to MCA base
* Add whole process checkpoint timers
* Add support for BTL: OpenIB, MX, Shared Memory
* Add support for Mpool: rdma, sm
* Sundry bounds checking and cleanup in some scattered functions

This commit was SVN r19756.
Этот коммит содержится в:
Josh Hursey 2008-10-16 15:09:00 +00:00
родитель b46d3e766e
Коммит 88aa45dd52
25 изменённых файлов: 876 добавлений и 66 удалений

Просмотреть файл

@ -41,3 +41,12 @@ crcp=bkmrk
# Temporary fix to force the event engine to use poll to behave well with BLCR
#
opal_event_include=poll
#
# We currently only support the following options to the OpenIB BTL
# Future development will attempt to eliminate many of these restrictions
#
btl_openib_want_fork_support=1
btl_openib_use_async_event_thread=0
btl_openib_use_eager_rdma=0
btl_openib_cpc_include=oob

Просмотреть файл

@ -43,17 +43,28 @@
int mca_bml_r2_ft_event(int state)
{
static bool first_continue_pass = false;
ompi_proc_t** procs = NULL;
size_t num_procs;
size_t btl_idx;
int ret, p;
int loc_state;
int param_type = -1;
char *param_list = NULL;
if(OPAL_CRS_CHECKPOINT == state) {
/* Do nothing for now */
}
else if(OPAL_CRS_CONTINUE == state) {
/* Since nothing in Checkpoint, we are fine here */
first_continue_pass = !first_continue_pass;
/* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
if( ompi_cr_continue_like_restart && !first_continue_pass) {
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
@ -76,56 +87,119 @@ int mca_bml_r2_ft_event(int state)
* no longer exist.
*/
if( OPAL_CRS_RESTART != state ) {
/* Since we only ever call into the BTLs once during the first restart
* pass, just lie to them on this pass for a bit of local clarity.
*/
if( OPAL_CRS_RESTART_PRE == state ) {
loc_state = OPAL_CRS_RESTART;
if( OPAL_CRS_CONTINUE == state && !first_continue_pass ) {
;
} else {
loc_state = state;
}
/*
* Call ft_event in:
* - BTL modules
* - MPool modules
*
* These should be cleaning out stale state, and memory references in
* preparation for being shut down.
*/
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
/*
* Notify Mpool
/* Since we only ever call into the BTLs once during the first restart
* pass, just lie to them on this pass for a bit of local clarity.
*/
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s MPool.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
continue;
}
if( OPAL_CRS_RESTART_PRE == state ) {
loc_state = OPAL_CRS_RESTART;
} else {
loc_state = state;
}
/*
* Notify BTL
* Call ft_event in:
* - BTL modules
* - MPool modules
*
* These should be cleaning out stale state, and memory references in
* preparation for being shut down.
*/
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s BTL.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
continue;
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
/*
* Notify Mpool
*/
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s MPool.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
continue;
}
}
/*
* Notify BTL
*/
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event: Notify the %s BTL.\n",
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
continue;
}
}
}
}
} /* OPAL_CRS_CONTINUE == state && !first_continue_pass */
}
if(OPAL_CRS_CHECKPOINT == state) {
;
}
else if(OPAL_CRS_CONTINUE == state) {
;
/* Matches OPAL_CRS_RESTART_PRE */
if( ompi_cr_continue_like_restart && first_continue_pass) {
if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
return ret;
}
}
/* Matches OPAL_CRS_RESTART */
else if( ompi_cr_continue_like_restart && !first_continue_pass ) {
/*
* Barrier to make sure all processes have been successfully restarted before
* we try to remove some restart only files.
*/
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
return ret;
}
opal_output_verbose(10, ompi_cr_output,
"bml:r2: ft_event(Restart): Cleanup restart files\n");
opal_crs_base_cleanup_flush();
/*
* Re-open the BTL framework to get the full list of components.
*/
if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
return ret;
}
/*
* Re-select the BTL components/modules
* This will cause the BTL components to discover the available
* network options on this machine, and post proper modex information.
*/
if( OMPI_SUCCESS != (ret = mca_btl_base_select(OMPI_ENABLE_PROGRESS_THREADS,
OMPI_ENABLE_MPI_THREADS) ) ) {
opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
return ret;
}
/*
* Clear some structures so we can properly repopulate them
*/
mca_bml_r2.btls_added = false;
for(p = 0; p < (int)num_procs; ++p) {
if( NULL != procs[p]->proc_bml) {
OBJ_RELEASE(procs[p]->proc_bml);
procs[p]->proc_bml = NULL;
}
OBJ_RELEASE(procs[p]);
}
if( NULL != procs ) {
free(procs);
procs = NULL;
}
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
opal_output_verbose(10, ompi_cr_output,
@ -163,12 +237,35 @@ int mca_bml_r2_ft_event(int state)
/*
* Re-open the BTL framework to get the full list of components.
* - but first clear the MCA value that was there
*/
param_type = mca_base_param_find("btl", NULL, NULL);
mca_base_param_lookup_string(param_type, &param_list);
opal_output_verbose(11, ompi_cr_output,
"Restart (Previous BTL MCA): <%s>\n", param_list);
if( NULL != param_list ) {
free(param_list);
param_list = NULL;
}
/* Deregister the old value, and refresh the file cache to grab any updates */
mca_base_param_deregister(param_type);
mca_base_param_recache_files(false);
if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
return ret;
}
param_type = mca_base_param_find("btl", NULL, NULL);
mca_base_param_lookup_string(param_type, &param_list);
opal_output_verbose(11, ompi_cr_output,
"Restart (New BTL MCA): <%s>\n", param_list);
if( NULL != param_list ) {
free(param_list);
param_list = NULL;
}
/*
* Re-select the BTL components/modules
* This will cause the BTL components to discover the available

Просмотреть файл

@ -19,6 +19,10 @@
#include "ompi_config.h"
#include "opal/util/if.h"
#if OPAL_ENABLE_FT == 1
#include "ompi/runtime/ompi_cr.h"
#endif
#include "btl_mx.h"
#include "btl_mx_frag.h"
#include "btl_mx_proc.h"
@ -616,9 +620,36 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl )
}
#if OPAL_ENABLE_FT == 0
/*
 * Fault tolerance event hook for builds where OPAL_ENABLE_FT is 0:
 * there is nothing to do, so every state is acknowledged with success.
 */
int mca_btl_mx_ft_event(int state)
{
    (void)state;    /* intentionally unused in non-FT builds */

    return OMPI_SUCCESS;
}
#else
int mca_btl_mx_ft_event(int state) {
mca_btl_mx_module_t* mx_btl;
int i;
if(OPAL_CRS_CHECKPOINT == state) {
;
/* Continue must reconstruct the routes (including modex), since we
* have to tear down the devices completely.
* We have to do this because the MX driver can be checkpointed, but
* cannot be restarted with BLCR due to an mmap problem. If we do not
* close MX then BLCR throws the following error in /var/log/messages:
* kernel: do_mmap(<file>, 00002aaab0aac000, 0000000000400000, ...) failed: ffffffffffffffff
* kernel: vmadump: mmap failed: /dev/mx0
* kernel: blcr: thaw_threads returned error, aborting. -1
* JJH: It may be possible to, instead of restarting the entire driver, just reconnect endpoints
*/
ompi_cr_continue_like_restart = true;
for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
mx_btl = mca_btl_mx_component.mx_btls[i];
if( NULL != mx_btl->mx_endpoint ) {
mx_close_endpoint(mx_btl->mx_endpoint);
mx_btl->mx_endpoint = NULL;
}
}
}
else if(OPAL_CRS_CONTINUE == state) {
;
@ -635,6 +666,7 @@ int mca_btl_mx_ft_event(int state) {
return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT */
mca_btl_mx_module_t mca_btl_mx_module = {
{

Просмотреть файл

@ -58,8 +58,8 @@ mca_btl_mx_component_t mca_btl_mx_component = {
mca_btl_mx_component_close /* component close */
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_btl_mx_component_init,

Просмотреть файл

@ -32,6 +32,11 @@
#include "ompi/mca/pml/pml.h"
#include "ompi/mca/btl/btl.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#if OPAL_ENABLE_FT == 1
#include "ompi/runtime/ompi_cr.h"
#endif
#include "btl_openib.h"
#include "btl_openib_frag.h"
#include "btl_openib_proc.h"
@ -91,6 +96,10 @@ mca_btl_openib_module_t mca_btl_openib_module = {
}
};
#if OPAL_ENABLE_FT == 1
static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl);
#endif
static void show_init_error(const char *file, int line,
const char *func, const char *dev)
{
@ -936,6 +945,11 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
openib_btl = (mca_btl_openib_module_t*) btl;
/* Sanity check */
if( mca_btl_openib_component.ib_num_btls <= 0 ) {
return 0;
}
/* Release all QPs */
for (ep_index=0;
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
@ -1185,12 +1199,36 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl,
return OMPI_SUCCESS;
}
#if OPAL_ENABLE_FT == 0
/*
 * Fault tolerance event hook for builds where OPAL_ENABLE_FT is 0:
 * there is nothing to do, so every state is acknowledged with success.
 */
int mca_btl_openib_ft_event(int state)
{
    (void)state;    /* intentionally unused in non-FT builds */

    return OMPI_SUCCESS;
}
#else
int mca_btl_openib_ft_event(int state) {
int i;
if(OPAL_CRS_CHECKPOINT == state) {
;
/* Continue must reconstruct the routes (including modex), since we
* have to tear down the devices completely. */
ompi_cr_continue_like_restart = true;
/*
* To keep the node from crashing we need to call ibv_close_device
* before the checkpoint is taken. To do this we need to tear
* everything down, and rebuild it all on continue/restart. :(
*/
/* Shutdown all modules
* - Do this backwards since the openib_finalize function also loops
* over this variable.
*/
for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i ) {
ft_event_btl_openib_finalize( &(mca_btl_openib_component.openib_btls[i])->super);
}
ompi_btl_openib_connect_base_finalize();
}
else if(OPAL_CRS_CONTINUE == state) {
;
; /* Cleared by forcing the modex, no work needed */
}
else if(OPAL_CRS_RESTART == state) {
;
@ -1204,3 +1242,72 @@ int mca_btl_openib_ft_event(int state) {
return OMPI_SUCCESS;
}
/*
 * Tear down the resources held by one openib BTL module.
 *
 * In order, this function:
 *   1. releases every endpoint stored on the module's device that belongs
 *      to this module (plus the extra reference held through the device's
 *      eager RDMA buffer table, when present),
 *   2. finalizes and frees the module's CPC modules and the cpcs array,
 *   3. destroys the SRQ of every QP for which BTL_OPENIB_QP_TYPE_PP() is
 *      false, after cleaning both pending fragment lists,
 *   4. decrements the device's btls counter and releases the device when
 *      the counter reaches zero,
 *   5. resets component-wide bookkeeping (devices_count, ib_num_btls,
 *      ib_procs).
 *
 * @param btl  Base BTL module; cast to mca_btl_openib_module_t.
 * @return OMPI_SUCCESS, or OMPI_ERROR if any ibv_destroy_srq() call
 *         reported a failure (teardown still continues in that case).
 *
 * NOTE(review): step 5 zeroes mca_btl_openib_component.ib_num_btls on every
 * call. A caller that invokes this function inside a loop bounded by
 * ib_num_btls will therefore stop after the first module -- confirm that
 * this is intended.
 * NOTE(review): openib_btl->cpcs is freed but neither it nor num_cpcs is
 * reset here -- confirm nothing reads them before they are rebuilt.
 */
static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl) {
mca_btl_openib_module_t* openib_btl;
mca_btl_openib_endpoint_t* endpoint;
int ep_index, i;
int qp, rc = OMPI_SUCCESS;
openib_btl = (mca_btl_openib_module_t*) btl;
/* Release all endpoints on the device that are owned by this module */
for(ep_index=0;
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
ep_index++) {
endpoint=opal_pointer_array_get_item(openib_btl->device->endpoints,
ep_index);
/* Empty slots in the pointer array are skipped */
if(!endpoint) {
BTL_VERBOSE(("In finalize, got another null endpoint"));
continue;
}
/* The endpoint array lives on the device; skip endpoints that are
 * attached to a different BTL module */
if(endpoint->endpoint_btl != openib_btl)
continue;
/* Clear this endpoint out of the eager RDMA buffer table and drop the
 * reference associated with that table entry */
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
openib_btl->device->eager_rdma_buffers[i] = NULL;
OBJ_RELEASE(endpoint);
}
}
/* Drop the reference associated with the endpoint array itself */
OBJ_RELEASE(endpoint);
}
/* Finalize the CPC modules on this openib module (cbm_finalize is
 * optional), then free each module and the array that held them */
for (i = 0; i < openib_btl->num_cpcs; ++i) {
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
}
free(openib_btl->cpcs[i]);
}
free(openib_btl->cpcs);
/* Release SRQ resources: only QPs for which BTL_OPENIB_QP_TYPE_PP() is
 * false are handled here */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
/* A failed destroy is recorded in rc but does not abort the teardown */
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
BTL_VERBOSE(("Failed to close SRQ %d", qp));
rc = OMPI_ERROR;
}
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
}
}
/* Release device if there are no more users (btls counter hits zero) */
if(!(--openib_btl->device->btls)) {
OBJ_RELEASE(openib_btl->device);
}
/* Reset component-wide state; see the NOTE(review) in the header about
 * the effect on callers that loop over ib_num_btls */
mca_btl_openib_component.devices_count = 0;
mca_btl_openib_component.ib_num_btls = 0;
OBJ_DESTRUCT(&mca_btl_openib_component.ib_procs);
/* This message is emitted even when rc holds OMPI_ERROR */
BTL_VERBOSE(("Success in closing BTL resources"));
return rc;
}
#endif /* OPAL_ENABLE_FT */

Просмотреть файл

@ -111,8 +111,8 @@ mca_btl_openib_component_t mca_btl_openib_component = {
btl_openib_component_close /* component close */
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
btl_openib_component_init,

Просмотреть файл

@ -500,5 +500,6 @@ void ompi_btl_openib_connect_base_finalize(void)
}
}
free(available);
available = NULL;
}
}

Просмотреть файл

@ -44,6 +44,12 @@
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/common/sm/common_sm_mmap.h"
#include "ompi/mca/mpool/sm/mpool_sm.h"
#if OPAL_ENABLE_FT == 1
#include "opal/mca/crs/base/base.h"
#include "ompi/runtime/ompi_cr.h"
#endif
#include "btl_sm.h"
#include "btl_sm_endpoint.h"
#include "btl_sm_frag.h"
@ -824,15 +830,55 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
return 0;
}
#if OPAL_ENABLE_FT == 0
/*
 * Fault tolerance event hook for builds where OPAL_ENABLE_FT is 0:
 * there is nothing to do, so every state is acknowledged with success.
 */
int mca_btl_sm_ft_event(int state)
{
    (void)state;    /* intentionally unused in non-FT builds */

    return OMPI_SUCCESS;
}
#else
int mca_btl_sm_ft_event(int state) {
/* Notify mpool */
if( NULL != mca_btl_sm_component.sm_mpool &&
NULL != mca_btl_sm_component.sm_mpool->mpool_ft_event) {
mca_btl_sm_component.sm_mpool->mpool_ft_event(state);
}
if(OPAL_CRS_CHECKPOINT == state) {
;
if( NULL != mca_btl_sm_component.mmap_file ) {
/* On restart we need the old file names to exist (not necessarily
* contain content) so the CRS component does not fail when searching
* for these old file handles. The restart procedure will make sure
* these files get cleaned up appropriately.
*/
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, mca_btl_sm_component.mmap_file->map_path);
/* Record the job session directory */
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_MKDIR, orte_process_info.job_session_dir);
}
}
else if(OPAL_CRS_CONTINUE == state) {
;
if( ompi_cr_continue_like_restart ) {
if( NULL != mca_btl_sm_component.mmap_file ) {
/* Do not Add session directory on continue */
/* Add shared memory file */
opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false);
}
/* Clear this so we force the module to re-init the sm files */
mca_btl_sm_component.sm_mpool = NULL;
}
}
else if(OPAL_CRS_RESTART == state) {
;
else if(OPAL_CRS_RESTART == state ||
OPAL_CRS_RESTART_PRE == state) {
if( NULL != mca_btl_sm_component.mmap_file ) {
/* Add session directory */
opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true);
/* Add shared memory file */
opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false);
}
/* Clear this so we force the module to re-init the sm files */
mca_btl_sm_component.sm_mpool = NULL;
}
else if(OPAL_CRS_TERM == state ) {
;
@ -843,3 +889,4 @@ int mca_btl_sm_ft_event(int state) {
return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT */

Просмотреть файл

@ -52,6 +52,11 @@
#include "ompi/mca/mpool/base/base.h"
#include "ompi/mca/common/sm/common_sm_mmap.h"
#include "ompi/mca/btl/base/btl_base_error.h"
#if OPAL_ENABLE_FT == 1
#include "opal/runtime/opal_cr.h"
#endif
#include "btl_sm.h"
#include "btl_sm_frag.h"
#include "btl_sm_fifo.h"
@ -74,8 +79,8 @@ mca_btl_sm_component_t mca_btl_sm_component = {
mca_btl_sm_component_close /* component close */
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_btl_sm_component_init,
@ -210,7 +215,17 @@ int mca_btl_sm_component_close(void)
* to it are gone - no error checking, since we want all procs
* to call this, so that in an abnormal termination scenario,
* this file will still get cleaned up */
#if OPAL_ENABLE_FT == 1
/* Only unlink the file if we are *not* restarting
* If we are restarting the file will be unlinked at a later time.
*/
if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
unlink(mca_btl_sm_component.mmap_file->map_path);
}
#else
unlink(mca_btl_sm_component.mmap_file->map_path);
#endif
OBJ_RELEASE(mca_btl_sm_component.mmap_file);
}

Просмотреть файл

@ -42,6 +42,7 @@
#include "ompi/mca/crcp/base/base.h"
#include "ompi/class/ompi_free_list.h"
#include "ompi/runtime/ompi_cr.h"
#include "crcp_bkmrk.h"
#include "crcp_bkmrk_pml.h"
@ -2971,6 +2972,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
ompi_crcp_base_pml_state_t* pml_state)
{
static int step_to_return_to = 0;
static bool first_continue_pass = false;
opal_list_item_t* item = NULL;
int exit_status = OMPI_SUCCESS;
int ret;
@ -2992,6 +2994,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
goto DONE;
}
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
START_TIMER(CRCP_TIMER_TOTAL_CKPT);
STEP_1:
step_to_return_to = 0;
@ -3030,7 +3038,15 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
goto DONE;
}
first_continue_pass = !first_continue_pass;
/* Only finalize the Protocol after the PML has been rebuilt */
if( ompi_cr_continue_like_restart && first_continue_pass ) {
goto DONE;
}
START_TIMER(CRCP_TIMER_TOTAL_CONT);
/*
* Finish the coord protocol
*/
@ -3045,6 +3061,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
DISPLAY_ALL_TIMERS(state);
clear_timers();
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
}
/*****************************
* Restart from a checkpoint

Просмотреть файл

@ -122,6 +122,14 @@ int mca_mpool_rdma_release_memory(mca_mpool_base_module_t* mpool, void *base,
* finalize mpool
*/
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool);
/**
* Fault Tolerance Event Notification Function
* @param state Checkpoint State
* @return OMPI_SUCCESS or failure status
*/
int mca_mpool_rdma_ft_event(int state);
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -56,8 +56,8 @@ mca_mpool_rdma_component_t mca_mpool_rdma_component = {
NULL
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_mpool_rdma_init

Просмотреть файл

@ -54,6 +54,7 @@ void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool)
mpool->super.mpool_deregister = mca_mpool_rdma_deregister;
mpool->super.mpool_release_memory = mca_mpool_rdma_release_memory;
mpool->super.mpool_finalize = mca_mpool_rdma_finalize;
mpool->super.mpool_ft_event = mca_mpool_rdma_ft_event;
mpool->super.rcache =
mca_rcache_base_module_create(mca_mpool_rdma_component.rcache_name);
mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM;
@ -478,3 +479,24 @@ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
OBJ_DESTRUCT(&mpool_rdma->reg_list);
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
}
/*
 * Fault tolerance event notification for the rdma mpool.
 *
 * No checkpoint/restart state currently requires any work from this
 * module; the individual cases are kept as placeholders so that per-state
 * handling can be filled in later.
 *
 * @param state  Checkpoint state being signalled
 * @return OMPI_SUCCESS always
 */
int mca_mpool_rdma_ft_event(int state)
{
    switch (state) {
    case OPAL_CRS_CHECKPOINT:
        /* nothing to do */
        break;

    case OPAL_CRS_CONTINUE:
        /* nothing to do */
        break;

    case OPAL_CRS_RESTART:
    case OPAL_CRS_RESTART_PRE:
        /* nothing to do */
        break;

    case OPAL_CRS_TERM:
        /* nothing to do */
        break;

    default:
        /* unknown state: nothing to do */
        break;
    }

    return OMPI_SUCCESS;
}

Просмотреть файл

@ -38,6 +38,10 @@
#include "ompi/mca/common/sm/common_sm_mmap.h"
#include "ompi/proc/proc.h"
#if OPAL_ENABLE_FT == 1
#include "opal/runtime/opal_cr.h"
#endif
/*
* Local functions
*/
@ -62,8 +66,8 @@ mca_mpool_sm_component_t mca_mpool_sm_component = {
mca_mpool_sm_close
},
{
/* The component is not checkpoint ready */
MCA_BASE_METADATA_PARAM_NONE
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
mca_mpool_sm_init
@ -134,7 +138,17 @@ static int mca_mpool_sm_close( void )
{
if( NULL != mca_common_sm_mmap ) {
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
#if OPAL_ENABLE_FT == 1
/* Only unlink the file if we are *not* restarting
* If we are restarting the file will be unlinked at a later time.
*/
if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
unlink( mca_common_sm_mmap->map_path );
}
#else
unlink( mca_common_sm_mmap->map_path );
#endif
}
OBJ_RELEASE( mca_common_sm_mmap );
}

Просмотреть файл

@ -28,7 +28,12 @@
#include "opal/mca/maffinity/maffinity.h"
#include "opal/mca/maffinity/maffinity_types.h"
#include "opal/mca/maffinity/base/base.h"
#include "orte/util/proc_info.h"
#if OPAL_ENABLE_FT == 1
#include "ompi/mca/mpool/base/base.h"
#include "ompi/runtime/ompi_cr.h"
#endif
/*
* Initializes the mpool module.
@ -116,15 +121,54 @@ void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr,
mpool_sm->sm_allocator->alc_free(mpool_sm->sm_allocator, addr);
}
#if OPAL_ENABLE_FT == 0
/*
 * Fault tolerance event hook for builds where OPAL_ENABLE_FT is 0:
 * there is nothing to do, so every state is acknowledged with success.
 */
int mca_mpool_sm_ft_event(int state)
{
    (void)state;    /* intentionally unused in non-FT builds */

    return OMPI_SUCCESS;
}
#else
int mca_mpool_sm_ft_event(int state) {
mca_mpool_base_module_t *self_module = NULL;
char * file_name = NULL;
if(OPAL_CRS_CHECKPOINT == state) {
;
/* Record the shared memory filename */
asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
orte_process_info.job_session_dir,
orte_process_info.nodename );
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, file_name);
free(file_name);
file_name = NULL;
}
else if(OPAL_CRS_CONTINUE == state) {
;
if(ompi_cr_continue_like_restart) {
/* Remove self from the list of all modules */
self_module = mca_mpool_base_module_lookup("sm");
mca_mpool_base_module_destroy(self_module);
/* Release the old sm file, if it exists */
if( NULL != mca_common_sm_mmap ) {
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
/* Add old shared memory file for eventual removal */
opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false);
}
OBJ_RELEASE( mca_common_sm_mmap );
}
}
}
else if(OPAL_CRS_RESTART == state) {
;
else if(OPAL_CRS_RESTART == state ||
OPAL_CRS_RESTART_PRE == state) {
/* Remove self from the list of all modules */
self_module = mca_mpool_base_module_lookup("sm");
mca_mpool_base_module_destroy(self_module);
/* Release the old sm file, if it exists */
if( NULL != mca_common_sm_mmap ) {
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
/* Add old shared memory file for eventual removal */
opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false);
}
OBJ_RELEASE( mca_common_sm_mmap );
}
}
else if(OPAL_CRS_TERM == state ) {
;
@ -135,3 +179,4 @@ int mca_mpool_sm_ft_event(int state) {
return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT */

Просмотреть файл

@ -547,17 +547,61 @@ void mca_pml_ob1_error_handler(
orte_errmgr.abort(-1, NULL);
}
#if OPAL_ENABLE_FT == 0
/*
 * Fault tolerance event hook for builds where OPAL_ENABLE_FT is 0:
 * there is nothing to do, so every state is acknowledged with success.
 */
int mca_pml_ob1_ft_event( int state )
{
    (void)state;    /* intentionally unused in non-FT builds */

    return OMPI_SUCCESS;
}
#else
int mca_pml_ob1_ft_event( int state )
{
static bool first_continue_pass = false;
ompi_proc_t** procs = NULL;
size_t num_procs;
int ret, p;
if(OPAL_CRS_CHECKPOINT == state) {
;
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
}
else if(OPAL_CRS_CONTINUE == state) {
;
first_continue_pass = !first_continue_pass;
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
}
if( ompi_cr_continue_like_restart && !first_continue_pass ) {
/*
* Get a list of processes
*/
procs = ompi_proc_all(&num_procs);
if(NULL == procs) {
return OMPI_ERR_OUT_OF_RESOURCE;
}
/*
* Refresh the proc structure, and publish our proc info in the modex.
* NOTE: Do *not* call ompi_proc_finalize as there are many places in
* the code that point to indv. procs in this structure. For our
* needs here we only need to fix up the modex, bml and pml
* references.
*/
if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
opal_output(0,
"pml:ob1: ft_event(Restart): proc_refresh Failed %d",
ret);
free (procs);
return ret;
}
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
@ -612,10 +656,64 @@ int mca_pml_ob1_ft_event( int state )
}
if(OPAL_CRS_CHECKPOINT == state) {
;
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
/* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/
}
}
else if(OPAL_CRS_CONTINUE == state) {
;
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
}
if( ompi_cr_continue_like_restart && !first_continue_pass ) {
/*
* Exchange the modex information once again.
* BTLs will have republished their modex information.
*/
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
opal_output(0,
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
ret);
return ret;
}
/*
* Startup the PML stack now that the modex is running again
* Add the new procs (BTLs redo modex recv's)
*/
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
return ret;
}
/* Is this barrier necessary ? JJH */
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
return ret;
}
if( NULL != procs ) {
for(p = 0; p < (int)num_procs; ++p) {
OBJ_RELEASE(procs[p]);
}
free(procs);
procs = NULL;
}
}
if( !first_continue_pass ) {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
orte_grpcomm.barrier();
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
}
}
else if(OPAL_CRS_RESTART_PRE == state ) {
/* Nothing here */
@ -664,6 +762,7 @@ int mca_pml_ob1_ft_event( int state )
return OMPI_SUCCESS;
}
#endif /* OPAL_ENABLE_FT */
int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2)
{

Просмотреть файл

@ -50,6 +50,7 @@
#include "orte/mca/snapc/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/util/show_help.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "ompi/constants.h"
#include "ompi/mca/pml/pml.h"
@ -72,6 +73,8 @@ static int ompi_cr_coord_post_ckpt(void);
static int ompi_cr_coord_post_restart(void);
static int ompi_cr_coord_post_continue(void);
bool ompi_cr_continue_like_restart = false;
/*************
* Local vars
*************/
@ -159,6 +162,9 @@ int ompi_cr_init(void)
ompi_cr_output = opal_cr_output;
}
/* Typically this is not needed. Individual BTLs will set this as needed */
ompi_cr_continue_like_restart = false;
opal_output_verbose(10, ompi_cr_output,
"ompi_cr: init: ompi_cr_init()");
@ -195,6 +201,9 @@ int ompi_cr_coord(int state)
* take action given the state.
*/
if(OPAL_CRS_CHECKPOINT == state) {
/* Default: use the fast way */
ompi_cr_continue_like_restart = false;
/* Do Checkpoint Phase work */
ret = ompi_cr_coord_pre_ckpt();
if( ret == OMPI_EXISTS) {
@ -317,6 +326,8 @@ static int ompi_cr_coord_pre_restart(void) {
}
static int ompi_cr_coord_pre_continue(void) {
int ret, exit_status = OMPI_SUCCESS;
/*
* Can not really do much until ORTE is up and running,
* so defer action until the post_continue function.
@ -324,7 +335,26 @@ static int ompi_cr_coord_pre_continue(void) {
opal_output_verbose(10, ompi_cr_output,
"ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
return OMPI_SUCCESS;
if( ompi_cr_continue_like_restart ) {
/* Mimic ompi_cr_coord_pre_restart(); */
if( ORTE_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
exit_status = ret;
goto cleanup;
}
}
else {
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
if( opal_cr_timing_barrier_enabled ) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
}
cleanup:
return exit_status;
}
/*************

Просмотреть файл

@ -49,6 +49,12 @@ extern "C" {
*/
OMPI_DECLSPEC extern int ompi_cr_output;
/*
* Set when one of the BTLs that was shut down requires a full, clean rebuild
* of the point-to-point stack on 'continue' as well as on 'restart'.
*/
OPAL_DECLSPEC extern bool ompi_cr_continue_like_restart;
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -503,6 +503,21 @@ int mca_base_param_set_int(int index, int value)
return OPAL_SUCCESS;
}
/*
 * Deregister a parameter
 *
 * Removes the entry at position 'index' from the mca_base_params value
 * array.
 *
 * @param index  Index of the parameter to remove
 * @return OPAL_ERROR if 'index' is negative or not smaller than the
 *         current number of registered parameters; otherwise the return
 *         value of opal_value_array_remove_item().
 *
 * NOTE(review): if opal_value_array_remove_item() compacts the array, the
 * indices of parameters registered after 'index' change -- confirm that
 * callers do not cache such indices across a deregister.
 */
int mca_base_param_deregister(int index)
{
    size_t size;

    /* Lookup the index and see if it's valid.  Valid indices run from
     * 0 to size-1, so 'index == size' must be rejected as well. */
    size = opal_value_array_get_size(&mca_base_params);
    if (index < 0 || ((size_t) index) >= size) {
        return OPAL_ERROR;
    }

    return opal_value_array_remove_item(&mca_base_params, index);
}
/*
* Look up a string MCA parameter.

Просмотреть файл

@ -470,6 +470,14 @@ extern "C" {
const char *syn_param_name,
bool deprecated);
/**
* Deregister a MCA parameter
*
* @param index Index returned from mca_base_param_register_init()
*
*/
OPAL_DECLSPEC int mca_base_param_deregister(int index);
/**
* Look up an integer MCA parameter.
*

Просмотреть файл

@ -304,7 +304,7 @@ int opal_crs_base_cleanup_flush(void)
argc = opal_argv_count(cleanup_file_argv);
for( i = 0; i < argc; ++i) {
opal_output_verbose(15, opal_crs_base_output,
"opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_dir_argv[i]);
"opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_file_argv[i]);
unlink(cleanup_file_argv[i]);
}

Просмотреть файл

@ -77,6 +77,14 @@ bool opal_cr_stall_check = false;
bool opal_cr_currently_stalled = false;
int opal_cr_output;
static double opal_cr_get_time(void);
static void display_indv_timer_core(double diff, char *str);
static double timer_start[OPAL_CR_TIMER_MAX];
bool opal_cr_timing_barrier_enabled = false;
bool opal_cr_timing_enabled = false;
int opal_cr_timing_my_rank = 0;
int opal_cr_timing_target_rank = 0;
/******************
* Local Functions & Var Decls
******************/
@ -214,6 +222,28 @@ int opal_cr_init(void )
"opal_cr: init: FT Enabled: %d",
val);
mca_base_param_reg_int_name("opal_cr", "enable_timer",
"Enable Checkpoint timer (Default: Disabled)",
false, false,
0, &val);
opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);
mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
"Enable Checkpoint timer Barrier (Default: Disabled)",
false, false,
0, &val);
if( opal_cr_timing_enabled ) {
opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
} else {
opal_cr_timing_barrier_enabled = false;
}
mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
"Target Rank for the timer (Default: 0)",
false, false,
0, &val);
opal_cr_timing_target_rank = val;
#if OPAL_ENABLE_FT_THREAD == 1
mca_base_param_reg_int_name("opal_cr", "use_thread",
"Use an async thread to checkpoint this program (Default: Disabled)",
@ -505,6 +535,7 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
/*
* Take the checkpoint
*/
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state))) {
opal_output(opal_cr_output,
"opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret);
@ -513,6 +544,8 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
}
if(*state == OPAL_CRS_CONTINUE) {
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);
if(term) {
*state = OPAL_CRS_TERM;
opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
@ -869,3 +902,127 @@ void opal_cr_thread_noop_progress(void)
}
#endif /* OPAL_ENABLE_FT_THREAD == 1 */
/*
 * Return the current time in seconds.
 *
 * When OPAL_TIMER_USEC_NATIVE is set, the value comes from the OPAL
 * microsecond timer; otherwise it comes from gettimeofday().  In both cases
 * the microsecond part is scaled down so the result is expressed in seconds.
 */
static double opal_cr_get_time(void)
{
#if OPAL_TIMER_USEC_NATIVE
    return (double)opal_timer_base_get_usec() / 1000000.0;
#else
    struct timeval now;

    gettimeofday(&now, NULL);
    /* whole seconds plus the fractional (microsecond) remainder */
    return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
#endif
}
/*
 * Record the current time in timer slot idx.
 *
 * @param idx  One of the OPAL_CR_TIMER_* slot indices.  Values outside
 *             [0, OPAL_CR_TIMER_MAX) are silently ignored.
 *
 * A slot is written only while it is still unset (<= 0.0), so the first
 * timestamp recorded after opal_cr_clear_timers() is the one that is kept.
 */
void opal_cr_set_time(int idx)
{
    /* idx is signed: reject negative values as well as too-large ones so we
     * never index outside timer_start[]. */
    if (idx < 0 || idx >= OPAL_CR_TIMER_MAX) {
        return;
    }

    if (timer_start[idx] <= 0.0) {
        timer_start[idx] = opal_cr_get_time();
    }
}
/*
 * Reset every timer slot to 0.0, i.e. mark all of them as "not recorded",
 * so that the next opal_cr_set_time() on each slot stores a fresh timestamp.
 */
void opal_cr_clear_timers(void)
{
    double *slot = timer_start;
    double *const past_last = timer_start + OPAL_CR_TIMER_MAX;

    while (slot != past_last) {
        *slot++ = 0.0;
    }
}
/*
 * Print one line of the timing summary.
 *
 * @param diff  Duration of the phase being reported, in seconds.
 * @param str   Label printed in front of the numbers.
 *
 * The line shows the phase duration, the total elapsed time (last timer slot
 * minus OPAL_CR_TIMER_ENTRY0), and the phase duration as a percentage of
 * that total.
 */
static void display_indv_timer_core(double diff, char *str)
{
    const double elapsed_total =
        timer_start[OPAL_CR_TIMER_MAX-1] - timer_start[OPAL_CR_TIMER_ENTRY0];
    const double percent_of_total = (diff / elapsed_total) * 100;

    opal_output(0,
                "opal_cr: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n",
                str, diff, elapsed_total, percent_of_total);
}
/*
 * Print the checkpoint timing summary for this process.
 *
 * Nothing is printed unless this process is the configured target rank
 * (opal_cr_timing_my_rank == opal_cr_timing_target_rank).
 *
 * Each reported phase is the difference between two timer slots.  For most
 * phases the slot that closes the phase depends on whether the timer
 * barrier is enabled (opal_cr_timing_barrier_enabled); the "Checkpoint" and
 * "Finish Entry Point" phases use the same slots in both modes.
 */
void opal_cr_display_all_timers(void)
{
    /*
     * One row per reported phase.  The labels are stored in a writable
     * automatic array because display_indv_timer_core() takes a non-const
     * char *.  This removes the per-label strdup()/free() round trip, whose
     * result was never checked for NULL.
     */
    struct {
        char label[24];       /* text printed for the phase              */
        int  begin;           /* slot that opens the phase               */
        int  end_barrier;     /* closing slot when the barrier is on     */
        int  end_no_barrier;  /* closing slot when the barrier is off    */
    } phases[] = {
        /* Entry into the system */
        { "Start Entry Point",  OPAL_CR_TIMER_ENTRY0,
          OPAL_CR_TIMER_CRCPBR0, OPAL_CR_TIMER_CRCP0 },
        /* CRCP Protocol */
        { "CRCP Protocol",      OPAL_CR_TIMER_CRCP0,
          OPAL_CR_TIMER_CRCPBR1, OPAL_CR_TIMER_P2P0 },
        /* P2P Suspend */
        { "P2P Suspend",        OPAL_CR_TIMER_P2P0,
          OPAL_CR_TIMER_P2PBR0,  OPAL_CR_TIMER_CORE0 },
        /* Checkpoint to Disk */
        { "Checkpoint",         OPAL_CR_TIMER_CORE0,
          OPAL_CR_TIMER_CORE1,   OPAL_CR_TIMER_CORE1 },
        /* P2P Reactivation */
        { "P2P Reactivation",   OPAL_CR_TIMER_CORE1,
          OPAL_CR_TIMER_P2PBR2,  OPAL_CR_TIMER_CRCP1 },
        /* CRCP Protocol Finalize */
        { "CRCP Cleanup",       OPAL_CR_TIMER_CRCP1,
          OPAL_CR_TIMER_COREBR1, OPAL_CR_TIMER_CORE2 },
        /* Exit the system */
        { "Finish Entry Point", OPAL_CR_TIMER_CORE2,
          OPAL_CR_TIMER_ENTRY4,  OPAL_CR_TIMER_ENTRY4 }
    };
    const int num_phases = (int)(sizeof(phases) / sizeof(phases[0]));
    int i;

    /* Only the target rank reports. */
    if( opal_cr_timing_target_rank != opal_cr_timing_my_rank ) {
        return;
    }

    opal_output(0, "OPAL CR Timing: ******************** Summary Begin\n");

    for( i = 0; i < num_phases; ++i ) {
        const int end = opal_cr_timing_barrier_enabled
                            ? phases[i].end_barrier
                            : phases[i].end_no_barrier;
        const double diff = timer_start[end] - timer_start[phases[i].begin];

        display_indv_timer_core(diff, phases[i].label);
    }

    opal_output(0, "OPAL CR Timing: ******************** Summary End\n");
}

Просмотреть файл

@ -267,7 +267,65 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
* OPAL Checkpoint Coordination Routine
*/
OPAL_DECLSPEC int opal_cr_coord(int state);
/**
 * Checkpoint life-cycle timing
 */

/**
 * Record the current time in timer slot idx (one of OPAL_CR_TIMER_*).
 * The call is ignored when idx is not below OPAL_CR_TIMER_MAX, and a slot
 * that already holds a timestamp is left unchanged.
 */
OPAL_DECLSPEC void opal_cr_set_time(int idx);
/**
 * Print the timing summary.  Output is produced only when
 * opal_cr_timing_my_rank equals opal_cr_timing_target_rank.
 */
OPAL_DECLSPEC void opal_cr_display_all_timers(void);
/**
 * Reset every timer slot so that new timestamps can be recorded.
 */
OPAL_DECLSPEC void opal_cr_clear_timers(void);
/** True when checkpoint timing is enabled; gates the OPAL_CR_*TIMER* macros. */
OPAL_DECLSPEC extern bool opal_cr_timing_enabled;
/** True when the timing summary should use the barrier timer slots. */
OPAL_DECLSPEC extern bool opal_cr_timing_barrier_enabled;
/** Rank of this process, as used by the timing summary. */
OPAL_DECLSPEC extern int opal_cr_timing_my_rank;
/** Rank that is allowed to print the timing summary. */
OPAL_DECLSPEC extern int opal_cr_timing_target_rank;
/*
 * Timer slot indices accepted by opal_cr_set_time() / OPAL_CR_SET_TIMER().
 * Each slot holds one timestamp; the timing summary reports the difference
 * between pairs of slots.  The numeric order follows the order in which the
 * slots are listed here, from ENTRY0 (first) to ENTRY4 (last).
 *
 * NOTE(review): the *BR* slots appear to be the barrier variants (several
 * of them are read by the summary only when the timer barrier is enabled)
 * -- confirm against the code that sets them.
 */
#define OPAL_CR_TIMER_ENTRY0 0
#define OPAL_CR_TIMER_ENTRY1 1
#define OPAL_CR_TIMER_ENTRY2 2
#define OPAL_CR_TIMER_CRCPBR0 3
#define OPAL_CR_TIMER_CRCP0 4
#define OPAL_CR_TIMER_CRCPBR1 5
#define OPAL_CR_TIMER_P2P0 6
#define OPAL_CR_TIMER_P2P1 7
#define OPAL_CR_TIMER_P2PBR0 8
#define OPAL_CR_TIMER_CORE0 9
#define OPAL_CR_TIMER_CORE1 10
#define OPAL_CR_TIMER_COREBR0 11
#define OPAL_CR_TIMER_P2P2 12
#define OPAL_CR_TIMER_P2PBR1 13
#define OPAL_CR_TIMER_P2P3 14
#define OPAL_CR_TIMER_P2PBR2 15
#define OPAL_CR_TIMER_CRCP1 16
#define OPAL_CR_TIMER_COREBR1 17
#define OPAL_CR_TIMER_CORE2 18
#define OPAL_CR_TIMER_ENTRY3 19
#define OPAL_CR_TIMER_ENTRY4 20
/* Number of timer slots (one past the last valid index), not a slot itself. */
#define OPAL_CR_TIMER_MAX 21
/*
 * Convenience wrappers around the timing functions.  Each one does nothing
 * unless opal_cr_timing_enabled is set, so the common (timing disabled) case
 * costs a single predicted-not-taken branch.
 *
 * The bodies are wrapped in do { ... } while (0) so that each macro expands
 * to exactly one statement and can be used safely as the body of an
 * unbraced if/else.  Invoke them with a trailing semicolon.
 */

/* Reset all timer slots. */
#define OPAL_CR_CLEAR_TIMERS()                              \
    do {                                                    \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {         \
            opal_cr_clear_timers();                         \
        }                                                   \
    } while (0)

/* Record the current time in timer slot idx (an OPAL_CR_TIMER_* value). */
#define OPAL_CR_SET_TIMER(idx)                              \
    do {                                                    \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {         \
            opal_cr_set_time(idx);                          \
        }                                                   \
    } while (0)

/* Print the timing summary. */
#define OPAL_CR_DISPLAY_ALL_TIMERS()                        \
    do {                                                    \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {         \
            opal_cr_display_all_timers();                   \
        }                                                   \
    } while (0)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif

Просмотреть файл

@ -1613,11 +1613,13 @@ int mca_oob_tcp_ft_event(int state) {
*/
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
opal_event_disable();
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
else if(OPAL_CRS_CONTINUE == state) {
/*
* Resume event processing
*/
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
opal_event_enable();
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
}
@ -1636,6 +1638,8 @@ int mca_oob_tcp_ft_event(int state) {
MCA_OOB_TCP_PEER_RETURN(peer);
}
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);

Просмотреть файл

@ -173,6 +173,10 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
goto STAGE_1;
}
OPAL_CR_CLEAR_TIMERS();
opal_cr_timing_my_rank = ORTE_PROC_MY_NAME->vpid;
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY0);
/*
* Open communication channels
*/
@ -193,6 +197,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
goto ckpt_cleanup;
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY1);
/*
* Begin checkpoint
* - Init the checkpoint metadata file
@ -205,6 +211,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
goto ckpt_cleanup;
}
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY2);
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"App) notify_response: Start checkpoint..."));
STAGE_1:
@ -255,6 +263,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
/*
* Final Handshake
*/
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY3);
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
"App) notify_response: Waiting for final handshake."));
if( ORTE_SUCCESS != (ret = snapc_full_app_ckpt_handshake_end(cr_state ) ) ) {
@ -282,6 +291,11 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
opal_cr_currently_stalled = false;
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY4);
if(OPAL_CRS_RESTART != cr_state) {
OPAL_CR_DISPLAY_ALL_TIMERS();
}
return exit_status;
}