Commit to bring online OpenIB, MX, and shared memory support for Open MPI's checkpoint/restart functionality. Some tuning is still needed, but basic functionality is in place.
There is still a problem with OpenIB and threads (external to C/R functionality). It has been reported in Ticket #1539.
Additionally:
* Fix a file cleanup bug in CRS Base.
* Fix a possible deadlock in the TCP ft_event function.
* Add a mca_base_param_deregister() function to MCA base.
* Add whole-process checkpoint timers.
* Add support for BTLs: OpenIB, MX, Shared Memory.
* Add support for Mpools: rdma, sm.
* Sundry bounds checking and cleanup in some scattered functions.
This commit was SVN r19756.
Этот коммит содержится в:
родитель
b46d3e766e
Коммит
88aa45dd52
@ -41,3 +41,12 @@ crcp=bkmrk
|
||||
# Temporary fix to force the event engine to use poll to behave well with BLCR
|
||||
#
|
||||
opal_event_include=poll
|
||||
|
||||
#
|
||||
# We currently only support the following options to the OpenIB BTL
|
||||
# Future development will attempt to eliminate many of these restrictions
|
||||
#
|
||||
btl_openib_want_fork_support=1
|
||||
btl_openib_use_async_event_thread=0
|
||||
btl_openib_use_eager_rdma=0
|
||||
btl_openib_cpc_include=oob
|
||||
|
@ -43,17 +43,28 @@
|
||||
|
||||
int mca_bml_r2_ft_event(int state)
|
||||
{
|
||||
static bool first_continue_pass = false;
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
size_t btl_idx;
|
||||
int ret, p;
|
||||
int loc_state;
|
||||
int param_type = -1;
|
||||
char *param_list = NULL;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Do nothing for now */
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
/* Since nothing in Checkpoint, we are fine here */
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
|
||||
if( ompi_cr_continue_like_restart && !first_continue_pass) {
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART_PRE == state ) {
|
||||
/* Nothing here */
|
||||
@ -76,56 +87,119 @@ int mca_bml_r2_ft_event(int state)
|
||||
* no longer exist.
|
||||
*/
|
||||
if( OPAL_CRS_RESTART != state ) {
|
||||
/* Since we only ever call into the BTLs once during the first restart
|
||||
* pass, just lie to them on this pass for a bit of local clarity.
|
||||
*/
|
||||
if( OPAL_CRS_RESTART_PRE == state ) {
|
||||
loc_state = OPAL_CRS_RESTART;
|
||||
if( OPAL_CRS_CONTINUE == state && !first_continue_pass ) {
|
||||
;
|
||||
} else {
|
||||
loc_state = state;
|
||||
}
|
||||
|
||||
/*
|
||||
* Call ft_event in:
|
||||
* - BTL modules
|
||||
* - MPool modules
|
||||
*
|
||||
* These should be cleaning out stale state, and memory references in
|
||||
* preparation for being shut down.
|
||||
*/
|
||||
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
|
||||
/*
|
||||
* Notify Mpool
|
||||
/* Since we only ever call into the BTLs once during the first restart
|
||||
* pass, just lie to them on this pass for a bit of local clarity.
|
||||
*/
|
||||
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
|
||||
NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"bml:r2: ft_event: Notify the %s MPool.\n",
|
||||
(mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
|
||||
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
|
||||
continue;
|
||||
}
|
||||
if( OPAL_CRS_RESTART_PRE == state ) {
|
||||
loc_state = OPAL_CRS_RESTART;
|
||||
} else {
|
||||
loc_state = state;
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify BTL
|
||||
* Call ft_event in:
|
||||
* - BTL modules
|
||||
* - MPool modules
|
||||
*
|
||||
* These should be cleaning out stale state, and memory references in
|
||||
* preparation for being shut down.
|
||||
*/
|
||||
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"bml:r2: ft_event: Notify the %s BTL.\n",
|
||||
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
|
||||
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
|
||||
continue;
|
||||
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
|
||||
/*
|
||||
* Notify Mpool
|
||||
*/
|
||||
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
|
||||
NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"bml:r2: ft_event: Notify the %s MPool.\n",
|
||||
(mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
|
||||
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Notify BTL
|
||||
*/
|
||||
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"bml:r2: ft_event: Notify the %s BTL.\n",
|
||||
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
|
||||
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} /* OPAL_CRS_CONTINUE == state && !first_continue_pass */
|
||||
}
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
/* Matches OPAL_CRS_RESTART_PRE */
|
||||
if( ompi_cr_continue_like_restart && first_continue_pass) {
|
||||
if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
/* Matches OPAL_CRS_RESTART */
|
||||
else if( ompi_cr_continue_like_restart && !first_continue_pass ) {
|
||||
/*
|
||||
* Barrier to make sure all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"bml:r2: ft_event(Restart): Cleanup restart files\n");
|
||||
opal_crs_base_cleanup_flush();
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-select the BTL components/modules
|
||||
* This will cause the BTL components to discover the available
|
||||
* network options on this machine, and post proper modex information.
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_btl_base_select(OMPI_ENABLE_PROGRESS_THREADS,
|
||||
OMPI_ENABLE_MPI_THREADS) ) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Clear some structures so we can properly repopulate them
|
||||
*/
|
||||
mca_bml_r2.btls_added = false;
|
||||
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
if( NULL != procs[p]->proc_bml) {
|
||||
OBJ_RELEASE(procs[p]->proc_bml);
|
||||
procs[p]->proc_bml = NULL;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
free(procs);
|
||||
procs = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART_PRE == state ) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
@ -163,12 +237,35 @@ int mca_bml_r2_ft_event(int state)
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
* - but first clear the MCA value that was there
|
||||
*/
|
||||
param_type = mca_base_param_find("btl", NULL, NULL);
|
||||
mca_base_param_lookup_string(param_type, ¶m_list);
|
||||
opal_output_verbose(11, ompi_cr_output,
|
||||
"Restart (Previous BTL MCA): <%s>\n", param_list);
|
||||
if( NULL != param_list ) {
|
||||
free(param_list);
|
||||
param_list = NULL;
|
||||
}
|
||||
|
||||
/* Deregister the old value, and refresh the file cache to grab any updates */
|
||||
mca_base_param_deregister(param_type);
|
||||
mca_base_param_recache_files(false);
|
||||
|
||||
if( OMPI_SUCCESS != (ret = mca_btl_base_open()) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
param_type = mca_base_param_find("btl", NULL, NULL);
|
||||
mca_base_param_lookup_string(param_type, ¶m_list);
|
||||
opal_output_verbose(11, ompi_cr_output,
|
||||
"Restart (New BTL MCA): <%s>\n", param_list);
|
||||
if( NULL != param_list ) {
|
||||
free(param_list);
|
||||
param_list = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-select the BTL components/modules
|
||||
* This will cause the BTL components to discover the available
|
||||
|
@ -19,6 +19,10 @@
|
||||
#include "ompi_config.h"
|
||||
#include "opal/util/if.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#endif
|
||||
|
||||
#include "btl_mx.h"
|
||||
#include "btl_mx_frag.h"
|
||||
#include "btl_mx_proc.h"
|
||||
@ -616,9 +620,36 @@ int mca_btl_mx_finalize( struct mca_btl_base_module_t* btl )
|
||||
}
|
||||
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Variant compiled when OPAL_ENABLE_FT == 0: ignores `state` and always returns OMPI_SUCCESS. */
int mca_btl_mx_ft_event(int state) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
int mca_btl_mx_ft_event(int state) {
|
||||
mca_btl_mx_module_t* mx_btl;
|
||||
int i;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
/* Continue must reconstruct the routes (including modex), since we
|
||||
* have to tear down the devices completely.
|
||||
* We have to do this because the MX driver can be checkpointed, but
|
||||
* cannot be restarted with BLCR due to an mmap problem. If we do not
|
||||
* close MX then BLCR throws the following error in /var/log/messages:
|
||||
* kernel: do_mmap(<file>, 00002aaab0aac000, 0000000000400000, ...) failed: ffffffffffffffff
|
||||
* kernel: vmadump: mmap failed: /dev/mx0
|
||||
* kernel: blcr: thaw_threads returned error, aborting. -1
|
||||
* JJH: It may be possible to, instead of restarting the entire driver, just reconnect endpoints
|
||||
*/
|
||||
ompi_cr_continue_like_restart = true;
|
||||
|
||||
for( i = 0; i < mca_btl_mx_component.mx_num_btls; i++ ) {
|
||||
mx_btl = mca_btl_mx_component.mx_btls[i];
|
||||
|
||||
if( NULL != mx_btl->mx_endpoint ) {
|
||||
mx_close_endpoint(mx_btl->mx_endpoint);
|
||||
mx_btl->mx_endpoint = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
@ -635,6 +666,7 @@ int mca_btl_mx_ft_event(int state) {
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT */
|
||||
|
||||
mca_btl_mx_module_t mca_btl_mx_module = {
|
||||
{
|
||||
|
@ -58,8 +58,8 @@ mca_btl_mx_component_t mca_btl_mx_component = {
|
||||
mca_btl_mx_component_close /* component close */
|
||||
},
|
||||
{
|
||||
/* The component is not checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_NONE
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
mca_btl_mx_component_init,
|
||||
|
@ -32,6 +32,11 @@
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/btl/btl.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#endif
|
||||
|
||||
#include "btl_openib.h"
|
||||
#include "btl_openib_frag.h"
|
||||
#include "btl_openib_proc.h"
|
||||
@ -91,6 +96,10 @@ mca_btl_openib_module_t mca_btl_openib_module = {
|
||||
}
|
||||
};
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl);
|
||||
#endif
|
||||
|
||||
static void show_init_error(const char *file, int line,
|
||||
const char *func, const char *dev)
|
||||
{
|
||||
@ -936,6 +945,11 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
|
||||
|
||||
openib_btl = (mca_btl_openib_module_t*) btl;
|
||||
|
||||
/* Sanity check */
|
||||
if( mca_btl_openib_component.ib_num_btls <= 0 ) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Release all QPs */
|
||||
for (ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
@ -1185,12 +1199,36 @@ int mca_btl_openib_get(mca_btl_base_module_t* btl,
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Variant compiled when OPAL_ENABLE_FT == 0: ignores `state` and always returns OMPI_SUCCESS. */
int mca_btl_openib_ft_event(int state) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
int mca_btl_openib_ft_event(int state) {
|
||||
int i;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
/* Continue must reconstruct the routes (including modex), since we
|
||||
* have to tear down the devices completely. */
|
||||
ompi_cr_continue_like_restart = true;
|
||||
|
||||
/*
|
||||
* To keep the node from crashing we need to call ibv_close_device
|
||||
* before the checkpoint is taken. To do this we need to tear
|
||||
* everything down, and rebuild it all on continue/restart. :(
|
||||
*/
|
||||
|
||||
/* Shutdown all modules
|
||||
* - Do this backwards since the openib_finalize function also loops
|
||||
* over this variable.
|
||||
*/
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; ++i ) {
|
||||
ft_event_btl_openib_finalize( &(mca_btl_openib_component.openib_btls[i])->super);
|
||||
}
|
||||
ompi_btl_openib_connect_base_finalize();
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
; /* Cleared by forcing the modex, no work needed */
|
||||
}
|
||||
else if(OPAL_CRS_RESTART == state) {
|
||||
;
|
||||
@ -1204,3 +1242,72 @@ int mca_btl_openib_ft_event(int state) {
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
/*
 * Tear down one openib BTL module on behalf of the checkpoint path.
 *
 * Steps, in order:
 *   1. Release the endpoints in the device's endpoint array that belong to
 *      this module.
 *   2. Finalize and free the module's CPC entries, then the CPC array.
 *   3. For every QP for which BTL_OPENIB_QP_TYPE_PP(qp) is false, clean the
 *      two pending-fragment lists, destroy the SRQ, and destruct the lists.
 *   4. Decrement the device's BTL count and release the device at zero.
 *   5. Reset component-wide counters and destruct the component's ib_procs.
 *
 * @param btl  Module to tear down; cast to mca_btl_openib_module_t.
 * @return     OMPI_SUCCESS, or OMPI_ERROR if any ibv_destroy_srq() call
 *             returned non-zero (teardown still continues in that case).
 */
static int ft_event_btl_openib_finalize(struct mca_btl_base_module_t* btl) {
|
||||
mca_btl_openib_module_t* openib_btl;
|
||||
mca_btl_openib_endpoint_t* endpoint;
|
||||
int ep_index, i;
|
||||
int qp, rc = OMPI_SUCCESS;
|
||||

||||
openib_btl = (mca_btl_openib_module_t*) btl;
|
||||

||||
/* Release all QPs */
|
||||
for(ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
ep_index++) {
|
||||
endpoint=opal_pointer_array_get_item(openib_btl->device->endpoints,
|
||||
ep_index);
|
||||
if(!endpoint) {
|
||||
BTL_VERBOSE(("In finalize, got another null endpoint"));
|
||||
continue;
|
||||
}
|
||||
/* The endpoint array is per device: skip entries whose endpoint_btl is not this module. */
if(endpoint->endpoint_btl != openib_btl)
|
||||
continue;
|
||||
/* Clear every eager-RDMA slot that points at this endpoint, dropping one reference per slot. */
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
|
||||
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
|
||||
openib_btl->device->eager_rdma_buffers[i] = NULL;
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
}
|
||||
/* One further release for the endpoint itself (presumably the endpoint array's reference -- TODO confirm). */
OBJ_RELEASE(endpoint);
|
||||
}
|
||||

||||
/* Finalize the CPC modules on this openib module */
|
||||
for (i = 0; i < openib_btl->num_cpcs; ++i) {
|
||||
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
|
||||
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs);
|
||||

||||
/* Release SRQ resources */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
/* A non-zero return is recorded as OMPI_ERROR, but teardown keeps going. */
if (ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)){
|
||||
BTL_VERBOSE(("Failed to close SRQ %d", qp));
|
||||
rc = OMPI_ERROR;
|
||||
}
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
}
|
||||
}
|
||||

||||
/* Release device if there are no more users */
|
||||
if(!(--openib_btl->device->btls)) {
|
||||
OBJ_RELEASE(openib_btl->device);
|
||||
}
|
||||
/* NOTE(review): these are component-wide fields, yet they are reset (and
 * ib_procs destructed) on every per-module call. The caller's loop in
 * mca_btl_openib_ft_event is bounded by ib_num_btls, so it appears to stop
 * after the first module -- confirm this is intended. */
mca_btl_openib_component.devices_count = 0;
|
||||
mca_btl_openib_component.ib_num_btls = 0;
|
||||
OBJ_DESTRUCT(&mca_btl_openib_component.ib_procs);
|
||||

||||
BTL_VERBOSE(("Success in closing BTL resources"));
|
||||

||||
return rc;
|
||||
}
|
||||
|
||||
#endif /* OPAL_ENABLE_FT */
|
||||
|
@ -111,8 +111,8 @@ mca_btl_openib_component_t mca_btl_openib_component = {
|
||||
btl_openib_component_close /* component close */
|
||||
},
|
||||
{
|
||||
/* The component is not checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_NONE
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
btl_openib_component_init,
|
||||
|
@ -500,5 +500,6 @@ void ompi_btl_openib_connect_base_finalize(void)
|
||||
}
|
||||
}
|
||||
free(available);
|
||||
available = NULL;
|
||||
}
|
||||
}
|
||||
|
@ -44,6 +44,12 @@
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/common/sm/common_sm_mmap.h"
|
||||
#include "ompi/mca/mpool/sm/mpool_sm.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#endif
|
||||
|
||||
#include "btl_sm.h"
|
||||
#include "btl_sm_endpoint.h"
|
||||
#include "btl_sm_frag.h"
|
||||
@ -824,15 +830,55 @@ int mca_btl_sm_send( struct mca_btl_base_module_t* btl,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Variant compiled when OPAL_ENABLE_FT == 0: ignores `state` and always returns OMPI_SUCCESS. */
int mca_btl_sm_ft_event(int state) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
int mca_btl_sm_ft_event(int state) {
|
||||
/* Notify mpool */
|
||||
if( NULL != mca_btl_sm_component.sm_mpool &&
|
||||
NULL != mca_btl_sm_component.sm_mpool->mpool_ft_event) {
|
||||
mca_btl_sm_component.sm_mpool->mpool_ft_event(state);
|
||||
}
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
if( NULL != mca_btl_sm_component.mmap_file ) {
|
||||
/* On restart we need the old file names to exist (not necessarily
|
||||
* contain content) so the CRS component does not fail when searching
|
||||
* for these old file handles. The restart procedure will make sure
|
||||
* these files get cleaned up appropriately.
|
||||
*/
|
||||
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, mca_btl_sm_component.mmap_file->map_path);
|
||||
|
||||
/* Record the job session directory */
|
||||
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_MKDIR, orte_process_info.job_session_dir);
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
if( ompi_cr_continue_like_restart ) {
|
||||
if( NULL != mca_btl_sm_component.mmap_file ) {
|
||||
/* Do not Add session directory on continue */
|
||||
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false);
|
||||
}
|
||||
|
||||
/* Clear this so we force the module to re-init the sm files */
|
||||
mca_btl_sm_component.sm_mpool = NULL;
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART == state) {
|
||||
;
|
||||
else if(OPAL_CRS_RESTART == state ||
|
||||
OPAL_CRS_RESTART_PRE == state) {
|
||||
if( NULL != mca_btl_sm_component.mmap_file ) {
|
||||
/* Add session directory */
|
||||
opal_crs_base_cleanup_append(orte_process_info.job_session_dir, true);
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_sm_component.mmap_file->map_path, false);
|
||||
}
|
||||
|
||||
/* Clear this so we force the module to re-init the sm files */
|
||||
mca_btl_sm_component.sm_mpool = NULL;
|
||||
}
|
||||
else if(OPAL_CRS_TERM == state ) {
|
||||
;
|
||||
@ -843,3 +889,4 @@ int mca_btl_sm_ft_event(int state) {
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT */
|
||||
|
@ -52,6 +52,11 @@
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/mca/common/sm/common_sm_mmap.h"
|
||||
#include "ompi/mca/btl/base/btl_base_error.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#endif
|
||||
|
||||
#include "btl_sm.h"
|
||||
#include "btl_sm_frag.h"
|
||||
#include "btl_sm_fifo.h"
|
||||
@ -74,8 +79,8 @@ mca_btl_sm_component_t mca_btl_sm_component = {
|
||||
mca_btl_sm_component_close /* component close */
|
||||
},
|
||||
{
|
||||
/* The component is not checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_NONE
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
mca_btl_sm_component_init,
|
||||
@ -210,7 +215,17 @@ int mca_btl_sm_component_close(void)
|
||||
* to it are gone - no error checking, since we want all procs
|
||||
* to call this, so that in an abnormal termination scenario,
|
||||
* this file will still get cleaned up */
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Only unlink the file if we are *not* restarting
|
||||
* If we are restarting the file will be unlinked at a later time.
|
||||
*/
|
||||
if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
|
||||
OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
|
||||
unlink(mca_btl_sm_component.mmap_file->map_path);
|
||||
}
|
||||
#else
|
||||
unlink(mca_btl_sm_component.mmap_file->map_path);
|
||||
#endif
|
||||
OBJ_RELEASE(mca_btl_sm_component.mmap_file);
|
||||
}
|
||||
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include "ompi/mca/crcp/base/base.h"
|
||||
|
||||
#include "ompi/class/ompi_free_list.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
#include "crcp_bkmrk.h"
|
||||
#include "crcp_bkmrk_pml.h"
|
||||
@ -2971,6 +2972,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
ompi_crcp_base_pml_state_t* pml_state)
|
||||
{
|
||||
static int step_to_return_to = 0;
|
||||
static bool first_continue_pass = false;
|
||||
opal_list_item_t* item = NULL;
|
||||
int exit_status = OMPI_SUCCESS;
|
||||
int ret;
|
||||
@ -2992,6 +2994,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
||||
|
||||
START_TIMER(CRCP_TIMER_TOTAL_CKPT);
|
||||
STEP_1:
|
||||
step_to_return_to = 0;
|
||||
@ -3030,7 +3038,15 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Only finalize the Protocol after the PML has been rebuilt */
|
||||
if( ompi_cr_continue_like_restart && first_continue_pass ) {
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
START_TIMER(CRCP_TIMER_TOTAL_CONT);
|
||||
|
||||
/*
|
||||
* Finish the coord protocol
|
||||
*/
|
||||
@ -3045,6 +3061,12 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
DISPLAY_ALL_TIMERS(state);
|
||||
clear_timers();
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
||||
}
|
||||
/*****************************
|
||||
* Restart from a checkpoint
|
||||
|
@ -122,6 +122,14 @@ int mca_mpool_rdma_release_memory(mca_mpool_base_module_t* mpool, void *base,
|
||||
* finalize mpool
|
||||
*/
|
||||
void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool);
|
||||
|
||||
/**
|
||||
* Fault Tolerance Event Notification Function
|
||||
* @param state Checkpoint State
|
||||
* @return OMPI_SUCCESS or failure status
|
||||
*/
|
||||
int mca_mpool_rdma_ft_event(int state);
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -56,8 +56,8 @@ mca_mpool_rdma_component_t mca_mpool_rdma_component = {
|
||||
NULL
|
||||
},
|
||||
{
|
||||
/* The component is not checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_NONE
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
mca_mpool_rdma_init
|
||||
|
@ -54,6 +54,7 @@ void mca_mpool_rdma_module_init(mca_mpool_rdma_module_t* mpool)
|
||||
mpool->super.mpool_deregister = mca_mpool_rdma_deregister;
|
||||
mpool->super.mpool_release_memory = mca_mpool_rdma_release_memory;
|
||||
mpool->super.mpool_finalize = mca_mpool_rdma_finalize;
|
||||
mpool->super.mpool_ft_event = mca_mpool_rdma_ft_event;
|
||||
mpool->super.rcache =
|
||||
mca_rcache_base_module_create(mca_mpool_rdma_component.rcache_name);
|
||||
mpool->super.flags = MCA_MPOOL_FLAGS_MPI_ALLOC_MEM;
|
||||
@ -478,3 +479,24 @@ void mca_mpool_rdma_finalize(struct mca_mpool_base_module_t *mpool)
|
||||
OBJ_DESTRUCT(&mpool_rdma->reg_list);
|
||||
OPAL_THREAD_UNLOCK(&mpool->rcache->lock);
|
||||
}
|
||||
|
||||
/*
 * Fault-tolerance event hook for the rdma mpool.
 *
 * Every branch below is an empty statement: the function distinguishes the
 * CHECKPOINT, CONTINUE, RESTART / RESTART_PRE and TERM states (plus a
 * catch-all) but performs no work for any of them.
 *
 * @param state  One of the OPAL_CRS_* state values.
 * @return       Always OMPI_SUCCESS.
 */
int mca_mpool_rdma_ft_event(int state) {
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
}
|
||||
else if(OPAL_CRS_RESTART == state ||
|
||||
OPAL_CRS_RESTART_PRE == state) {
|
||||
;
|
||||
}
|
||||
else if(OPAL_CRS_TERM == state ) {
|
||||
;
|
||||
}
|
||||
/* Any other state value: likewise nothing to do. */
else {
|
||||
;
|
||||
}
|
||||

||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -38,6 +38,10 @@
|
||||
#include "ompi/mca/common/sm/common_sm_mmap.h"
|
||||
#include "ompi/proc/proc.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "opal/runtime/opal_cr.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Local functions
|
||||
*/
|
||||
@ -62,8 +66,8 @@ mca_mpool_sm_component_t mca_mpool_sm_component = {
|
||||
mca_mpool_sm_close
|
||||
},
|
||||
{
|
||||
/* The component is not checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_NONE
|
||||
/* The component is checkpoint ready */
|
||||
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
||||
},
|
||||
|
||||
mca_mpool_sm_init
|
||||
@ -134,7 +138,17 @@ static int mca_mpool_sm_close( void )
|
||||
{
|
||||
if( NULL != mca_common_sm_mmap ) {
|
||||
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
/* Only unlink the file if we are *not* restarting
|
||||
* If we are restarting the file will be unlinked at a later time.
|
||||
*/
|
||||
if(OPAL_CR_STATUS_RESTART_PRE != opal_cr_checkpointing_state &&
|
||||
OPAL_CR_STATUS_RESTART_POST != opal_cr_checkpointing_state ) {
|
||||
unlink( mca_common_sm_mmap->map_path );
|
||||
}
|
||||
#else
|
||||
unlink( mca_common_sm_mmap->map_path );
|
||||
#endif
|
||||
}
|
||||
OBJ_RELEASE( mca_common_sm_mmap );
|
||||
}
|
||||
|
@ -28,7 +28,12 @@
|
||||
#include "opal/mca/maffinity/maffinity.h"
|
||||
#include "opal/mca/maffinity/maffinity_types.h"
|
||||
#include "opal/mca/maffinity/base/base.h"
|
||||
#include "orte/util/proc_info.h"
|
||||
|
||||
#if OPAL_ENABLE_FT == 1
|
||||
#include "ompi/mca/mpool/base/base.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Initializes the mpool module.
|
||||
@ -116,15 +121,54 @@ void mca_mpool_sm_free(mca_mpool_base_module_t* mpool, void * addr,
|
||||
mpool_sm->sm_allocator->alc_free(mpool_sm->sm_allocator, addr);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Variant compiled when OPAL_ENABLE_FT == 0: ignores `state` and always returns OMPI_SUCCESS. */
int mca_mpool_sm_ft_event(int state) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
int mca_mpool_sm_ft_event(int state) {
|
||||
mca_mpool_base_module_t *self_module = NULL;
|
||||
char * file_name = NULL;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
/* Record the shared memory filename */
|
||||
asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
||||
orte_process_info.job_session_dir,
|
||||
orte_process_info.nodename );
|
||||
opal_crs_base_metadata_write_token(NULL, CRS_METADATA_TOUCH, file_name);
|
||||
free(file_name);
|
||||
file_name = NULL;
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
if(ompi_cr_continue_like_restart) {
|
||||
/* Remove self from the list of all modules */
|
||||
self_module = mca_mpool_base_module_lookup("sm");
|
||||
mca_mpool_base_module_destroy(self_module);
|
||||
|
||||
/* Release the old sm file, if it exists */
|
||||
if( NULL != mca_common_sm_mmap ) {
|
||||
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
|
||||
/* Add old shared memory file for eventual removal */
|
||||
opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false);
|
||||
}
|
||||
OBJ_RELEASE( mca_common_sm_mmap );
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART == state) {
|
||||
;
|
||||
else if(OPAL_CRS_RESTART == state ||
|
||||
OPAL_CRS_RESTART_PRE == state) {
|
||||
/* Remove self from the list of all modules */
|
||||
self_module = mca_mpool_base_module_lookup("sm");
|
||||
mca_mpool_base_module_destroy(self_module);
|
||||
|
||||
/* Release the old sm file, if it exists */
|
||||
if( NULL != mca_common_sm_mmap ) {
|
||||
if( OMPI_SUCCESS == mca_common_sm_mmap_fini( mca_common_sm_mmap ) ) {
|
||||
/* Add old shared memory file for eventual removal */
|
||||
opal_crs_base_cleanup_append(mca_common_sm_mmap->map_path, false);
|
||||
}
|
||||
OBJ_RELEASE( mca_common_sm_mmap );
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_TERM == state ) {
|
||||
;
|
||||
@ -135,3 +179,4 @@ int mca_mpool_sm_ft_event(int state) {
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT */
|
||||
|
@ -547,17 +547,61 @@ void mca_pml_ob1_error_handler(
|
||||
orte_errmgr.abort(-1, NULL);
|
||||
}
|
||||
|
||||
#if OPAL_ENABLE_FT == 0
|
||||
/* Variant compiled when OPAL_ENABLE_FT == 0: ignores `state` and always returns OMPI_SUCCESS. */
int mca_pml_ob1_ft_event( int state ) {
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#else
|
||||
int mca_pml_ob1_ft_event( int state )
|
||||
{
|
||||
static bool first_continue_pass = false;
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
int ret, p;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
|
||||
if( ompi_cr_continue_like_restart && !first_continue_pass ) {
|
||||
/*
|
||||
* Get a list of processes
|
||||
*/
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Refresh the proc structure, and publish our proc info in the modex.
|
||||
* NOTE: Do *not* call ompi_proc_finalize as there are many places in
|
||||
* the code that point to indv. procs in this structure. For our
|
||||
* needs here we only need to fix up the modex, bml and pml
|
||||
* references.
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): proc_refresh Failed %d",
|
||||
ret);
|
||||
free (procs);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART_PRE == state ) {
|
||||
/* Nothing here */
|
||||
@ -612,10 +656,64 @@ int mca_pml_ob1_ft_event( int state )
|
||||
}
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
;
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
|
||||
/* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
;
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if( ompi_cr_continue_like_restart && !first_continue_pass ) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
* Add the new procs (BTLs redo modex recv's)
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
free(procs);
|
||||
procs = NULL;
|
||||
}
|
||||
}
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
orte_grpcomm.barrier();
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_RESTART_PRE == state ) {
|
||||
/* Nothing here */
|
||||
@ -664,6 +762,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT */
|
||||
|
||||
int mca_pml_ob1_com_btl_comp(const void *v1, const void *v2)
|
||||
{
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/runtime/runtime.h"
|
||||
#include "orte/util/show_help.h"
|
||||
#include "orte/mca/grpcomm/grpcomm.h"
|
||||
|
||||
#include "ompi/constants.h"
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
@ -72,6 +73,8 @@ static int ompi_cr_coord_post_ckpt(void);
|
||||
static int ompi_cr_coord_post_restart(void);
|
||||
static int ompi_cr_coord_post_continue(void);
|
||||
|
||||
bool ompi_cr_continue_like_restart = false;
|
||||
|
||||
/*************
|
||||
* Local vars
|
||||
*************/
|
||||
@ -159,6 +162,9 @@ int ompi_cr_init(void)
|
||||
ompi_cr_output = opal_cr_output;
|
||||
}
|
||||
|
||||
/* Typically this is not needed. Individual BTLs will set this as needed */
|
||||
ompi_cr_continue_like_restart = false;
|
||||
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"ompi_cr: init: ompi_cr_init()");
|
||||
|
||||
@ -195,6 +201,9 @@ int ompi_cr_coord(int state)
|
||||
* take action given the state.
|
||||
*/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Default: use the fast way */
|
||||
ompi_cr_continue_like_restart = false;
|
||||
|
||||
/* Do Checkpoint Phase work */
|
||||
ret = ompi_cr_coord_pre_ckpt();
|
||||
if( ret == OMPI_EXISTS) {
|
||||
@ -317,6 +326,8 @@ static int ompi_cr_coord_pre_restart(void) {
|
||||
}
|
||||
|
||||
static int ompi_cr_coord_pre_continue(void) {
|
||||
int ret, exit_status = OMPI_SUCCESS;
|
||||
|
||||
/*
|
||||
* Can not really do much until ORTE is up and running,
|
||||
* so defer action until the post_continue function.
|
||||
@ -324,7 +335,26 @@ static int ompi_cr_coord_pre_continue(void) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
if( ompi_cr_continue_like_restart ) {
|
||||
/* Mimic ompi_cr_coord_pre_restart(); */
|
||||
if( ORTE_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
/*************
|
||||
|
@ -49,6 +49,12 @@ extern "C" {
|
||||
*/
|
||||
OMPI_DECLSPEC extern int ompi_cr_output;
|
||||
|
||||
/*
|
||||
* If one of the BTLs that shutdown require a full, clean rebuild of the
|
||||
* point-to-point stack on 'continue' as well as 'restart'.
|
||||
*/
|
||||
OPAL_DECLSPEC extern bool ompi_cr_continue_like_restart;
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -503,6 +503,21 @@ int mca_base_param_set_int(int index, int value)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Deregister a parameter
|
||||
*/
|
||||
int mca_base_param_deregister(int index)
|
||||
{
|
||||
size_t size;
|
||||
|
||||
/* Lookup the index and see if it's valid */
|
||||
size = opal_value_array_get_size(&mca_base_params);
|
||||
if (index < 0 || ((size_t) index) > size) {
|
||||
return OPAL_ERROR;
|
||||
}
|
||||
|
||||
return opal_value_array_remove_item(&mca_base_params, index);
|
||||
}
|
||||
|
||||
/*
|
||||
* Look up a string MCA parameter.
|
||||
|
@ -470,6 +470,14 @@ extern "C" {
|
||||
const char *syn_param_name,
|
||||
bool deprecated);
|
||||
|
||||
/**
|
||||
* Deregister an MCA parameter
|
||||
*
|
||||
* @param index Index returned from mca_base_param_register_init()
|
||||
*
|
||||
*/
|
||||
OPAL_DECLSPEC int mca_base_param_deregister(int index);
|
||||
|
||||
/**
|
||||
* Look up an integer MCA parameter.
|
||||
*
|
||||
|
@ -304,7 +304,7 @@ int opal_crs_base_cleanup_flush(void)
|
||||
argc = opal_argv_count(cleanup_file_argv);
|
||||
for( i = 0; i < argc; ++i) {
|
||||
opal_output_verbose(15, opal_crs_base_output,
|
||||
"opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_dir_argv[i]);
|
||||
"opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_file_argv[i]);
|
||||
unlink(cleanup_file_argv[i]);
|
||||
}
|
||||
|
||||
|
@ -77,6 +77,14 @@ bool opal_cr_stall_check = false;
|
||||
bool opal_cr_currently_stalled = false;
|
||||
int opal_cr_output;
|
||||
|
||||
static double opal_cr_get_time(void);
|
||||
static void display_indv_timer_core(double diff, char *str);
|
||||
static double timer_start[OPAL_CR_TIMER_MAX];
|
||||
bool opal_cr_timing_barrier_enabled = false;
|
||||
bool opal_cr_timing_enabled = false;
|
||||
int opal_cr_timing_my_rank = 0;
|
||||
int opal_cr_timing_target_rank = 0;
|
||||
|
||||
/******************
|
||||
* Local Functions & Var Decls
|
||||
******************/
|
||||
@ -214,6 +222,28 @@ int opal_cr_init(void )
|
||||
"opal_cr: init: FT Enabled: %d",
|
||||
val);
|
||||
|
||||
mca_base_param_reg_int_name("opal_cr", "enable_timer",
|
||||
"Enable Checkpoint timer (Default: Disabled)",
|
||||
false, false,
|
||||
0, &val);
|
||||
opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);
|
||||
|
||||
mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
|
||||
"Enable Checkpoint timer Barrier (Default: Disabled)",
|
||||
false, false,
|
||||
0, &val);
|
||||
if( opal_cr_timing_enabled ) {
|
||||
opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
|
||||
} else {
|
||||
opal_cr_timing_barrier_enabled = false;
|
||||
}
|
||||
|
||||
mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
|
||||
"Target Rank for the timer (Default: 0)",
|
||||
false, false,
|
||||
0, &val);
|
||||
opal_cr_timing_target_rank = val;
|
||||
|
||||
#if OPAL_ENABLE_FT_THREAD == 1
|
||||
mca_base_param_reg_int_name("opal_cr", "use_thread",
|
||||
"Use an async thread to checkpoint this program (Default: Disabled)",
|
||||
@ -505,6 +535,7 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
|
||||
/*
|
||||
* Take the checkpoint
|
||||
*/
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE0);
|
||||
if(OPAL_SUCCESS != (ret = opal_crs.crs_checkpoint(pid, snapshot, (opal_crs_state_type_t *)state))) {
|
||||
opal_output(opal_cr_output,
|
||||
"opal_cr: inc_core: Error: The checkpoint failed. %d\n", ret);
|
||||
@ -513,6 +544,8 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
|
||||
}
|
||||
|
||||
if(*state == OPAL_CRS_CONTINUE) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE1);
|
||||
|
||||
if(term) {
|
||||
*state = OPAL_CRS_TERM;
|
||||
opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
|
||||
@ -869,3 +902,127 @@ void opal_cr_thread_noop_progress(void)
|
||||
}
|
||||
|
||||
#endif /* OPAL_ENABLE_FT_THREAD == 1 */
|
||||
|
||||
/*
 * Return the current time in seconds as a double.
 *
 * When OPAL_TIMER_USEC_NATIVE is set, the value comes from
 * opal_timer_base_get_usec() converted from microseconds; otherwise it is
 * assembled from the seconds and microseconds reported by gettimeofday().
 */
static double opal_cr_get_time() {
#if OPAL_TIMER_USEC_NATIVE
    double seconds;

    /* microseconds -> seconds */
    seconds = (double)opal_timer_base_get_usec() / 1000000.0;

    return seconds;
#else
    struct timeval now;
    double seconds;

    gettimeofday(&now, NULL);

    /* whole seconds first, then add the fractional (microsecond) part */
    seconds  = now.tv_sec;
    seconds += (double)now.tv_usec / 1000000.0;

    return seconds;
#endif
}
|
||||
|
||||
void opal_cr_set_time(int idx)
|
||||
{
|
||||
if(idx < OPAL_CR_TIMER_MAX ) {
|
||||
if( timer_start[idx] <= 0.0 ) {
|
||||
timer_start[idx] = opal_cr_get_time();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void opal_cr_clear_timers(void)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < OPAL_CR_TIMER_MAX; ++i) {
|
||||
timer_start[i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
static void display_indv_timer_core(double diff, char *str) {
|
||||
double total = 0;
|
||||
double perc = 0;
|
||||
|
||||
total = timer_start[OPAL_CR_TIMER_MAX-1] - timer_start[OPAL_CR_TIMER_ENTRY0];
|
||||
perc = (diff/total) * 100;
|
||||
|
||||
opal_output(0,
|
||||
"opal_cr: timing: %-20s = %10.2f s\t%10.2f s\t%6.2f\n",
|
||||
str,
|
||||
diff,
|
||||
total,
|
||||
perc);
|
||||
return;
|
||||
}
|
||||
|
||||
void opal_cr_display_all_timers(void)
|
||||
{
|
||||
double diff = 0.0;
|
||||
char * label = NULL;
|
||||
|
||||
if( opal_cr_timing_target_rank != opal_cr_timing_my_rank ) {
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output(0, "OPAL CR Timing: ******************** Summary Begin\n");
|
||||
|
||||
/********** Entry into the system **********/
|
||||
label = strdup("Start Entry Point");
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
diff = timer_start[OPAL_CR_TIMER_CRCPBR0] - timer_start[OPAL_CR_TIMER_ENTRY0];
|
||||
} else {
|
||||
diff = timer_start[OPAL_CR_TIMER_CRCP0] - timer_start[OPAL_CR_TIMER_ENTRY0];
|
||||
}
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** CRCP Protocol **********/
|
||||
label = strdup("CRCP Protocol");
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
diff = timer_start[OPAL_CR_TIMER_CRCPBR1] - timer_start[OPAL_CR_TIMER_CRCP0];
|
||||
} else {
|
||||
diff = timer_start[OPAL_CR_TIMER_P2P0] - timer_start[OPAL_CR_TIMER_CRCP0];
|
||||
}
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** P2P Suspend **********/
|
||||
label = strdup("P2P Suspend");
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
diff = timer_start[OPAL_CR_TIMER_P2PBR0] - timer_start[OPAL_CR_TIMER_P2P0];
|
||||
} else {
|
||||
diff = timer_start[OPAL_CR_TIMER_CORE0] - timer_start[OPAL_CR_TIMER_P2P0];
|
||||
}
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** Checkpoint to Disk **********/
|
||||
label = strdup("Checkpoint");
|
||||
diff = timer_start[OPAL_CR_TIMER_CORE1] - timer_start[OPAL_CR_TIMER_CORE0];
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** P2P Reactivation **********/
|
||||
label = strdup("P2P Reactivation");
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
diff = timer_start[OPAL_CR_TIMER_P2PBR2] - timer_start[OPAL_CR_TIMER_CORE1];
|
||||
} else {
|
||||
diff = timer_start[OPAL_CR_TIMER_CRCP1] - timer_start[OPAL_CR_TIMER_CORE1];
|
||||
}
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** CRCP Protocol Finalize **********/
|
||||
label = strdup("CRCP Cleanup");
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
diff = timer_start[OPAL_CR_TIMER_COREBR1] - timer_start[OPAL_CR_TIMER_CRCP1];
|
||||
} else {
|
||||
diff = timer_start[OPAL_CR_TIMER_CORE2] - timer_start[OPAL_CR_TIMER_CRCP1];
|
||||
}
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
/********** Exit the system **********/
|
||||
label = strdup("Finish Entry Point");
|
||||
diff = timer_start[OPAL_CR_TIMER_ENTRY4] - timer_start[OPAL_CR_TIMER_CORE2];
|
||||
display_indv_timer_core(diff, label);
|
||||
free(label);
|
||||
|
||||
opal_output(0, "OPAL CR Timing: ******************** Summary End\n");
|
||||
}
|
||||
|
@ -267,7 +267,65 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
||||
* OPAL Checkpoint Coordination Routine
|
||||
*/
|
||||
OPAL_DECLSPEC int opal_cr_coord(int state);
|
||||
|
||||
|
||||
/**
|
||||
* Checkpoint life-cycle timing
|
||||
*/
|
||||
OPAL_DECLSPEC void opal_cr_set_time(int idx);
|
||||
OPAL_DECLSPEC void opal_cr_display_all_timers(void);
|
||||
OPAL_DECLSPEC void opal_cr_clear_timers(void);
|
||||
|
||||
OPAL_DECLSPEC extern bool opal_cr_timing_enabled;
|
||||
OPAL_DECLSPEC extern bool opal_cr_timing_barrier_enabled;
|
||||
OPAL_DECLSPEC extern int opal_cr_timing_my_rank;
|
||||
OPAL_DECLSPEC extern int opal_cr_timing_target_rank;
|
||||
|
||||
|
||||
#define OPAL_CR_TIMER_ENTRY0 0
|
||||
#define OPAL_CR_TIMER_ENTRY1 1
|
||||
#define OPAL_CR_TIMER_ENTRY2 2
|
||||
#define OPAL_CR_TIMER_CRCPBR0 3
|
||||
#define OPAL_CR_TIMER_CRCP0 4
|
||||
#define OPAL_CR_TIMER_CRCPBR1 5
|
||||
#define OPAL_CR_TIMER_P2P0 6
|
||||
#define OPAL_CR_TIMER_P2P1 7
|
||||
#define OPAL_CR_TIMER_P2PBR0 8
|
||||
#define OPAL_CR_TIMER_CORE0 9
|
||||
#define OPAL_CR_TIMER_CORE1 10
|
||||
#define OPAL_CR_TIMER_COREBR0 11
|
||||
#define OPAL_CR_TIMER_P2P2 12
|
||||
#define OPAL_CR_TIMER_P2PBR1 13
|
||||
#define OPAL_CR_TIMER_P2P3 14
|
||||
#define OPAL_CR_TIMER_P2PBR2 15
|
||||
#define OPAL_CR_TIMER_CRCP1 16
|
||||
#define OPAL_CR_TIMER_COREBR1 17
|
||||
#define OPAL_CR_TIMER_CORE2 18
|
||||
#define OPAL_CR_TIMER_ENTRY3 19
|
||||
#define OPAL_CR_TIMER_ENTRY4 20
|
||||
#define OPAL_CR_TIMER_MAX 21
|
||||
|
||||
|
||||
/*
 * Reset all checkpoint timers, but only when checkpoint timing is enabled.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 */
#define OPAL_CR_CLEAR_TIMERS()                          \
    do {                                                \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {     \
            opal_cr_clear_timers();                     \
        }                                               \
    } while (0)
|
||||
|
||||
/*
 * Stamp timer slot `idx` via opal_cr_set_time(), but only when checkpoint
 * timing is enabled.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 */
#define OPAL_CR_SET_TIMER(idx)                          \
    do {                                                \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {     \
            opal_cr_set_time(idx);                      \
        }                                               \
    } while (0)
|
||||
|
||||
/*
 * Print the checkpoint timing summary via opal_cr_display_all_timers(),
 * but only when checkpoint timing is enabled.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 */
#define OPAL_CR_DISPLAY_ALL_TIMERS()                    \
    do {                                                \
        if(OPAL_UNLIKELY(opal_cr_timing_enabled)) {     \
            opal_cr_display_all_timers();               \
        }                                               \
    } while (0)
|
||||
|
||||
#if defined(c_plusplus) || defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
@ -1613,11 +1613,13 @@ int mca_oob_tcp_ft_event(int state) {
|
||||
*/
|
||||
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
opal_event_disable();
|
||||
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
/*
|
||||
* Resume event processing
|
||||
*/
|
||||
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
opal_event_enable();
|
||||
OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
}
|
||||
@ -1636,6 +1638,8 @@ int mca_oob_tcp_ft_event(int state) {
|
||||
MCA_OOB_TCP_PEER_RETURN(peer);
|
||||
}
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_lock);
|
||||
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_free);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peer_names);
|
||||
OBJ_DESTRUCT(&mca_oob_tcp_component.tcp_peers);
|
||||
|
@ -173,6 +173,10 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
goto STAGE_1;
|
||||
}
|
||||
|
||||
OPAL_CR_CLEAR_TIMERS();
|
||||
opal_cr_timing_my_rank = ORTE_PROC_MY_NAME->vpid;
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY0);
|
||||
|
||||
/*
|
||||
* Open communication channels
|
||||
*/
|
||||
@ -193,6 +197,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
goto ckpt_cleanup;
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY1);
|
||||
|
||||
/*
|
||||
* Begin checkpoint
|
||||
* - Init the checkpoint metadata file
|
||||
@ -205,6 +211,8 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
goto ckpt_cleanup;
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY2);
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||
"App) notify_response: Start checkpoint..."));
|
||||
STAGE_1:
|
||||
@ -255,6 +263,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
/*
|
||||
* Final Handshake
|
||||
*/
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY3);
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||
"App) notify_response: Waiting for final handshake."));
|
||||
if( ORTE_SUCCESS != (ret = snapc_full_app_ckpt_handshake_end(cr_state ) ) ) {
|
||||
@ -282,6 +291,11 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
|
||||
opal_cr_currently_stalled = false;
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_ENTRY4);
|
||||
if(OPAL_CRS_RESTART != cr_state) {
|
||||
OPAL_CR_DISPLAY_ALL_TIMERS();
|
||||
}
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user