Merge pull request #348 from adrianreber/topic/orte_cr_continue_like_restart
Topic/orte cr continue like restart
Этот коммит содержится в:
Коммит
714d9aa67e
@ -27,6 +27,7 @@
|
||||
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
@ -48,7 +49,6 @@ int mca_bml_r2_ft_event(int state)
|
||||
int loc_state;
|
||||
int param_type = -1;
|
||||
const char **btl_list;
|
||||
ompi_rte_collective_t coll;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Do nothing for now */
|
||||
@ -57,7 +57,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
@ -139,7 +139,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
/* Matches OPAL_CRS_RESTART_PRE */
|
||||
if( orte_cr_continue_like_restart && first_continue_pass) {
|
||||
if (opal_cr_continue_like_restart && first_continue_pass) {
|
||||
if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
|
||||
return ret;
|
||||
@ -150,20 +150,12 @@ int mca_bml_r2_ft_event(int state)
|
||||
}
|
||||
}
|
||||
/* Matches OPAL_CRS_RESTART */
|
||||
else if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
else if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
@ -233,15 +225,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/rte/rte.h"
|
||||
@ -1445,8 +1446,8 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_add_procs(
|
||||
for( i = 0; i < nprocs; ++i) {
|
||||
HOKE_PEER_REF_ALLOC(new_peer_ref);
|
||||
|
||||
new_peer_ref->proc_name.jobid = procs[i]->proc_name.jobid;
|
||||
new_peer_ref->proc_name.vpid = procs[i]->proc_name.vpid;
|
||||
new_peer_ref->proc_name.jobid = OMPI_CAST_RTE_NAME(&procs[i]->super.proc_name)->jobid;
|
||||
new_peer_ref->proc_name.vpid = OMPI_CAST_RTE_NAME(&procs[i]->super.proc_name)->vpid;
|
||||
|
||||
opal_list_append(&ompi_crcp_bkmrk_pml_peer_refs, &(new_peer_ref->super));
|
||||
}
|
||||
@ -1474,11 +1475,11 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_del_procs(
|
||||
"crcp:bkmrk: pml_del_procs()"));
|
||||
|
||||
for( i = 0; i < nprocs; ++i) {
|
||||
item = (opal_list_item_t*)find_peer(procs[i]->proc_name);
|
||||
item = (opal_list_item_t*)find_peer(*(ompi_process_name_t*)&procs[i]->super.proc_name);
|
||||
if(NULL == item) {
|
||||
opal_output(mca_crcp_bkmrk_component.super.output_handle,
|
||||
"crcp:bkmrk: del_procs: Unable to find peer %s\n",
|
||||
OMPI_NAME_PRINT(&(procs[i]->proc_name)));
|
||||
OMPI_NAME_PRINT(&procs[i]->super.proc_name));
|
||||
exit_status = OMPI_ERROR;
|
||||
goto DONE;
|
||||
}
|
||||
@ -3006,12 +3007,9 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
static bool first_continue_pass = false;
|
||||
opal_list_item_t* item = NULL;
|
||||
int exit_status = OMPI_SUCCESS;
|
||||
ompi_rte_collective_t coll;
|
||||
int ret;
|
||||
|
||||
ft_event_state = state;
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
|
||||
if( step_to_return_to == 1 ) {
|
||||
goto STEP_1;
|
||||
@ -3030,8 +3028,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
||||
|
||||
@ -3076,7 +3073,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
first_continue_pass = !first_continue_pass;
|
||||
|
||||
/* Only finalize the Protocol after the PML has been rebuilt */
|
||||
if( orte_cr_continue_like_restart && first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && first_continue_pass) {
|
||||
goto DONE;
|
||||
}
|
||||
|
||||
@ -3099,8 +3096,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
||||
}
|
||||
@ -3156,7 +3152,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
}
|
||||
|
||||
DONE:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
step_to_return_to = 0;
|
||||
ft_event_state = OPAL_CRS_RUNNING;
|
||||
|
||||
@ -3919,7 +3914,7 @@ static int drain_message_find_any(size_t count, int tag, int peer,
|
||||
|
||||
if( OPAL_EQUAL != ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL,
|
||||
&(cur_peer_ref->proc_name),
|
||||
&(comm->c_local_group->grp_proc_pointers[peer]->proc_name)) ) {
|
||||
OMPI_CAST_RTE_NAME(&comm->c_local_group->grp_proc_pointers[peer]->super.proc_name))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -4162,7 +4157,7 @@ static ompi_crcp_bkmrk_pml_peer_ref_t * find_peer(ompi_process_name_t proc)
|
||||
static int find_peer_in_comm(struct ompi_communicator_t* comm, int proc_idx,
|
||||
ompi_crcp_bkmrk_pml_peer_ref_t **peer_ref)
|
||||
{
|
||||
*peer_ref = find_peer(comm->c_remote_group->grp_proc_pointers[proc_idx]->proc_name);
|
||||
*peer_ref = find_peer(*(ompi_process_name_t *)&comm->c_remote_group->grp_proc_pointers[proc_idx]->super.proc_name);
|
||||
|
||||
if( NULL == *peer_ref) {
|
||||
opal_output(mca_crcp_bkmrk_component.super.output_handle,
|
||||
@ -6212,19 +6207,15 @@ static void clear_timers(void) {
|
||||
static void display_all_timers(int state) {
|
||||
bool report_ready = false;
|
||||
double barrier_start, barrier_stop;
|
||||
ompi_rte_collective_t coll;
|
||||
int i;
|
||||
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
|
||||
if( 2 > timing_enabled ) {
|
||||
goto done;
|
||||
return;
|
||||
}
|
||||
else if( 2 == timing_enabled ) {
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
goto done;
|
||||
opal_pmix.fence(NULL, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -6234,7 +6225,7 @@ static void display_all_timers(int state) {
|
||||
}
|
||||
}
|
||||
if( !report_ready ) {
|
||||
goto done;
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output(0, "crcp:bkmrk: timing(%20s): ******************** Begin: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
|
||||
@ -6244,8 +6235,7 @@ static void display_all_timers(int state) {
|
||||
|
||||
if( timing_enabled >= 2) {
|
||||
barrier_start = get_time();
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
barrier_stop = get_time();
|
||||
opal_output(0,
|
||||
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",
|
||||
@ -6256,8 +6246,6 @@ static void display_all_timers(int state) {
|
||||
|
||||
opal_output(0, "crcp:bkmrk: timing(%20s): ******************** End: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
|
||||
|
||||
done:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
}
|
||||
|
||||
static void display_indv_timer(int idx, int proc, int msgs) {
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
@ -663,15 +664,11 @@ int mca_pml_bfo_ft_event( int state )
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
int ret, p;
|
||||
ompi_rte_collective_t *coll, *modex;
|
||||
|
||||
coll = OBJ_NEW(ompi_rte_collective_t);
|
||||
coll->id = ompi_process_info.peer_init_barrier;
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -682,20 +679,18 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Get a list of processes
|
||||
*/
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto clean;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -713,7 +708,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
free (procs);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -726,8 +721,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
*/
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto clean;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -753,7 +747,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
free (procs);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_TERM == state ) {
|
||||
@ -785,28 +779,17 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -814,15 +797,11 @@ int mca_pml_bfo_ft_event( int state )
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -835,8 +814,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -849,17 +827,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
|
||||
opal_output(0,
|
||||
"pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -867,15 +835,11 @@ int mca_pml_bfo_ft_event( int state )
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_pml_bfo_add_procs(procs, num_procs) ) ) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -892,11 +856,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
;
|
||||
}
|
||||
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
clean:
|
||||
OBJ_RELEASE(coll);
|
||||
return ret;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
#include "pml_ob1.h"
|
||||
@ -792,15 +793,11 @@ int mca_pml_ob1_ft_event( int state )
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
int ret, p;
|
||||
ompi_rte_collective_t *coll, *modex;
|
||||
|
||||
coll = OBJ_NEW(ompi_rte_collective_t);
|
||||
coll->id = ompi_process_info.peer_init_barrier;
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -811,20 +808,18 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Get a list of processes
|
||||
*/
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto clean;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -842,7 +837,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
free (procs);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -855,8 +850,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
*/
|
||||
procs = ompi_proc_all(&num_procs);
|
||||
if(NULL == procs) {
|
||||
ret = OMPI_ERR_OUT_OF_RESOURCE;
|
||||
goto clean;
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -882,7 +876,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
OBJ_RELEASE(procs[p]);
|
||||
}
|
||||
free (procs);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_TERM == state ) {
|
||||
@ -914,28 +908,13 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart && !first_continue_pass ) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -943,15 +922,11 @@ int mca_pml_ob1_ft_event( int state )
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -964,8 +939,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -978,17 +952,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -996,15 +960,11 @@ int mca_pml_ob1_ft_event( int state )
|
||||
*/
|
||||
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||
goto clean;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -1021,11 +981,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
;
|
||||
}
|
||||
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
clean:
|
||||
OBJ_RELEASE(coll);
|
||||
return ret;
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
||||
|
@ -410,7 +410,7 @@ static int ompi_cr_coord_pre_continue(void) {
|
||||
opal_output_verbose(10, ompi_cr_output,
|
||||
"ompi_cr: coord_pre_continue: ompi_cr_coord_pre_continue()");
|
||||
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/* Mimic ompi_cr_coord_pre_restart(); */
|
||||
if( OMPI_SUCCESS != (ret = mca_pml.pml_ft_event(OPAL_CRS_CONTINUE))) {
|
||||
exit_status = ret;
|
||||
|
@ -1743,7 +1743,7 @@ int mca_btl_openib_ft_event(int state) {
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Continue must reconstruct the routes (including modex), since we
|
||||
* have to tear down the devices completely. */
|
||||
orte_cr_continue_like_restart = true;
|
||||
opal_cr_continue_like_restart = true;
|
||||
|
||||
/*
|
||||
* To keep the node from crashing we need to call ibv_close_device
|
||||
|
@ -1281,13 +1281,16 @@ int mca_btl_sm_ft_event(int state) {
|
||||
* for these old file handles. The restart procedure will make sure
|
||||
* these files get cleaned up appropriately.
|
||||
*/
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current,
|
||||
SSTORE_METADATA_LOCAL_TOUCH,
|
||||
mca_btl_sm_component.sm_seg->shmem_ds.seg_name);
|
||||
*/
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
if( NULL != mca_btl_sm_component.sm_seg ) {
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_sm_component.sm_seg->shmem_ds.seg_name, false);
|
||||
|
@ -1251,13 +1251,16 @@ int mca_btl_smcuda_ft_event(int state) {
|
||||
* for these old file handles. The restart procedure will make sure
|
||||
* these files get cleaned up appropriately.
|
||||
*/
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current,
|
||||
SSTORE_METADATA_LOCAL_TOUCH,
|
||||
mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name);
|
||||
*/
|
||||
}
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
if( NULL != mca_btl_smcuda_component.sm_seg ) {
|
||||
/* Add shared memory file */
|
||||
opal_crs_base_cleanup_append(mca_btl_smcuda_component.sm_seg->shmem_ds.seg_name, false);
|
||||
|
@ -173,12 +173,15 @@ int mca_mpool_sm_ft_event(int state) {
|
||||
asprintf( &file_name, "%s"OPAL_PATH_SEP"shared_mem_pool.%s",
|
||||
opal_process_info.job_session_dir,
|
||||
opal_proc_local_get()->proc_hostname );
|
||||
/* Disabled to get FT code compiled again
|
||||
* TODO: FIXIT soon
|
||||
orte_sstore.set_attr(orte_sstore_handle_current, SSTORE_METADATA_LOCAL_TOUCH, file_name);
|
||||
*/
|
||||
free(file_name);
|
||||
file_name = NULL;
|
||||
}
|
||||
else if(OPAL_CRS_CONTINUE == state) {
|
||||
if(orte_cr_continue_like_restart) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/* Find the sm module */
|
||||
self_module = mca_mpool_base_module_lookup("sm");
|
||||
self_sm_module = (mca_mpool_sm_module_t*) self_module;
|
||||
|
@ -131,6 +131,8 @@ int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
||||
|
||||
static bool opal_cr_debug_sigpipe = false;
|
||||
|
||||
bool opal_cr_continue_like_restart = false;
|
||||
|
||||
#if OPAL_ENABLE_FT_THREAD == 1
|
||||
/*****************
|
||||
* Threading Functions and Variables
|
||||
|
@ -91,6 +91,12 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
||||
/* The current state of a checkpoint operation */
|
||||
OPAL_DECLSPEC extern int opal_cr_checkpointing_state;
|
||||
|
||||
/*
|
||||
* If one of the BTLs that shutdown require a full, clean rebuild of the
|
||||
* point-to-point stack on 'continue' as well as 'restart'.
|
||||
*/
|
||||
OPAL_DECLSPEC extern bool opal_cr_continue_like_restart;
|
||||
|
||||
#if OPAL_ENABLE_CRDEBUG == 1
|
||||
/* Whether or not C/R Debugging is enabled for this process */
|
||||
OPAL_DECLSPEC extern int MPIR_debug_with_checkpoint;
|
||||
|
82
orte/mca/ess/env/ess_env_module.c
поставляемый
82
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -191,10 +191,6 @@ static int rte_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_proc_type_t svtype;
|
||||
orte_grpcomm_collective_t coll;
|
||||
|
||||
OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
|
||||
coll.id = orte_process_info.snapc_init_barrier;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
@ -203,8 +199,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -212,8 +207,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -221,8 +215,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
/******** Continue Recovery ********/
|
||||
@ -236,8 +229,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -245,8 +237,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -254,23 +245,15 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( orte_cr_continue_like_restart ) {
|
||||
if (opal_cr_continue_like_restart) {
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "ess:env: ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
|
||||
state, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll.active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
@ -291,11 +274,6 @@ static int rte_ft_event(int state)
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
|
||||
/*
|
||||
* Clear nidmap and jmap
|
||||
*/
|
||||
orte_util_nidmap_finalize();
|
||||
|
||||
/*
|
||||
* - Reset Contact information
|
||||
*/
|
||||
@ -308,8 +286,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -320,25 +297,14 @@ static int rte_ft_event(int state)
|
||||
orte_process_info.proc_type = ORTE_PROC_TOOL;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
orte_process_info.proc_type = svtype;
|
||||
if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Group Comm - Clean out stale data
|
||||
*/
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
/* RHC: you can't pass NULL as the identifier - what you'll need to do is
|
||||
* close all open dstore handles, and then open the ones you need
|
||||
*/
|
||||
@ -355,14 +321,12 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ORTE_SUCCESS != (ret = orte_plm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -370,8 +334,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -379,22 +342,14 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "ess:env ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
|
||||
state, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll.active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
@ -426,8 +381,7 @@ static int rte_ft_event(int state)
|
||||
*/
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
else if (OPAL_CRS_TERM == state ) {
|
||||
@ -437,8 +391,6 @@ static int rte_ft_event(int state)
|
||||
/* Error state = Nothing */
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -110,7 +111,6 @@ int app_coord_init()
|
||||
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_REQUEST_OP_CMD;
|
||||
orte_snapc_base_request_op_event_t op_event = ORTE_SNAPC_OP_INIT;
|
||||
opal_buffer_t *buffer = NULL;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
|
||||
"App) Initalized for Application %s\n",
|
||||
@ -154,16 +154,7 @@ int app_coord_init()
|
||||
"app) Startup Barrier..."));
|
||||
}
|
||||
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.snapc_init_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll->active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll->active);
|
||||
OBJ_RELEASE(coll);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
@ -217,7 +208,6 @@ int app_coord_finalize()
|
||||
orte_snapc_base_request_op_event_t op_event = ORTE_SNAPC_OP_FIN;
|
||||
opal_buffer_t *buffer = NULL;
|
||||
orte_std_cntr_t count;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
orte_rml_recv_cb_t *rb = NULL;
|
||||
|
||||
/*
|
||||
@ -230,15 +220,7 @@ int app_coord_finalize()
|
||||
"app) Shutdown Barrier..."));
|
||||
}
|
||||
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.snapc_init_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll->active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
@ -309,13 +291,6 @@ int app_coord_finalize()
|
||||
"app) Shutdown Barrier: Waiting on barrier...!"));
|
||||
}
|
||||
|
||||
coll->id = orte_process_info.snapc_fini_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
"app) Shutdown Barrier, Done!"));
|
||||
@ -332,8 +307,6 @@ int app_coord_finalize()
|
||||
rb = NULL;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(coll);
|
||||
|
||||
/*
|
||||
* Cleanup named pipes
|
||||
*/
|
||||
@ -398,7 +371,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
}
|
||||
|
||||
/* Default: use the fast way */
|
||||
orte_cr_continue_like_restart = false;
|
||||
opal_cr_continue_like_restart = false;
|
||||
orte_cr_flush_restart_files = true;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||
@ -480,7 +453,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
||||
* otherwise just continue.
|
||||
*/
|
||||
if( currently_all_migrating ) {
|
||||
orte_cr_continue_like_restart = true;
|
||||
opal_cr_continue_like_restart = true;
|
||||
orte_cr_flush_restart_files = false;
|
||||
}
|
||||
if( !currently_migrating && currently_all_migrating ) {
|
||||
|
@ -524,13 +524,6 @@ int global_coord_end_ckpt(orte_snapc_base_quiesce_t *datum)
|
||||
}
|
||||
#endif
|
||||
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_ESTABLISH);
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(current_global_jobid,
|
||||
ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL,
|
||||
@ -2106,6 +2099,7 @@ static int orte_snapc_full_global_set_job_ckpt_info( orte_jobid_t jobid,
|
||||
orte_proc_t *proc = NULL;
|
||||
opal_list_item_t *item = NULL;
|
||||
size_t num_procs;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
|
||||
/*
|
||||
* Update all Local Coordinators (broadcast operation)
|
||||
@ -2187,7 +2181,12 @@ static int orte_snapc_full_global_set_job_ckpt_info( orte_jobid_t jobid,
|
||||
free(state_str);
|
||||
state_str = NULL;
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, buffer, ORTE_RML_TAG_SNAPC_FULL))) {
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(sig, ORTE_RML_TAG_SNAPC_FULL, buffer))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -2204,6 +2203,7 @@ static int orte_snapc_full_global_set_job_ckpt_info( orte_jobid_t jobid,
|
||||
}
|
||||
|
||||
OBJ_RELEASE(buffer);
|
||||
OBJ_RELEASE(sig);
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
@ -1763,11 +1763,6 @@ static void snapc_full_local_comm_read_event(int fd, short flags, void *arg)
|
||||
}
|
||||
#endif
|
||||
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto cleanup;
|
||||
}
|
||||
flushed_modex = true;
|
||||
}
|
||||
|
||||
|
@ -1521,6 +1521,7 @@ static int xcast_remove_all(orte_sstore_stage_global_snapshot_info_t *handle_inf
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
opal_buffer_t *loc_buffer = NULL;
|
||||
orte_sstore_stage_cmd_flag_t command;
|
||||
orte_grpcomm_signature_t *sig;
|
||||
|
||||
handle_info->num_procs_done = 0;
|
||||
|
||||
@ -1539,7 +1540,12 @@ static int xcast_remove_all(orte_sstore_stage_global_snapshot_info_t *handle_inf
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.xcast(ORTE_PROC_MY_NAME->jobid, loc_buffer, ORTE_RML_TAG_SSTORE_INTERNAL))) {
|
||||
/* goes to all daemons */
|
||||
sig = OBJ_NEW(orte_grpcomm_signature_t);
|
||||
sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
|
||||
sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
|
||||
sig->signature[0].vpid = ORTE_VPID_WILDCARD;
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.xcast(sig, ORTE_RML_TAG_SSTORE_INTERNAL, loc_buffer))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
@ -1554,6 +1560,8 @@ static int xcast_remove_all(orte_sstore_stage_global_snapshot_info_t *handle_inf
|
||||
loc_buffer = NULL;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(sig);
|
||||
|
||||
return exit_status;
|
||||
}
|
||||
|
||||
|
@ -75,7 +75,6 @@ static int orte_cr_coord_post_ckpt(void);
|
||||
static int orte_cr_coord_post_restart(void);
|
||||
static int orte_cr_coord_post_continue(void);
|
||||
|
||||
bool orte_cr_continue_like_restart = false;
|
||||
bool orte_cr_flush_restart_files = true;
|
||||
|
||||
/*************
|
||||
@ -137,7 +136,7 @@ int orte_cr_init(void)
|
||||
opal_cr_reg_coord_callback(orte_cr_coord, &prev_coord_callback);
|
||||
|
||||
/* Typically this is not needed. Individual BTLs will set this as needed */
|
||||
orte_cr_continue_like_restart = false;
|
||||
opal_cr_continue_like_restart = false;
|
||||
orte_cr_flush_restart_files = true;
|
||||
|
||||
cleanup:
|
||||
|
@ -50,11 +50,6 @@ BEGIN_C_DECLS
|
||||
ORTE_DECLSPEC int orte_cr_entry_point_init(void);
|
||||
ORTE_DECLSPEC int orte_cr_entry_point_finalize(void);
|
||||
|
||||
/*
|
||||
* If one of the BTLs that shutdown require a full, clean rebuild of the
|
||||
* point-to-point stack on 'continue' as well as 'restart'.
|
||||
*/
|
||||
ORTE_DECLSPEC extern bool orte_cr_continue_like_restart;
|
||||
ORTE_DECLSPEC extern bool orte_cr_flush_restart_files;
|
||||
|
||||
END_C_DECLS
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user