FT: fix compilation using --with-ft (2/5)
Enabling the FT code breaks compilation (again). This series tries to fix the compiler errors. As before, it only fixes the compiler errors, with no guarantee that the result actually supports FT again. The FT code used barrier mechanisms that were removed in commit aec5cd08bd8c33677276612b899b48618d271efa. This patch replaces all of those different barriers with opal_pmix.fence(NULL, 0). I am not sure this is completely correct, but it is at least a starting point for a review.
Этот коммит содержится в:
родитель
f45dd069bd
Коммит
1c5a8df724
@ -27,6 +27,7 @@
|
||||
|
||||
#include "opal/runtime/opal_progress.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
@ -48,7 +49,6 @@ int mca_bml_r2_ft_event(int state)
|
||||
int loc_state;
|
||||
int param_type = -1;
|
||||
const char **btl_list;
|
||||
ompi_rte_collective_t coll;
|
||||
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
/* Do nothing for now */
|
||||
@ -155,15 +155,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
@ -233,15 +225,7 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(&coll))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
while (coll.active) {
|
||||
opal_progress();
|
||||
}
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "opal/util/opal_environ.h"
|
||||
#include "opal/mca/mca.h"
|
||||
#include "opal/mca/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/request/request.h"
|
||||
#include "ompi/mca/rte/rte.h"
|
||||
@ -3006,12 +3007,9 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
static bool first_continue_pass = false;
|
||||
opal_list_item_t* item = NULL;
|
||||
int exit_status = OMPI_SUCCESS;
|
||||
ompi_rte_collective_t coll;
|
||||
int ret;
|
||||
|
||||
ft_event_state = state;
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
|
||||
if( step_to_return_to == 1 ) {
|
||||
goto STEP_1;
|
||||
@ -3030,8 +3028,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
||||
|
||||
@ -3099,8 +3096,7 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
||||
}
|
||||
@ -3156,7 +3152,6 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
}
|
||||
|
||||
DONE:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
step_to_return_to = 0;
|
||||
ft_event_state = OPAL_CRS_RUNNING;
|
||||
|
||||
@ -6212,19 +6207,15 @@ static void clear_timers(void) {
|
||||
static void display_all_timers(int state) {
|
||||
bool report_ready = false;
|
||||
double barrier_start, barrier_stop;
|
||||
ompi_rte_collective_t coll;
|
||||
int i;
|
||||
|
||||
OBJ_CONSTRUCT(&coll, ompi_rte_collective_t);
|
||||
coll.id = ompi_process_info.peer_init_barrier;
|
||||
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
|
||||
if( 2 > timing_enabled ) {
|
||||
goto done;
|
||||
return;
|
||||
}
|
||||
else if( 2 == timing_enabled ) {
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
goto done;
|
||||
opal_pmix.fence(NULL, 0);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@ -6234,7 +6225,7 @@ static void display_all_timers(int state) {
|
||||
}
|
||||
}
|
||||
if( !report_ready ) {
|
||||
goto done;
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output(0, "crcp:bkmrk: timing(%20s): ******************** Begin: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
|
||||
@ -6244,8 +6235,7 @@ static void display_all_timers(int state) {
|
||||
|
||||
if( timing_enabled >= 2) {
|
||||
barrier_start = get_time();
|
||||
ompi_rte_barrier(&coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
barrier_stop = get_time();
|
||||
opal_output(0,
|
||||
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",
|
||||
@ -6256,8 +6246,6 @@ static void display_all_timers(int state) {
|
||||
|
||||
opal_output(0, "crcp:bkmrk: timing(%20s): ******************** End: [State = %12s]\n", "Summary", opal_crs_base_state_str(state));
|
||||
|
||||
done:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
}
|
||||
|
||||
static void display_indv_timer(int idx, int proc, int msgs) {
|
||||
|
@ -33,6 +33,7 @@
|
||||
#include "opal/util/show_help.h"
|
||||
#include "opal/mca/btl/btl.h"
|
||||
#include "opal/mca/btl/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
|
||||
#include "ompi/mca/pml/pml.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
@ -663,15 +664,11 @@ int mca_pml_bfo_ft_event( int state )
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
int ret, p;
|
||||
ompi_rte_collective_t *coll, *modex;
|
||||
|
||||
coll = OBJ_NEW(ompi_rte_collective_t);
|
||||
coll->id = ompi_process_info.peer_init_barrier;
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -682,8 +679,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
@ -785,8 +781,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
@ -796,17 +791,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -818,11 +803,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
return ret;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -835,8 +816,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -849,17 +829,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
|
||||
opal_output(0,
|
||||
"pml:bfo: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -871,11 +841,7 @@ int mca_pml_bfo_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -895,7 +861,6 @@ int mca_pml_bfo_ft_event( int state )
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
clean:
|
||||
OBJ_RELEASE(coll);
|
||||
return ret;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
@ -40,6 +40,7 @@
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/pml/base/base.h"
|
||||
#include "ompi/mca/bml/base/base.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "ompi/runtime/ompi_cr.h"
|
||||
|
||||
#include "pml_ob1.h"
|
||||
@ -792,15 +793,11 @@ int mca_pml_ob1_ft_event( int state )
|
||||
ompi_proc_t** procs = NULL;
|
||||
size_t num_procs;
|
||||
int ret, p;
|
||||
ompi_rte_collective_t *coll, *modex;
|
||||
|
||||
coll = OBJ_NEW(ompi_rte_collective_t);
|
||||
coll->id = ompi_process_info.peer_init_barrier;
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -811,8 +808,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
@ -914,28 +910,13 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
/*
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -947,11 +928,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -964,8 +941,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
ompi_rte_barrier(coll);
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -978,17 +954,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
modex = OBJ_NEW(ompi_rte_collective_t);
|
||||
modex->id = ompi_process_info.peer_modex;
|
||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
|
||||
opal_output(0,
|
||||
"pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
|
||||
ret);
|
||||
OBJ_RELEASE(modex);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(modex->active);
|
||||
OBJ_RELEASE(modex);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -1000,11 +966,7 @@ int mca_pml_ob1_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
if (OMPI_SUCCESS != (ret = ompi_rte_barrier(coll))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed in ompi_rte_barrier (%d)", ret);
|
||||
goto clean;
|
||||
}
|
||||
OMPI_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -1024,7 +986,6 @@ int mca_pml_ob1_ft_event( int state )
|
||||
ret = OMPI_SUCCESS;
|
||||
|
||||
clean:
|
||||
OBJ_RELEASE(coll);
|
||||
return ret;
|
||||
}
|
||||
#endif /* OPAL_ENABLE_FT_CR */
|
||||
|
37
orte/mca/ess/env/ess_env_module.c
поставляемый
37
orte/mca/ess/env/ess_env_module.c
поставляемый
@ -191,10 +191,6 @@ static int rte_ft_event(int state)
|
||||
{
|
||||
int ret, exit_status = ORTE_SUCCESS;
|
||||
orte_proc_type_t svtype;
|
||||
orte_grpcomm_collective_t coll;
|
||||
|
||||
OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
|
||||
coll.id = orte_process_info.snapc_init_barrier;
|
||||
|
||||
/******** Checkpoint Prep ********/
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
@ -263,14 +259,7 @@ static int rte_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "ess:env: ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
|
||||
state, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll.active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
@ -291,11 +280,6 @@ static int rte_ft_event(int state)
|
||||
* This should follow the ess init() function
|
||||
*/
|
||||
|
||||
/*
|
||||
* Clear nidmap and jmap
|
||||
*/
|
||||
orte_util_nidmap_finalize();
|
||||
|
||||
/*
|
||||
* - Reset Contact information
|
||||
*/
|
||||
@ -330,15 +314,6 @@ static int rte_ft_event(int state)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/*
|
||||
* Group Comm - Clean out stale data
|
||||
*/
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
/* RHC: you can't pass NULL as the identifier - what you'll need to do is
|
||||
* close all open dstore handles, and then open the ones you need
|
||||
*/
|
||||
@ -387,14 +362,7 @@ static int rte_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
|
||||
opal_output(0, "ess:env ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
|
||||
state, ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll.active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll.active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( orte_cr_flush_restart_files ) {
|
||||
OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output,
|
||||
@ -438,7 +406,6 @@ static int rte_ft_event(int state)
|
||||
}
|
||||
|
||||
cleanup:
|
||||
OBJ_DESTRUCT(&coll);
|
||||
return exit_status;
|
||||
}
|
||||
#endif
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include "opal/mca/crs/base/base.h"
|
||||
|
||||
#include "orte/util/name_fns.h"
|
||||
#include "opal/mca/pmix/pmix.h"
|
||||
#include "orte/mca/snapc/snapc.h"
|
||||
#include "orte/mca/snapc/base/base.h"
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
@ -110,7 +111,6 @@ int app_coord_init()
|
||||
orte_snapc_full_cmd_flag_t command = ORTE_SNAPC_FULL_REQUEST_OP_CMD;
|
||||
orte_snapc_base_request_op_event_t op_event = ORTE_SNAPC_OP_INIT;
|
||||
opal_buffer_t *buffer = NULL;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
|
||||
OPAL_OUTPUT_VERBOSE((20, mca_snapc_full_component.super.output_handle,
|
||||
"App) Initalized for Application %s\n",
|
||||
@ -154,16 +154,7 @@ int app_coord_init()
|
||||
"app) Startup Barrier..."));
|
||||
}
|
||||
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.snapc_init_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll->active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll->active);
|
||||
OBJ_RELEASE(coll);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
@ -217,7 +208,6 @@ int app_coord_finalize()
|
||||
orte_snapc_base_request_op_event_t op_event = ORTE_SNAPC_OP_FIN;
|
||||
opal_buffer_t *buffer = NULL;
|
||||
orte_std_cntr_t count;
|
||||
orte_grpcomm_collective_t *coll;
|
||||
orte_rml_recv_cb_t *rb = NULL;
|
||||
|
||||
/*
|
||||
@ -230,15 +220,7 @@ int app_coord_finalize()
|
||||
"app) Shutdown Barrier..."));
|
||||
}
|
||||
|
||||
coll = OBJ_NEW(orte_grpcomm_collective_t);
|
||||
coll->id = orte_process_info.snapc_init_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
coll->active = true;
|
||||
ORTE_WAIT_FOR_COMPLETION(coll->active);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
@ -309,13 +291,6 @@ int app_coord_finalize()
|
||||
"app) Shutdown Barrier: Waiting on barrier...!"));
|
||||
}
|
||||
|
||||
coll->id = orte_process_info.snapc_fini_barrier;
|
||||
if( ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll)) ) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
"app) Shutdown Barrier, Done!"));
|
||||
@ -332,8 +307,6 @@ int app_coord_finalize()
|
||||
rb = NULL;
|
||||
}
|
||||
|
||||
OBJ_RELEASE(coll);
|
||||
|
||||
/*
|
||||
* Cleanup named pipes
|
||||
*/
|
||||
|
@ -524,13 +524,6 @@ int global_coord_end_ckpt(orte_snapc_base_quiesce_t *datum)
|
||||
}
|
||||
#endif
|
||||
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
SNAPC_FULL_SET_TIMER(SNAPC_FULL_TIMER_ESTABLISH);
|
||||
if( ORTE_SUCCESS != (ret = orte_snapc_full_global_set_job_ckpt_info(current_global_jobid,
|
||||
ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL,
|
||||
|
@ -1763,11 +1763,6 @@ static void snapc_full_local_comm_read_event(int fd, short flags, void *arg)
|
||||
}
|
||||
#endif
|
||||
|
||||
orte_grpcomm.finalize();
|
||||
if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
goto cleanup;
|
||||
}
|
||||
flushed_modex = true;
|
||||
}
|
||||
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user