Merge pull request #5555 from karasevb/v4.0.x_pmix_fence_status
v4.0.x/pmix: added check for pmix fence status
Этот коммит содержится в:
Коммит
4c8852c2c8
@ -589,7 +589,11 @@ int ompi_dpm_disconnect(ompi_communicator_t *comm)
|
|||||||
|
|
||||||
/* ensure we tell the host RM to disconnect us - this
|
/* ensure we tell the host RM to disconnect us - this
|
||||||
* is a blocking operation so just use a fence */
|
* is a blocking operation so just use a fence */
|
||||||
ret = opal_pmix.fence(&coll, false);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(&coll, false))) {
|
||||||
|
OMPI_ERROR_LOG(ret);
|
||||||
|
OPAL_LIST_DESTRUCT(&coll);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
OPAL_LIST_DESTRUCT(&coll);
|
OPAL_LIST_DESTRUCT(&coll);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -155,7 +155,10 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
* Barrier to make all processes have been successfully restarted before
|
* Barrier to make all processes have been successfully restarted before
|
||||||
* we try to remove some restart only files.
|
* we try to remove some restart only files.
|
||||||
*/
|
*/
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-open the BTL framework to get the full list of components.
|
* Re-open the BTL framework to get the full list of components.
|
||||||
@ -224,7 +227,10 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
* Barrier to make all processes have been successfully restarted before
|
* Barrier to make all processes have been successfully restarted before
|
||||||
* we try to remove some restart only files.
|
* we try to remove some restart only files.
|
||||||
*/
|
*/
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-open the BTL framework to get the full list of components.
|
* Re-open the BTL framework to get the full list of components.
|
||||||
|
@ -3028,7 +3028,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
|||||||
|
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
exit_status = ret;
|
||||||
|
goto DONE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
||||||
|
|
||||||
@ -3096,7 +3099,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
|||||||
|
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
exit_status = ret;
|
||||||
|
goto DONE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
||||||
}
|
}
|
||||||
@ -6207,14 +6213,16 @@ static void clear_timers(void) {
|
|||||||
static void display_all_timers(int state) {
|
static void display_all_timers(int state) {
|
||||||
bool report_ready = false;
|
bool report_ready = false;
|
||||||
double barrier_start, barrier_stop;
|
double barrier_start, barrier_stop;
|
||||||
int i;
|
int i, ret;
|
||||||
|
|
||||||
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
|
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
|
||||||
if( 2 > timing_enabled ) {
|
if( 2 > timing_enabled ) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
else if( 2 == timing_enabled ) {
|
else if( 2 == timing_enabled ) {
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
OPAL_ERROR_LOG(ret);
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -6235,7 +6243,9 @@ static void display_all_timers(int state) {
|
|||||||
|
|
||||||
if( timing_enabled >= 2) {
|
if( timing_enabled >= 2) {
|
||||||
barrier_start = get_time();
|
barrier_start = get_time();
|
||||||
opal_pmix.fence(NULL, 0);
|
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
OPAL_ERROR_LOG(ret);
|
||||||
|
}
|
||||||
barrier_stop = get_time();
|
barrier_stop = get_time();
|
||||||
opal_output(0,
|
opal_output(0,
|
||||||
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",
|
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",
|
||||||
|
@ -666,7 +666,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
if(OPAL_CRS_CHECKPOINT == state) {
|
if(OPAL_CRS_CHECKPOINT == state) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||||
@ -677,7 +680,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||||
}
|
}
|
||||||
@ -777,7 +783,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||||
}
|
}
|
||||||
@ -787,7 +796,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
* Exchange the modex information once again.
|
* Exchange the modex information once again.
|
||||||
* BTLs will have republished their modex information.
|
* BTLs will have republished their modex information.
|
||||||
*/
|
*/
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Startup the PML stack now that the modex is running again
|
* Startup the PML stack now that the modex is running again
|
||||||
@ -799,7 +811,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Is this barrier necessary ? JJH */
|
/* Is this barrier necessary ? JJH */
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != procs ) {
|
if( NULL != procs ) {
|
||||||
for(p = 0; p < (int)num_procs; ++p) {
|
for(p = 0; p < (int)num_procs; ++p) {
|
||||||
@ -812,7 +827,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||||
}
|
}
|
||||||
@ -825,7 +843,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
* Exchange the modex information once again.
|
* Exchange the modex information once again.
|
||||||
* BTLs will have republished their modex information.
|
* BTLs will have republished their modex information.
|
||||||
*/
|
*/
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Startup the PML stack now that the modex is running again
|
* Startup the PML stack now that the modex is running again
|
||||||
@ -837,7 +858,10 @@ int mca_pml_bfo_ft_event( int state )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Is this barrier necessary ? JJH */
|
/* Is this barrier necessary ? JJH */
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != procs ) {
|
if( NULL != procs ) {
|
||||||
for(p = 0; p < (int)num_procs; ++p) {
|
for(p = 0; p < (int)num_procs; ++p) {
|
||||||
|
@ -807,7 +807,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
if(OPAL_CRS_CHECKPOINT == state) {
|
if(OPAL_CRS_CHECKPOINT == state) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||||
@ -818,7 +821,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||||
}
|
}
|
||||||
@ -918,13 +924,19 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Startup the PML stack now that the modex is running again
|
* Startup the PML stack now that the modex is running again
|
||||||
@ -936,7 +948,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Is this barrier necessary ? JJH */
|
/* Is this barrier necessary ? JJH */
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != procs ) {
|
if( NULL != procs ) {
|
||||||
for(p = 0; p < (int)num_procs; ++p) {
|
for(p = 0; p < (int)num_procs; ++p) {
|
||||||
@ -949,7 +964,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
if( !first_continue_pass ) {
|
if( !first_continue_pass ) {
|
||||||
if( opal_cr_timing_barrier_enabled ) {
|
if( opal_cr_timing_barrier_enabled ) {
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||||
}
|
}
|
||||||
@ -962,7 +980,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
* Exchange the modex information once again.
|
* Exchange the modex information once again.
|
||||||
* BTLs will have republished their modex information.
|
* BTLs will have republished their modex information.
|
||||||
*/
|
*/
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Startup the PML stack now that the modex is running again
|
* Startup the PML stack now that the modex is running again
|
||||||
@ -974,7 +995,10 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Is this barrier necessary ? JJH */
|
/* Is this barrier necessary ? JJH */
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
if( NULL != procs ) {
|
if( NULL != procs ) {
|
||||||
for(p = 0; p < (int)num_procs; ++p) {
|
for(p = 0; p < (int)num_procs; ++p) {
|
||||||
|
@ -389,6 +389,7 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
|||||||
void *dreq, **dreqs;
|
void *dreq, **dreqs;
|
||||||
ucp_ep_h ep;
|
ucp_ep_h ep;
|
||||||
size_t i;
|
size_t i;
|
||||||
|
int ret;
|
||||||
|
|
||||||
max_reqs = ompi_pml_ucx.num_disconnect;
|
max_reqs = ompi_pml_ucx.num_disconnect;
|
||||||
if (max_reqs > nprocs) {
|
if (max_reqs > nprocs) {
|
||||||
@ -433,7 +434,10 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
|||||||
mca_pml_ucx_waitall(dreqs, &num_reqs);
|
mca_pml_ucx_waitall(dreqs, &num_reqs);
|
||||||
free(dreqs);
|
free(dreqs);
|
||||||
|
|
||||||
opal_common_ucx_mca_pmix_fence(ompi_pml_ucx.ucp_worker);
|
if (OMPI_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence(
|
||||||
|
ompi_pml_ucx.ucp_worker))) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -265,6 +265,7 @@ int mca_pml_yalla_add_procs(struct ompi_proc_t **procs, size_t nprocs)
|
|||||||
int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
|
int ret;
|
||||||
|
|
||||||
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||||
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
|
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
|
||||||
@ -276,7 +277,9 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
|||||||
PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name));
|
PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name));
|
||||||
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
|
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
|
||||||
}
|
}
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
return OMPI_SUCCESS;
|
return OMPI_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -257,7 +257,13 @@ int ompi_mpi_finalize(void)
|
|||||||
* communications/actions to complete. See
|
* communications/actions to complete. See
|
||||||
* https://github.com/open-mpi/ompi/issues/1576 for the
|
* https://github.com/open-mpi/ompi/issues/1576 for the
|
||||||
* original bug report. */
|
* original bug report. */
|
||||||
opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
|
||||||
|
(void*)&active))) {
|
||||||
|
OMPI_ERROR_LOG(ret);
|
||||||
|
/* Reset the active flag to false, to avoid waiting for
|
||||||
|
* completion when the fence was failed. */
|
||||||
|
active = false;
|
||||||
|
}
|
||||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||||
} else {
|
} else {
|
||||||
/* However, we cannot guarantee that the provided PMIx has
|
/* However, we cannot guarantee that the provided PMIx has
|
||||||
@ -268,7 +274,9 @@ int ompi_mpi_finalize(void)
|
|||||||
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
|
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
|
||||||
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
|
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
|
||||||
|
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
OMPI_ERROR_LOG(ret);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -662,9 +662,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
|||||||
#if (OPAL_ENABLE_TIMING)
|
#if (OPAL_ENABLE_TIMING)
|
||||||
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
|
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
|
||||||
opal_pmix_collect_all_data) {
|
opal_pmix_collect_all_data) {
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
error = "timing: pmix-barrier-1 failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
OMPI_TIMING_NEXT("pmix-barrier-1");
|
OMPI_TIMING_NEXT("pmix-barrier-1");
|
||||||
opal_pmix.fence(NULL, 0);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
error = "timing: pmix-barrier-2 failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
OMPI_TIMING_NEXT("pmix-barrier-2");
|
OMPI_TIMING_NEXT("pmix-barrier-2");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -687,19 +693,32 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
|||||||
background_fence = true;
|
background_fence = true;
|
||||||
active = true;
|
active = true;
|
||||||
OPAL_POST_OBJECT(&active);
|
OPAL_POST_OBJECT(&active);
|
||||||
opal_pmix.fence_nb(NULL, true, fence_release, (void*)&active);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, true,
|
||||||
|
fence_release,
|
||||||
|
(void*)&active))) {
|
||||||
|
error = "opal_pmix.fence_nb() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
} else if (!opal_pmix_base_async_modex) {
|
} else if (!opal_pmix_base_async_modex) {
|
||||||
/* we want to do the modex */
|
/* we want to do the modex */
|
||||||
active = true;
|
active = true;
|
||||||
OPAL_POST_OBJECT(&active);
|
OPAL_POST_OBJECT(&active);
|
||||||
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL,
|
||||||
fence_release, (void*)&active);
|
opal_pmix_collect_all_data, fence_release, (void*)&active))) {
|
||||||
|
error = "opal_pmix.fence_nb() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
/* cannot just wait on thread as we need to call opal_progress */
|
/* cannot just wait on thread as we need to call opal_progress */
|
||||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||||
}
|
}
|
||||||
/* otherwise, we don't want to do the modex, so fall thru */
|
/* otherwise, we don't want to do the modex, so fall thru */
|
||||||
} else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) {
|
} else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) {
|
||||||
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
|
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL,
|
||||||
|
opal_pmix_collect_all_data))) {
|
||||||
|
error = "opal_pmix.fence() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
OMPI_TIMING_NEXT("modex");
|
OMPI_TIMING_NEXT("modex");
|
||||||
@ -877,11 +896,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
|||||||
if (NULL != opal_pmix.fence_nb) {
|
if (NULL != opal_pmix.fence_nb) {
|
||||||
active = true;
|
active = true;
|
||||||
OPAL_POST_OBJECT(&active);
|
OPAL_POST_OBJECT(&active);
|
||||||
opal_pmix.fence_nb(NULL, false,
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false,
|
||||||
fence_release, (void*)&active);
|
fence_release, (void*)&active))) {
|
||||||
|
error = "opal_pmix.fence_nb() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||||
} else {
|
} else {
|
||||||
opal_pmix.fence(NULL, false);
|
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) {
|
||||||
|
error = "opal_pmix.fence() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,13 +97,19 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced)
|
|||||||
*(int*)fenced = 1;
|
*(int*)fenced = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
||||||
{
|
{
|
||||||
volatile int fenced = 0;
|
volatile int fenced = 0;
|
||||||
|
int ret = OPAL_SUCCESS;
|
||||||
|
|
||||||
|
if (OPAL_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0,
|
||||||
|
opal_common_ucx_mca_fence_complete_cb, (void*)&fenced))){
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
opal_pmix.fence_nb(NULL, 0, opal_common_ucx_mca_fence_complete_cb, (void*)&fenced);
|
|
||||||
while (!fenced) {
|
while (!fenced) {
|
||||||
ucp_worker_progress(worker);
|
ucp_worker_progress(worker);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
@ -65,7 +65,7 @@ extern opal_common_ucx_module_t opal_common_ucx;
|
|||||||
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
||||||
OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker,
|
int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker,
|
||||||
|
@ -458,7 +458,10 @@ static int rte_init(void)
|
|||||||
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
|
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
|
||||||
/* need to commit the data before we fence */
|
/* need to commit the data before we fence */
|
||||||
opal_pmix.commit();
|
opal_pmix.commit();
|
||||||
opal_pmix.fence(NULL, 0);
|
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
error = "opal_pmix.fence() failed";
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
OPAL_TIMING_ENV_NEXT(rte_init, "rte_init_done");
|
OPAL_TIMING_ENV_NEXT(rte_init, "rte_init_done");
|
||||||
|
|
||||||
|
@ -150,7 +150,11 @@ int app_coord_init()
|
|||||||
"app) Startup Barrier..."));
|
"app) Startup Barrier..."));
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_pmix.fence(NULL, 0);
|
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||||
@ -216,7 +220,11 @@ int app_coord_finalize()
|
|||||||
"app) Shutdown Barrier..."));
|
"app) Shutdown Barrier..."));
|
||||||
}
|
}
|
||||||
|
|
||||||
opal_pmix.fence(NULL, 0);
|
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||||
|
ORTE_ERROR_LOG(ret);
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||||
|
@ -125,6 +125,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
void *dreq, **dreqs;
|
void *dreq, **dreqs;
|
||||||
ucp_ep_h ep;
|
ucp_ep_h ep;
|
||||||
size_t i, n;
|
size_t i, n;
|
||||||
|
int ret;
|
||||||
|
|
||||||
oshmem_shmem_barrier();
|
oshmem_shmem_barrier();
|
||||||
|
|
||||||
@ -175,7 +176,10 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
|||||||
free(dreqs);
|
free(dreqs);
|
||||||
free(mca_spml_ucx.remote_addrs_tbl);
|
free(mca_spml_ucx.remote_addrs_tbl);
|
||||||
|
|
||||||
opal_common_ucx_mca_pmix_fence(mca_spml_ucx_ctx_default.ucp_worker);
|
if (OSHMEM_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence(
|
||||||
|
mca_spml_ucx_ctx_default.ucp_worker))) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
free(mca_spml_ucx_ctx_default.ucp_peers);
|
free(mca_spml_ucx_ctx_default.ucp_peers);
|
||||||
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
||||||
return OSHMEM_SUCCESS;
|
return OSHMEM_SUCCESS;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user