pmix: added check for pmix fence status
Signed-off-by: Boris Karasev <karasev.b@gmail.com>
(cherry picked from commit 57683366ca)
Conflicts:
opal/mca/common/ucx/common_ucx.c
opal/mca/common/ucx/common_ucx.h
Modified:
ompi/mca/pml/ucx/pml_ucx.c
oshmem/mca/spml/ucx/spml_ucx.c
Этот коммит содержится в:
родитель
8483eb4bf7
Коммит
8873d901e8
@ -589,7 +589,11 @@ int ompi_dpm_disconnect(ompi_communicator_t *comm)
|
||||
|
||||
/* ensure we tell the host RM to disconnect us - this
|
||||
* is a blocking operation so just use a fence */
|
||||
ret = opal_pmix.fence(&coll, false);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(&coll, false))) {
|
||||
OMPI_ERROR_LOG(ret);
|
||||
OPAL_LIST_DESTRUCT(&coll);
|
||||
return ret;
|
||||
}
|
||||
OPAL_LIST_DESTRUCT(&coll);
|
||||
|
||||
return ret;
|
||||
|
@ -155,7 +155,10 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
@ -224,7 +227,10 @@ int mca_bml_r2_ft_event(int state)
|
||||
* Barrier to make all processes have been successfully restarted before
|
||||
* we try to remove some restart only files.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Re-open the BTL framework to get the full list of components.
|
||||
|
@ -3028,7 +3028,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR0);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
exit_status = ret;
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP0);
|
||||
|
||||
@ -3096,7 +3099,10 @@ ompi_crcp_base_pml_state_t* ompi_crcp_bkmrk_pml_ft_event(
|
||||
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR1);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
exit_status = ret;
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CORE2);
|
||||
}
|
||||
@ -6207,14 +6213,16 @@ static void clear_timers(void) {
|
||||
static void display_all_timers(int state) {
|
||||
bool report_ready = false;
|
||||
double barrier_start, barrier_stop;
|
||||
int i;
|
||||
int i, ret;
|
||||
|
||||
if( 0 != OMPI_PROC_MY_NAME->vpid ) {
|
||||
if( 2 > timing_enabled ) {
|
||||
return;
|
||||
}
|
||||
else if( 2 == timing_enabled ) {
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
@ -6235,7 +6243,9 @@ static void display_all_timers(int state) {
|
||||
|
||||
if( timing_enabled >= 2) {
|
||||
barrier_start = get_time();
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if( OPAL_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
OPAL_ERROR_LOG(ret);
|
||||
}
|
||||
barrier_stop = get_time();
|
||||
opal_output(0,
|
||||
"crcp:bkmrk: timing(%20s): %20s = %10.2f s\n",
|
||||
|
@ -666,7 +666,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -677,7 +680,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
@ -777,7 +783,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
@ -787,7 +796,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -799,7 +811,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -812,7 +827,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -825,7 +843,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -837,7 +858,10 @@ int mca_pml_bfo_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:bfo: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
|
@ -807,7 +807,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if(OPAL_CRS_CHECKPOINT == state) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
|
||||
@ -818,7 +821,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
|
||||
}
|
||||
@ -918,13 +924,19 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
|
||||
}
|
||||
|
||||
if (opal_cr_continue_like_restart && !first_continue_pass) {
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -936,7 +948,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
@ -949,7 +964,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
if( !first_continue_pass ) {
|
||||
if( opal_cr_timing_barrier_enabled ) {
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
|
||||
}
|
||||
@ -962,7 +980,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
* Exchange the modex information once again.
|
||||
* BTLs will have republished their modex information.
|
||||
*/
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Startup the PML stack now that the modex is running again
|
||||
@ -974,7 +995,10 @@ int mca_pml_ob1_ft_event( int state )
|
||||
}
|
||||
|
||||
/* Is this barrier necessary ? JJH */
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
opal_output(0, "pml:ob1: ft_event(Restart): Failed to fence complete");
|
||||
return ret;
|
||||
}
|
||||
|
||||
if( NULL != procs ) {
|
||||
for(p = 0; p < (int)num_procs; ++p) {
|
||||
|
@ -389,6 +389,7 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
void *dreq, **dreqs;
|
||||
ucp_ep_h ep;
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
max_reqs = ompi_pml_ucx.num_disconnect;
|
||||
if (max_reqs > nprocs) {
|
||||
@ -433,7 +434,10 @@ int mca_pml_ucx_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
mca_pml_ucx_waitall(dreqs, &num_reqs);
|
||||
free(dreqs);
|
||||
|
||||
opal_common_ucx_mca_pmix_fence(ompi_pml_ucx.ucp_worker);
|
||||
if (OMPI_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence(
|
||||
ompi_pml_ucx.ucp_worker))) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
@ -265,6 +265,7 @@ int mca_pml_yalla_add_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
{
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
if (ompi_mpi_state >= OMPI_MPI_STATE_FINALIZE_STARTED) {
|
||||
PML_YALLA_VERBOSE(3, "%s", "using bulk powerdown");
|
||||
@ -276,7 +277,9 @@ int mca_pml_yalla_del_procs(struct ompi_proc_t **procs, size_t nprocs)
|
||||
PML_YALLA_VERBOSE(2, "disconnected from rank %s", OPAL_NAME_PRINT(procs[i]->super.proc_name));
|
||||
procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_PML] = NULL;
|
||||
}
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
return ret;
|
||||
}
|
||||
return OMPI_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -257,7 +257,13 @@ int ompi_mpi_finalize(void)
|
||||
* communications/actions to complete. See
|
||||
* https://github.com/open-mpi/ompi/issues/1576 for the
|
||||
* original bug report. */
|
||||
opal_pmix.fence_nb(NULL, 0, fence_cbfunc, (void*)&active);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0, fence_cbfunc,
|
||||
(void*)&active))) {
|
||||
OMPI_ERROR_LOG(ret);
|
||||
/* Reset the active flag to false, to avoid waiting for
|
||||
* completion when the fence was failed. */
|
||||
active = false;
|
||||
}
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
} else {
|
||||
/* However, we cannot guarantee that the provided PMIx has
|
||||
@ -268,7 +274,9 @@ int ompi_mpi_finalize(void)
|
||||
ompi_communicator_t *comm = &ompi_mpi_comm_world.comm;
|
||||
comm->c_coll->coll_barrier(comm, comm->c_coll->coll_barrier_module);
|
||||
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
OMPI_ERROR_LOG(ret);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -662,9 +662,15 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||
#if (OPAL_ENABLE_TIMING)
|
||||
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
|
||||
opal_pmix_collect_all_data) {
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
error = "timing: pmix-barrier-1 failed";
|
||||
goto error;
|
||||
}
|
||||
OMPI_TIMING_NEXT("pmix-barrier-1");
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
error = "timing: pmix-barrier-2 failed";
|
||||
goto error;
|
||||
}
|
||||
OMPI_TIMING_NEXT("pmix-barrier-2");
|
||||
}
|
||||
#endif
|
||||
@ -687,19 +693,32 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||
background_fence = true;
|
||||
active = true;
|
||||
OPAL_POST_OBJECT(&active);
|
||||
opal_pmix.fence_nb(NULL, true, fence_release, (void*)&active);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, true,
|
||||
fence_release,
|
||||
(void*)&active))) {
|
||||
error = "opal_pmix.fence_nb() failed";
|
||||
goto error;
|
||||
}
|
||||
|
||||
} else if (!opal_pmix_base_async_modex) {
|
||||
/* we want to do the modex */
|
||||
active = true;
|
||||
OPAL_POST_OBJECT(&active);
|
||||
opal_pmix.fence_nb(NULL, opal_pmix_collect_all_data,
|
||||
fence_release, (void*)&active);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL,
|
||||
opal_pmix_collect_all_data, fence_release, (void*)&active))) {
|
||||
error = "opal_pmix.fence_nb() failed";
|
||||
goto error;
|
||||
}
|
||||
/* cannot just wait on thread as we need to call opal_progress */
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
}
|
||||
/* otherwise, we don't want to do the modex, so fall thru */
|
||||
} else if (!opal_pmix_base_async_modex || opal_pmix_collect_all_data) {
|
||||
opal_pmix.fence(NULL, opal_pmix_collect_all_data);
|
||||
if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL,
|
||||
opal_pmix_collect_all_data))) {
|
||||
error = "opal_pmix.fence() failed";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
OMPI_TIMING_NEXT("modex");
|
||||
@ -877,11 +896,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
|
||||
if (NULL != opal_pmix.fence_nb) {
|
||||
active = true;
|
||||
OPAL_POST_OBJECT(&active);
|
||||
opal_pmix.fence_nb(NULL, false,
|
||||
fence_release, (void*)&active);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence_nb(NULL, false,
|
||||
fence_release, (void*)&active))) {
|
||||
error = "opal_pmix.fence_nb() failed";
|
||||
goto error;
|
||||
}
|
||||
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
|
||||
} else {
|
||||
opal_pmix.fence(NULL, false);
|
||||
if (OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, false))) {
|
||||
error = "opal_pmix.fence() failed";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -97,13 +97,19 @@ static void opal_common_ucx_mca_fence_complete_cb(int status, void *fenced)
|
||||
*(int*)fenced = 1;
|
||||
}
|
||||
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker)
|
||||
{
|
||||
volatile int fenced = 0;
|
||||
int ret = OPAL_SUCCESS;
|
||||
|
||||
if (OPAL_SUCCESS != (ret = opal_pmix.fence_nb(NULL, 0,
|
||||
opal_common_ucx_mca_fence_complete_cb, (void*)&fenced))){
|
||||
return ret;
|
||||
}
|
||||
|
||||
opal_pmix.fence_nb(NULL, 0, opal_common_ucx_mca_fence_complete_cb, (void*)&fenced);
|
||||
while (!fenced) {
|
||||
ucp_worker_progress(worker);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ extern opal_common_ucx_module_t opal_common_ucx;
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_register(void);
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_deregister(void);
|
||||
OPAL_DECLSPEC void opal_common_ucx_empty_complete_cb(void *request, ucs_status_t status);
|
||||
OPAL_DECLSPEC void opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||
OPAL_DECLSPEC int opal_common_ucx_mca_pmix_fence(ucp_worker_h worker);
|
||||
|
||||
static inline
|
||||
int opal_common_ucx_wait_request(ucs_status_ptr_t request, ucp_worker_h worker,
|
||||
|
@ -458,7 +458,10 @@ static int rte_init(void)
|
||||
if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
|
||||
/* need to commit the data before we fence */
|
||||
opal_pmix.commit();
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
error = "opal_pmix.fence() failed";
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
OPAL_TIMING_ENV_NEXT(rte_init, "rte_init_done");
|
||||
|
||||
|
@ -150,7 +150,11 @@ int app_coord_init()
|
||||
"app) Startup Barrier..."));
|
||||
}
|
||||
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
@ -216,7 +220,11 @@ int app_coord_finalize()
|
||||
"app) Shutdown Barrier..."));
|
||||
}
|
||||
|
||||
opal_pmix.fence(NULL, 0);
|
||||
if (ORTE_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
|
||||
ORTE_ERROR_LOG(ret);
|
||||
exit_status = ret;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if( 0 == ORTE_PROC_MY_NAME->vpid ) {
|
||||
OPAL_OUTPUT_VERBOSE((3, mca_snapc_full_component.super.output_handle,
|
||||
|
@ -125,6 +125,7 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
void *dreq, **dreqs;
|
||||
ucp_ep_h ep;
|
||||
size_t i, n;
|
||||
int ret;
|
||||
|
||||
oshmem_shmem_barrier();
|
||||
|
||||
@ -175,7 +176,10 @@ int mca_spml_ucx_del_procs(ompi_proc_t** procs, size_t nprocs)
|
||||
free(dreqs);
|
||||
free(mca_spml_ucx.remote_addrs_tbl);
|
||||
|
||||
opal_common_ucx_mca_pmix_fence(mca_spml_ucx_ctx_default.ucp_worker);
|
||||
if (OSHMEM_SUCCESS != (ret = opal_common_ucx_mca_pmix_fence(
|
||||
mca_spml_ucx_ctx_default.ucp_worker))) {
|
||||
return ret;
|
||||
}
|
||||
free(mca_spml_ucx_ctx_default.ucp_peers);
|
||||
mca_spml_ucx_ctx_default.ucp_peers = NULL;
|
||||
return OSHMEM_SUCCESS;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user