From 2f96f164166c797d81880f079b73b7ed231e4d62 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Tue, 17 Jun 2014 04:53:47 +0000 Subject: [PATCH] osc/rdma: ensure eager sends are active before checking for sync errors in self optimization This addresses an issue found with the MPICH pscw_ordering test. Eager sends were not yet active, which is OK for the standard path but not for the self optimization. Fixed by waiting for all post messages before checking the sync state. Fixes trac:4724 Tracking the 1.8.2 issue in this CMR. cmr=v1.8.2:reviewer=bbenton This commit was SVN r32012. The following Trac tickets were found above: Ticket 4724 --> https://svn.open-mpi.org/trac/ompi/ticket/4724 --- ompi/mca/osc/rdma/osc_rdma_comm.c | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index cc81cd2b88..62319bed3c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -82,6 +82,15 @@ static inline int ompi_osc_rdma_put_self (void *source, int source_count, ompi_d ((unsigned long) target_disp * module->disp_unit); int ret; + /* if we are in active target mode wait until all post messages arrive */ + if (module->sc_group && !module->active_eager_send_active) { + OPAL_THREAD_LOCK(&module->lock); + while (0 != module->num_post_msgs) { + opal_condition_wait(&module->cond, &module->lock); + } + OPAL_THREAD_UNLOCK(&module->lock); + } + if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } @@ -107,6 +116,15 @@ static inline int ompi_osc_rdma_get_self (void *target, int target_count, ompi_d ((unsigned long) source_disp * module->disp_unit); int ret; + /* if we are in active target mode wait until all post messages arrive */ + if (module->sc_group && !module->active_eager_send_active) { + OPAL_THREAD_LOCK(&module->lock); + while (0 != module->num_post_msgs) { + opal_condition_wait(&module->cond, 
&module->lock); + } + OPAL_THREAD_UNLOCK(&module->lock); + } + if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } @@ -130,6 +148,15 @@ static inline int ompi_osc_rdma_cas_self (void *source, void *compare, void *res void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); + /* if we are in active target mode wait until all post messages arrive */ + if (module->sc_group && !module->active_eager_send_active) { + OPAL_THREAD_LOCK(&module->lock); + while (0 != module->num_post_msgs) { + opal_condition_wait(&module->cond, &module->lock); + } + OPAL_THREAD_UNLOCK(&module->lock); + } + if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } @@ -155,6 +182,15 @@ static inline int ompi_osc_rdma_acc_self (void *source, int source_count, ompi_d ((unsigned long) target_disp * module->disp_unit); int ret; + /* if we are in active target mode wait until all post messages arrive */ + if (module->sc_group && !module->active_eager_send_active) { + OPAL_THREAD_LOCK(&module->lock); + while (0 != module->num_post_msgs) { + opal_condition_wait(&module->cond, &module->lock); + } + OPAL_THREAD_UNLOCK(&module->lock); + } + if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } @@ -191,6 +227,15 @@ static inline int ompi_osc_rdma_gacc_self (void *source, int source_count, ompi_ ((unsigned long) target_disp * module->disp_unit); int ret; + /* if we are in active target mode wait until all post messages arrive */ + if (module->sc_group && !module->active_eager_send_active) { + OPAL_THREAD_LOCK(&module->lock); + while (0 != module->num_post_msgs) { + opal_condition_wait(&module->cond, &module->lock); + } + OPAL_THREAD_UNLOCK(&module->lock); + } + if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; }