diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
index 53fdeb889d..31c3fc29be 100644
--- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c
+++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
@@ -892,7 +892,7 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
     /* if the datatype is small enough (and the count is 1) then try to directly use the hardware to execute
      * the atomic operation. this should be safe in all cases as either 1) the user has assured us they will
      * never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have an exclusive lock */
-    if (origin_extent <= 8 && 1 == origin_count) {
+    if (origin_extent <= 8 && 1 == origin_count && !ompi_osc_rdma_peer_local_base (peer)) {
         if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) {
             if (NULL == result_addr) {
                 ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address,
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index 522f953a2f..bf6c1a84bb 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -759,9 +759,14 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
             ex_peer->size = temp[i].size;
         }
 
-        if (module->use_cpu_atomics && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
+        if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) {
             /* base is local and cpu atomics are available */
-            ex_peer->super.base = (uintptr_t) module->segment_base + offset;
+            if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) {
+                ex_peer->super.base = (uintptr_t) module->segment_base + offset;
+            } else {
+                ex_peer->super.base = (uintptr_t) *base;
+            }
+            peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE;
 
             offset += temp[i].size;
         } else {
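
Note (not part of the patch): the new guard in osc_rdma_accumulate.c keys off the OMPI_OSC_RDMA_PEER_LOCAL_BASE flag that the component-side hunk now also sets for a process's own rank, so that accumulate operations on a locally addressable base skip the network AMO fast path and take the CPU/locked path instead. A minimal sketch of the helper the guard calls, assuming the usual inline accessor shape in osc_rdma_peer.h:

    /* sketch only: presumed shape of the accessor used by the new guard above.
     * It tests the flag that allocate_state_shared() sets once the peer's
     * window base is directly addressable in this process. */
    static inline bool ompi_osc_rdma_peer_local_base (ompi_osc_rdma_peer_t *peer)
    {
        return !!(peer->flags & OMPI_OSC_RDMA_PEER_LOCAL_BASE);
    }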