diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index e42dd76e28..8facf63439 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -46,6 +46,7 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/align.h" +#include "opal/util/sys_limits.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ @@ -550,6 +551,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ompi_osc_rdma_region_t *state_region; struct _local_data *temp; char *data_file; + int page_size = opal_getpagesize(); shared_comm = module->shared_comm; @@ -574,6 +576,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->state_offset = state_base = local_rank_array_size + module->region_size; data_base = state_base + leader_peer_data_size + module->state_size * local_size; + /* ensure proper alignment */ + if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { + data_base += OPAL_ALIGN_PAD_AMOUNT(data_base, page_size); + size += OPAL_ALIGN_PAD_AMOUNT(size, page_size); + } + do { temp = calloc (local_size, sizeof (temp[0])); if (NULL == temp) { @@ -642,7 +650,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (size && MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { - *base = (void *)((intptr_t) module->segment_base + my_base_offset); + char *baseptr = (char *)((intptr_t) module->segment_base + my_base_offset); + *base = (void *)baseptr; + // touch each page to force allocation on local NUMA node + for (size_t i = 0; i < size; i += page_size) { + baseptr[i] = 0; + } } module->rank_array = (ompi_osc_rdma_rank_data_t *) module->segment_base;