From 7bda3eb2dc80e3611e02830dd24b4fde347e13a3 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Thu, 28 Apr 2016 19:11:11 -0600 Subject: [PATCH] osc/rdma: fix global index array calculation This commit fixes a bug that occurs when ranks are either not mapped evenly or by something other than core. Fixes open-mpi/ompi#1599 Signed-off-by: Nathan Hjelm --- ompi/mca/osc/rdma/osc_rdma_component.c | 6 +++++- ompi/mca/osc/rdma/osc_rdma_peer.c | 23 ++++++++++++++--------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 1f5913289f..2fa786b477 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -845,6 +845,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) module->region_size); my_data->base = (uint64_t) (intptr_t) module->rank_array; + /* store my rank in the length field */ + my_data->len = (osc_rdma_size_t) my_rank; if (module->selected_btl->btl_register_mem) { memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size); @@ -861,9 +863,11 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) } } + int base_rank = ompi_comm_rank (module->local_leaders) * ((comm_size + module->node_count - 1) / module->node_count); + /* fill in the local part of the rank -> node map */ for (int i = 0 ; i < RANK_ARRAY_COUNT(module) ; ++i) { - int save_rank = my_rank + i; + int save_rank = base_rank + i; if (save_rank >= comm_size) { break; } diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index f7a7bde770..c367fac7c2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2007-2015 Los Alamos National Security, LLC. All rights + * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -19,7 +19,7 @@ #include "ompi/mca/bml/base/base.h" -#define NODE_ID_TO_RANK(module, node_id) ((node_id) * ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)) +#define NODE_ID_TO_RANK(module, peer_data, node_id) ((int)(peer_data)->len) /** * @brief find the btl endpoint for a process @@ -99,7 +99,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd ompi_osc_rdma_rank_data_t rank_data; int registration_handle_size = 0; int node_id, node_rank, array_index; - int ret, disp_unit; + int ret, disp_unit, comm_size; char *peer_data; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank); @@ -108,13 +108,18 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd registration_handle_size = module->selected_btl->btl_registration_handle_size; } + comm_size = ompi_comm_size (module->comm); + /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code * calculates the node and offset the mapping can be found. once the mapping has been read the state * part of the peer structure can be initialized. */ - node_id = (peer->rank * module->node_count) / ompi_comm_size (module->comm); - node_rank = NODE_ID_TO_RANK(module, node_id); - array_index = peer->rank - node_rank; + node_id = (peer->rank * module->node_count) / comm_size; array_peer_data = (ompi_osc_rdma_region_t *) ((intptr_t) module->node_comm_info + node_id * module->region_size); + + /* the node leader rank is stored in the length field */ + node_rank = NODE_ID_TO_RANK(module, array_peer_data, node_id); + array_index = peer->rank % ((comm_size + module->node_count - 1) / module->node_count); + array_pointer = array_peer_data->base + array_index * sizeof (rank_data); /* lookup the btl endpoint needed to retrieve the mapping */ @@ -123,8 +128,8 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd return OMPI_ERR_UNREACH; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data from rank: %d pointer: 0x%" PRIx64 - ", size: %lu", node_rank, array_pointer, sizeof (rank_data)); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "reading region data for %d from rank: %d, index: %d, pointer: 0x%" PRIx64 + ", size: %lu", peer->rank, node_rank, array_index, array_pointer, sizeof (rank_data)); ret = ompi_osc_get_data_blocking (module, array_endpoint, array_pointer, (mca_btl_base_registration_handle_t *) array_peer_data->btl_handle_data, &rank_data, sizeof (rank_data)); @@ -143,7 +148,7 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd peer->state_handle = (mca_btl_base_registration_handle_t *) node_peer_data->btl_handle_data; } - peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, rank_data.node_id)); + peer->state_endpoint = ompi_osc_rdma_peer_btl_endpoint (module, NODE_ID_TO_RANK(module, node_peer_data, rank_data.node_id)); if (OPAL_UNLIKELY(NULL == peer->state_endpoint)) { return OPAL_ERR_UNREACH; }