From 3d08d790e91f9e6ff28f70432f7c03029f55475e Mon Sep 17 00:00:00 2001
From: Joseph Schuchart
Date: Thu, 16 Jul 2020 17:06:35 +0200
Subject: [PATCH] osc/rdma: fail query_btls if no endpoint for non-local peer is found

Signed-off-by: Joseph Schuchart
(cherry picked from commit eebc451ec8313975998a63e25938fb4e0b4d6c44)
---
Review note (not part of the commit message):

While scanning the peers of the communicator, found_btl records whether
the current peer either matched an entry already in possible_btls or
contributed a new one. If a peer other than the calling process
(comm_rank != rank) ends the scan with found_btl still false, rc is set
to OMPI_ERR_UNREACH and the loop stops. The calling process's own rank
is exempt from this check. The error exit now returns rc as set by the
loop instead of always returning OMPI_ERR_NOT_AVAILABLE.

 ompi/mca/osc/rdma/osc_rdma_component.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index 31838b9f75..e42dd76e28 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -803,6 +803,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
 {
     struct mca_btl_base_module_t **possible_btls = NULL;
     int comm_size = ompi_comm_size (comm);
+    int comm_rank = ompi_comm_rank (comm);
     int rc = OMPI_SUCCESS, max_btls = 0;
     unsigned int selected_latency = INT_MAX;
     struct mca_btl_base_module_t *selected_btl = NULL;
@@ -842,10 +843,11 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
         return OMPI_SUCCESS;
     }
 
-    for (int i = 0 ; i < comm_size ; ++i) {
-        ompi_proc_t *proc = ompi_comm_peer_lookup (comm, i);
+    for (int rank = 0 ; rank < comm_size ; ++rank) {
+        ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank);
         mca_bml_base_endpoint_t *endpoint;
         int num_btls, prev_max;
+        bool found_btl = false;
 
         endpoint = mca_bml_base_get_endpoint (proc);
         if (NULL == endpoint) {
@@ -891,23 +893,30 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
                 for (int j = 0 ; j < max_btls ; ++j) {
                     if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
                         ++btl_counts[j];
+                        found_btl = true;
                         break;
                     } else if (NULL == possible_btls[j]) {
                         possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl;
                         btl_counts[j] = 1;
+                        found_btl = true;
                         break;
                     }
                 }
             }
         }
+
+        /* any non-local rank must have a usable btl */
+        if (!found_btl && comm_rank != rank) {
+            /* no btl = no rdma/atomics */
+            rc = OMPI_ERR_UNREACH;
+            break;
+        }
     }
 
     if (OMPI_SUCCESS != rc) {
         free (possible_btls);
         free (btl_counts);
-
-        /* no btl = no rdma/atomics */
-        return OMPI_ERR_NOT_AVAILABLE;
+        return rc;
     }
 
     for (int i = 0 ; i < max_btls ; ++i) {