1
1

Merge pull request #7951 from devreal/v4.1.x

(v4.1.x) osc/rdma: fail query_btls if no endpoint for non-local peer is found
Этот коммит содержится в:
Jeff Squyres 2020-07-20 15:12:59 -04:00 коммит произвёл GitHub
родитель bd16024a0b 3d08d790e9
Коммит 981b8858e7
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23

Просмотреть файл

@@ -803,6 +803,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
{ {
struct mca_btl_base_module_t **possible_btls = NULL; struct mca_btl_base_module_t **possible_btls = NULL;
int comm_size = ompi_comm_size (comm); int comm_size = ompi_comm_size (comm);
int comm_rank = ompi_comm_rank (comm);
int rc = OMPI_SUCCESS, max_btls = 0; int rc = OMPI_SUCCESS, max_btls = 0;
unsigned int selected_latency = INT_MAX; unsigned int selected_latency = INT_MAX;
struct mca_btl_base_module_t *selected_btl = NULL; struct mca_btl_base_module_t *selected_btl = NULL;
@@ -842,10 +843,11 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
for (int i = 0 ; i < comm_size ; ++i) { for (int rank = 0 ; rank < comm_size ; ++rank) {
ompi_proc_t *proc = ompi_comm_peer_lookup (comm, i); ompi_proc_t *proc = ompi_comm_peer_lookup (comm, rank);
mca_bml_base_endpoint_t *endpoint; mca_bml_base_endpoint_t *endpoint;
int num_btls, prev_max; int num_btls, prev_max;
bool found_btl = false;
endpoint = mca_bml_base_get_endpoint (proc); endpoint = mca_bml_base_get_endpoint (proc);
if (NULL == endpoint) { if (NULL == endpoint) {
@@ -891,23 +893,30 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, struct mca_btl_b
for (int j = 0 ; j < max_btls ; ++j) { for (int j = 0 ; j < max_btls ; ++j) {
if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) { if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
++btl_counts[j]; ++btl_counts[j];
found_btl = true;
break; break;
} else if (NULL == possible_btls[j]) { } else if (NULL == possible_btls[j]) {
possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl; possible_btls[j] = endpoint->btl_rdma.bml_btls[i_btl].btl;
btl_counts[j] = 1; btl_counts[j] = 1;
found_btl = true;
break; break;
} }
} }
} }
} }
/* any non-local rank must have a usable btl */
if (!found_btl && comm_rank != rank) {
/* no btl = no rdma/atomics */
rc = OMPI_ERR_UNREACH;
break;
}
} }
if (OMPI_SUCCESS != rc) { if (OMPI_SUCCESS != rc) {
free (possible_btls); free (possible_btls);
free (btl_counts); free (btl_counts);
return rc;
/* no btl = no rdma/atomics */
return OMPI_ERR_NOT_AVAILABLE;
} }
for (int i = 0 ; i < max_btls ; ++i) { for (int i = 0 ; i < max_btls ; ++i) {