diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 044b599dc3..42e53f9ce8 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -135,9 +135,11 @@ AC_DEFUN([OMPI_CHECK_UCX],[ [$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS" $1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS" $1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS" + AC_DEFINE([HAVE_UCX], [1], [have ucx]) $2], [AS_IF([test ! -z "$with_ucx" && test "$with_ucx" != "no"], [AC_MSG_ERROR([UCX support requested but not found. Aborting])]) + AC_DEFINE([HAVE_UCX], [0], [have ucx]) $3]) OPAL_VAR_SCOPE_POP diff --git a/opal/mca/btl/openib/btl_openib.c b/opal/mca/btl/openib/btl_openib.c index c2686a0676..f9ba3a3de6 100644 --- a/opal/mca/btl/openib/btl_openib.c +++ b/opal/mca/btl/openib/btl_openib.c @@ -22,6 +22,7 @@ * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Bull SAS. All rights reserved + * Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -1040,15 +1041,6 @@ int mca_btl_openib_add_procs( int btl_rank = 0; volatile mca_btl_base_endpoint_t* endpoint; - - if (! openib_btl->allowed) { - opal_bitmap_clear_all_bits(reachable); - opal_show_help("help-mpi-btl-openib.txt", "ib port not selected", - true, opal_process_info.nodename, - openib_btl->device_name, openib_btl->port_num); - return OPAL_SUCCESS; - } - btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt); if( 0 > btl_rank ){ return OPAL_ERR_NOT_FOUND; @@ -1648,83 +1640,82 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl) return OPAL_SUCCESS; } - if (openib_btl->allowed) { - /* Release all QPs */ - if (NULL != openib_btl->device->endpoints) { - for (ep_index=0; - ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints); - ep_index++) { - endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints, + /* Release all QPs */ + if (NULL != openib_btl->device->endpoints) { + for (ep_index=0; + ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints); + ep_index++) { + + endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints, ep_index); - if(!endpoint) { - BTL_VERBOSE(("In finalize, got another null endpoint")); - continue; - } - if(endpoint->endpoint_btl != openib_btl) { - continue; - } - for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) { - if(openib_btl->device->eager_rdma_buffers[i] == endpoint) { - openib_btl->device->eager_rdma_buffers[i] = NULL; - OBJ_RELEASE(endpoint); - } - } - opal_pointer_array_set_item(openib_btl->device->endpoints, - ep_index, NULL); - assert(((opal_object_t*)endpoint)->obj_reference_count == 1); - OBJ_RELEASE(endpoint); + if(!endpoint) { + BTL_VERBOSE(("In finalize, got another null endpoint")); + continue; } - } - - /* Release SRQ resources */ - for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { - if(!BTL_OPENIB_QP_TYPE_PP(qp)) { - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &openib_btl->qps[qp].u.srq_qp.pending_frags[0]); - MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( - &openib_btl->qps[qp].u.srq_qp.pending_frags[1]); - if (NULL != openib_btl->qps[qp].u.srq_qp.srq) { - opal_mutex_t *lock = - &mca_btl_openib_component.srq_manager.lock; - - opal_hash_table_t *srq_addr_table = - &mca_btl_openib_component.srq_manager.srq_addr_table; - - opal_mutex_lock(lock); - if (OPAL_SUCCESS != - opal_hash_table_remove_value_ptr(srq_addr_table, - &openib_btl->qps[qp].u.srq_qp.srq, - sizeof(struct ibv_srq *))) { - BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp)); - rc = OPAL_ERROR; - } - opal_mutex_unlock(lock); - if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) { - BTL_VERBOSE(("Failed to close SRQ %d", qp)); - rc = OPAL_ERROR; - } + if(endpoint->endpoint_btl != openib_btl) { + continue; + } + for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) { + if(openib_btl->device->eager_rdma_buffers[i] == endpoint) { + openib_btl->device->eager_rdma_buffers[i] = NULL; + OBJ_RELEASE(endpoint); } - - OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]); - OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]); } + opal_pointer_array_set_item(openib_btl->device->endpoints, + ep_index, NULL); + assert(((opal_object_t*)endpoint)->obj_reference_count == 1); + OBJ_RELEASE(endpoint); } + } - /* Finalize the CPC modules on this openib module */ - for (i = 0; i < openib_btl->num_cpcs; ++i) { - if (NULL != openib_btl->cpcs[i]->cbm_finalize) { - openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]); + /* Release SRQ resources */ + for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) { + if(!BTL_OPENIB_QP_TYPE_PP(qp)) { + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( + &openib_btl->qps[qp].u.srq_qp.pending_frags[0]); + MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS( + &openib_btl->qps[qp].u.srq_qp.pending_frags[1]); + if (NULL != openib_btl->qps[qp].u.srq_qp.srq) { + opal_mutex_t *lock = + &mca_btl_openib_component.srq_manager.lock; + + opal_hash_table_t *srq_addr_table = + &mca_btl_openib_component.srq_manager.srq_addr_table; + + opal_mutex_lock(lock); + if (OPAL_SUCCESS != + opal_hash_table_remove_value_ptr(srq_addr_table, + &openib_btl->qps[qp].u.srq_qp.srq, + sizeof(struct ibv_srq *))) { + BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp)); + rc = OPAL_ERROR; + } + opal_mutex_unlock(lock); + if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) { + BTL_VERBOSE(("Failed to close SRQ %d", qp)); + rc = OPAL_ERROR; + } } - free(openib_btl->cpcs[i]); - } - free(openib_btl->cpcs); - /* Release device if there are no more users */ - if(!(--openib_btl->device->allowed_btls)) { - OBJ_RELEASE(openib_btl->device); + OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]); + OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]); } } + /* Finalize the CPC modules on this openib module */ + for (i = 0; i < openib_btl->num_cpcs; ++i) { + if (NULL != openib_btl->cpcs[i]->cbm_finalize) { + openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]); + } + free(openib_btl->cpcs[i]); + } + free(openib_btl->cpcs); + + /* Release device if there are no more users */ + if(!(--openib_btl->device->allowed_btls)) { + OBJ_RELEASE(openib_btl->device); + } + if (NULL != openib_btl->qps) { free(openib_btl->qps); } diff --git a/opal/mca/btl/openib/btl_openib.h b/opal/mca/btl/openib/btl_openib.h index 0b85bfb566..3ffc0feffc 100644 --- a/opal/mca/btl/openib/btl_openib.h +++ b/opal/mca/btl/openib/btl_openib.h @@ -20,6 +20,8 @@ * Copyright (c) 2014 Bull SAS. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved. + * * $COPYRIGHT$ * * Additional copyrights may follow @@ -506,8 +508,6 @@ struct mca_btl_openib_module_t { int local_procs; /** number of local procs */ bool atomic_ops_be; /** atomic result is big endian */ - - bool allowed; /** is this port allowed */ }; typedef struct mca_btl_openib_module_t mca_btl_openib_module_t; diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c index fcc0ac5697..d93178fb53 100644 --- a/opal/mca/btl/openib/btl_openib_component.c +++ b/opal/mca/btl/openib/btl_openib_component.c @@ -22,6 +22,7 @@ * Copyright (c) 2014-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Bull SAS. All rights reserved. + * Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -278,9 +279,6 @@ static int btl_openib_modex_send(void) ); /* For each module, add in the size of the per-CPC data */ for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - if (! mca_btl_openib_component.openib_btls[i]->allowed) { - continue; - } for (j = 0; j < mca_btl_openib_component.openib_btls[i]->num_cpcs; ++j) { @@ -309,9 +307,6 @@ static int btl_openib_modex_send(void) /* Pack each of the modules */ for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) { - if (! mca_btl_openib_component.openib_btls[i]->allowed) { - continue; - } /* Pack the modex common message struct. */ size = modex_message_size; @@ -633,38 +628,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, * unless the user specifically requested to override this * policy. For ancient OFED, only allow if user has set * the MCA parameter. + * + * We emit a help message if Open MPI was configured without + * UCX support if the port is configured to use infiniband for link + * layer. If UCX support is available, don't emit help message + * since UCX PML has higher priority than OB1 and this BTL will + * not be used. */ - if (! mca_btl_openib_component.allow_ib + if (false == mca_btl_openib_component.allow_ib #if HAVE_DECL_IBV_LINK_LAYER_ETHERNET && IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer #endif ) { - openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t)); - if(NULL == openib_btl) { - BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__)); - return OPAL_ERR_OUT_OF_RESOURCE; - } - memcpy(openib_btl, &mca_btl_openib_module, - sizeof(mca_btl_openib_module)); - ib_selected = OBJ_NEW(mca_btl_base_selected_module_t); - ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl; - openib_btl->port_num = (uint8_t) port_num; - openib_btl->allowed = false; - openib_btl->device = NULL; - openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev)); - OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t); - opal_list_append(btl_list, (opal_list_item_t*) ib_selected); - opal_pointer_array_add(device->device_btls, (void*) openib_btl); - ++device->btls; - ++mca_btl_openib_component.ib_num_btls; - if (-1 != mca_btl_openib_component.ib_max_btls && - mca_btl_openib_component.ib_num_btls >= - mca_btl_openib_component.ib_max_btls) { - return OPAL_ERR_VALUE_OUT_OF_BOUNDS; - } - return OPAL_SUCCESS; - } - +#if !HAVE_UCX + opal_show_help("help-mpi-btl-openib.txt", "ib port not selected", + true, opal_process_info.nodename, + ibv_get_device_name(device->ib_dev), + port_num); +#endif + return OPAL_ERR_NOT_FOUND; + } /* Ensure that the requested GID index (via the btl_openib_gid_index MCA param) is within the GID table @@ -901,8 +884,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device, } } - openib_btl->allowed = true; - opal_list_append(btl_list, (opal_list_item_t*) ib_selected); opal_pointer_array_add(device->device_btls, (void*) openib_btl); ++device->btls; @@ -2999,29 +2980,27 @@ btl_openib_component_init(int *num_btl_modules, ib_selected = (mca_btl_base_selected_module_t*)item; openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module; - if (openib_btl->allowed) { - /* Search for a CPC that can handle this port */ - ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl); - /* If we get NOT_SUPPORTED, then no CPC was found for this - port. But that's not a fatal error -- just keep going; - let's see if we find any usable openib modules or not. */ - if (OPAL_ERR_NOT_SUPPORTED == ret) { - continue; - } else if (OPAL_SUCCESS != ret) { - /* All others *are* fatal. Note that we already did a - show_help in the lower layer */ - goto no_btls; - } + /* Search for a CPC that can handle this port */ + ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl); + /* If we get NOT_SUPPORTED, then no CPC was found for this + port. But that's not a fatal error -- just keep going; + let's see if we find any usable openib modules or not. */ + if (OPAL_ERR_NOT_SUPPORTED == ret) { + continue; + } else if (OPAL_SUCCESS != ret) { + /* All others *are* fatal. Note that we already did a + show_help in the lower layer */ + goto no_btls; + } - if (mca_btl_openib_component.max_hw_msg_size > 0 && - (uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) { - BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")", - mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz)); - } + if (mca_btl_openib_component.max_hw_msg_size > 0 && + (uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) { + BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")", + mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz)); + } - if (finish_btl_init(openib_btl) != OPAL_SUCCESS) { - goto no_btls; - } + if (finish_btl_init(openib_btl) != OPAL_SUCCESS) { + goto no_btls; } mca_btl_openib_component.openib_btls[i] = openib_btl;