btl/openib: fix issue 6785
Commit d7053a3 broke things for the case when Open MPI 4.0.x is built without UCX support. Problem was it was trying to partially initialize the btl to try and delay printing of a help message till wireup. Well this sort of doesn't work in all cases. Rather than keep piling on changes to support a help message for a BTL that we are deprecating, take a keep it simple stupid approach. So, revert most of d7053a3 and instead put the help message back in the original location, during scan of ports of the available HCAs to check for whether or not link layer for that port is configured for ethernet or infiniband. If Open MPI was built with UCX support, don't emit the help message, if UCX was not linked in, emit the help message. Verified on a system with connectX5 HCAs configured with two ports configured for ethernet and two for infiniband. relates to #6785 Signed-off-by: Howard Pritchard <howardp@lanl.gov>
Этот коммит содержится в:
родитель
2df46aca54
Коммит
71f240f078
@ -135,9 +135,11 @@ AC_DEFUN([OMPI_CHECK_UCX],[
|
||||
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS"
|
||||
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS"
|
||||
$1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS"
|
||||
AC_DEFINE([HAVE_UCX], [1], [have ucx])
|
||||
$2],
|
||||
[AS_IF([test ! -z "$with_ucx" && test "$with_ucx" != "no"],
|
||||
[AC_MSG_ERROR([UCX support requested but not found. Aborting])])
|
||||
AC_DEFINE([HAVE_UCX], [0], [have ucx])
|
||||
$3])
|
||||
|
||||
OPAL_VAR_SCOPE_POP
|
||||
|
@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2014-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved
|
||||
* Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1040,15 +1041,6 @@ int mca_btl_openib_add_procs(
|
||||
int btl_rank = 0;
|
||||
volatile mca_btl_base_endpoint_t* endpoint;
|
||||
|
||||
|
||||
if (! openib_btl->allowed) {
|
||||
opal_bitmap_clear_all_bits(reachable);
|
||||
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
||||
true, opal_process_info.nodename,
|
||||
openib_btl->device_name, openib_btl->port_num);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
|
||||
if( 0 > btl_rank ){
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
@ -1648,83 +1640,82 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (openib_btl->allowed) {
|
||||
/* Release all QPs */
|
||||
if (NULL != openib_btl->device->endpoints) {
|
||||
for (ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
ep_index++) {
|
||||
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
|
||||
/* Release all QPs */
|
||||
if (NULL != openib_btl->device->endpoints) {
|
||||
for (ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
ep_index++) {
|
||||
|
||||
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
|
||||
ep_index);
|
||||
if(!endpoint) {
|
||||
BTL_VERBOSE(("In finalize, got another null endpoint"));
|
||||
continue;
|
||||
}
|
||||
if(endpoint->endpoint_btl != openib_btl) {
|
||||
continue;
|
||||
}
|
||||
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
|
||||
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
|
||||
openib_btl->device->eager_rdma_buffers[i] = NULL;
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
}
|
||||
opal_pointer_array_set_item(openib_btl->device->endpoints,
|
||||
ep_index, NULL);
|
||||
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
|
||||
OBJ_RELEASE(endpoint);
|
||||
if(!endpoint) {
|
||||
BTL_VERBOSE(("In finalize, got another null endpoint"));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Release SRQ resources */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
|
||||
opal_mutex_t *lock =
|
||||
&mca_btl_openib_component.srq_manager.lock;
|
||||
|
||||
opal_hash_table_t *srq_addr_table =
|
||||
&mca_btl_openib_component.srq_manager.srq_addr_table;
|
||||
|
||||
opal_mutex_lock(lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_remove_value_ptr(srq_addr_table,
|
||||
&openib_btl->qps[qp].u.srq_qp.srq,
|
||||
sizeof(struct ibv_srq *))) {
|
||||
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
opal_mutex_unlock(lock);
|
||||
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
|
||||
BTL_VERBOSE(("Failed to close SRQ %d", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
if(endpoint->endpoint_btl != openib_btl) {
|
||||
continue;
|
||||
}
|
||||
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
|
||||
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
|
||||
openib_btl->device->eager_rdma_buffers[i] = NULL;
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
}
|
||||
opal_pointer_array_set_item(openib_btl->device->endpoints,
|
||||
ep_index, NULL);
|
||||
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize the CPC modules on this openib module */
|
||||
for (i = 0; i < openib_btl->num_cpcs; ++i) {
|
||||
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
|
||||
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
|
||||
/* Release SRQ resources */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
|
||||
opal_mutex_t *lock =
|
||||
&mca_btl_openib_component.srq_manager.lock;
|
||||
|
||||
opal_hash_table_t *srq_addr_table =
|
||||
&mca_btl_openib_component.srq_manager.srq_addr_table;
|
||||
|
||||
opal_mutex_lock(lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_remove_value_ptr(srq_addr_table,
|
||||
&openib_btl->qps[qp].u.srq_qp.srq,
|
||||
sizeof(struct ibv_srq *))) {
|
||||
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
opal_mutex_unlock(lock);
|
||||
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
|
||||
BTL_VERBOSE(("Failed to close SRQ %d", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
free(openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs);
|
||||
|
||||
/* Release device if there are no more users */
|
||||
if(!(--openib_btl->device->allowed_btls)) {
|
||||
OBJ_RELEASE(openib_btl->device);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize the CPC modules on this openib module */
|
||||
for (i = 0; i < openib_btl->num_cpcs; ++i) {
|
||||
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
|
||||
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs);
|
||||
|
||||
/* Release device if there are no more users */
|
||||
if(!(--openib_btl->device->allowed_btls)) {
|
||||
OBJ_RELEASE(openib_btl->device);
|
||||
}
|
||||
|
||||
if (NULL != openib_btl->qps) {
|
||||
free(openib_btl->qps);
|
||||
}
|
||||
|
@ -20,6 +20,8 @@
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -506,8 +508,6 @@ struct mca_btl_openib_module_t {
|
||||
int local_procs; /** number of local procs */
|
||||
|
||||
bool atomic_ops_be; /** atomic result is big endian */
|
||||
|
||||
bool allowed; /** is this port allowed */
|
||||
};
|
||||
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2014-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
* Copyrigth (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -278,9 +279,6 @@ static int btl_openib_modex_send(void)
|
||||
);
|
||||
/* For each module, add in the size of the per-CPC data */
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
|
||||
continue;
|
||||
}
|
||||
for (j = 0;
|
||||
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
||||
++j) {
|
||||
@ -309,9 +307,6 @@ static int btl_openib_modex_send(void)
|
||||
/* Pack each of the modules */
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
|
||||
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
|
||||
continue;
|
||||
}
|
||||
/* Pack the modex common message struct. */
|
||||
size = modex_message_size;
|
||||
|
||||
@ -633,38 +628,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
* unless the user specifically requested to override this
|
||||
* policy. For ancient OFED, only allow if user has set
|
||||
* the MCA parameter.
|
||||
*
|
||||
* We emit a help message if Open MPI was configured without
|
||||
* UCX support if the port is configured to use infiniband for link
|
||||
* layer. If UCX support is available, don't emit help message
|
||||
* since UCX PML has higher priority than OB1 and this BTL will
|
||||
* not be used.
|
||||
*/
|
||||
if (! mca_btl_openib_component.allow_ib
|
||||
if (false == mca_btl_openib_component.allow_ib
|
||||
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
|
||||
&& IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer
|
||||
#endif
|
||||
) {
|
||||
openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
|
||||
if(NULL == openib_btl) {
|
||||
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
memcpy(openib_btl, &mca_btl_openib_module,
|
||||
sizeof(mca_btl_openib_module));
|
||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
||||
openib_btl->port_num = (uint8_t) port_num;
|
||||
openib_btl->allowed = false;
|
||||
openib_btl->device = NULL;
|
||||
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
|
||||
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||
++device->btls;
|
||||
++mca_btl_openib_component.ib_num_btls;
|
||||
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
||||
mca_btl_openib_component.ib_num_btls >=
|
||||
mca_btl_openib_component.ib_max_btls) {
|
||||
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
#if !HAVE_UCX
|
||||
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
||||
true, opal_process_info.nodename,
|
||||
ibv_get_device_name(device->ib_dev),
|
||||
port_num);
|
||||
#endif
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Ensure that the requested GID index (via the
|
||||
btl_openib_gid_index MCA param) is within the GID table
|
||||
@ -901,8 +884,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
}
|
||||
}
|
||||
|
||||
openib_btl->allowed = true;
|
||||
|
||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||
++device->btls;
|
||||
@ -2999,29 +2980,27 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
ib_selected = (mca_btl_base_selected_module_t*)item;
|
||||
openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;
|
||||
|
||||
if (openib_btl->allowed) {
|
||||
/* Search for a CPC that can handle this port */
|
||||
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable openib modules or not. */
|
||||
if (OPAL_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OPAL_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
show_help in the lower layer */
|
||||
goto no_btls;
|
||||
}
|
||||
/* Search for a CPC that can handle this port */
|
||||
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable openib modules or not. */
|
||||
if (OPAL_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OPAL_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
show_help in the lower layer */
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
|
||||
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
|
||||
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
|
||||
}
|
||||
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
|
||||
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
|
||||
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
|
||||
}
|
||||
|
||||
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
|
||||
goto no_btls;
|
||||
}
|
||||
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
mca_btl_openib_component.openib_btls[i] = openib_btl;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user