Merge pull request #6804 from hppritcha/topic/swat_issue_6785
btl/openib: fix issue 6785
Этот коммит содержится в:
Коммит
368da00414
@ -135,9 +135,11 @@ AC_DEFUN([OMPI_CHECK_UCX],[
|
||||
[$1_CPPFLAGS="[$]$1_CPPFLAGS $ompi_check_ucx_CPPFLAGS"
|
||||
$1_LDFLAGS="[$]$1_LDFLAGS $ompi_check_ucx_LDFLAGS"
|
||||
$1_LIBS="[$]$1_LIBS $ompi_check_ucx_LIBS"
|
||||
AC_DEFINE([HAVE_UCX], [1], [have ucx])
|
||||
$2],
|
||||
[AS_IF([test ! -z "$with_ucx" && test "$with_ucx" != "no"],
|
||||
[AC_MSG_ERROR([UCX support requested but not found. Aborting])])
|
||||
AC_DEFINE([HAVE_UCX], [0], [have ucx])
|
||||
$3])
|
||||
|
||||
OPAL_VAR_SCOPE_POP
|
||||
|
@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2014-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -1040,15 +1041,6 @@ int mca_btl_openib_add_procs(
|
||||
int btl_rank = 0;
|
||||
volatile mca_btl_base_endpoint_t* endpoint;
|
||||
|
||||
|
||||
if (! openib_btl->allowed) {
|
||||
opal_bitmap_clear_all_bits(reachable);
|
||||
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
||||
true, opal_process_info.nodename,
|
||||
openib_btl->device_name, openib_btl->port_num);
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
|
||||
if( 0 > btl_rank ){
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
@ -1648,83 +1640,82 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
if (openib_btl->allowed) {
|
||||
/* Release all QPs */
|
||||
if (NULL != openib_btl->device->endpoints) {
|
||||
for (ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
ep_index++) {
|
||||
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
|
||||
/* Release all QPs */
|
||||
if (NULL != openib_btl->device->endpoints) {
|
||||
for (ep_index=0;
|
||||
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
|
||||
ep_index++) {
|
||||
|
||||
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
|
||||
ep_index);
|
||||
if(!endpoint) {
|
||||
BTL_VERBOSE(("In finalize, got another null endpoint"));
|
||||
continue;
|
||||
}
|
||||
if(endpoint->endpoint_btl != openib_btl) {
|
||||
continue;
|
||||
}
|
||||
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
|
||||
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
|
||||
openib_btl->device->eager_rdma_buffers[i] = NULL;
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
}
|
||||
opal_pointer_array_set_item(openib_btl->device->endpoints,
|
||||
ep_index, NULL);
|
||||
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
|
||||
OBJ_RELEASE(endpoint);
|
||||
if(!endpoint) {
|
||||
BTL_VERBOSE(("In finalize, got another null endpoint"));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Release SRQ resources */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
|
||||
opal_mutex_t *lock =
|
||||
&mca_btl_openib_component.srq_manager.lock;
|
||||
|
||||
opal_hash_table_t *srq_addr_table =
|
||||
&mca_btl_openib_component.srq_manager.srq_addr_table;
|
||||
|
||||
opal_mutex_lock(lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_remove_value_ptr(srq_addr_table,
|
||||
&openib_btl->qps[qp].u.srq_qp.srq,
|
||||
sizeof(struct ibv_srq *))) {
|
||||
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
opal_mutex_unlock(lock);
|
||||
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
|
||||
BTL_VERBOSE(("Failed to close SRQ %d", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
if(endpoint->endpoint_btl != openib_btl) {
|
||||
continue;
|
||||
}
|
||||
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
|
||||
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
|
||||
openib_btl->device->eager_rdma_buffers[i] = NULL;
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
}
|
||||
opal_pointer_array_set_item(openib_btl->device->endpoints,
|
||||
ep_index, NULL);
|
||||
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
|
||||
OBJ_RELEASE(endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize the CPC modules on this openib module */
|
||||
for (i = 0; i < openib_btl->num_cpcs; ++i) {
|
||||
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
|
||||
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
|
||||
/* Release SRQ resources */
|
||||
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
|
||||
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
|
||||
opal_mutex_t *lock =
|
||||
&mca_btl_openib_component.srq_manager.lock;
|
||||
|
||||
opal_hash_table_t *srq_addr_table =
|
||||
&mca_btl_openib_component.srq_manager.srq_addr_table;
|
||||
|
||||
opal_mutex_lock(lock);
|
||||
if (OPAL_SUCCESS !=
|
||||
opal_hash_table_remove_value_ptr(srq_addr_table,
|
||||
&openib_btl->qps[qp].u.srq_qp.srq,
|
||||
sizeof(struct ibv_srq *))) {
|
||||
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
opal_mutex_unlock(lock);
|
||||
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
|
||||
BTL_VERBOSE(("Failed to close SRQ %d", qp));
|
||||
rc = OPAL_ERROR;
|
||||
}
|
||||
}
|
||||
free(openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs);
|
||||
|
||||
/* Release device if there are no more users */
|
||||
if(!(--openib_btl->device->allowed_btls)) {
|
||||
OBJ_RELEASE(openib_btl->device);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
|
||||
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize the CPC modules on this openib module */
|
||||
for (i = 0; i < openib_btl->num_cpcs; ++i) {
|
||||
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
|
||||
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs[i]);
|
||||
}
|
||||
free(openib_btl->cpcs);
|
||||
|
||||
/* Release device if there are no more users */
|
||||
if(!(--openib_btl->device->allowed_btls)) {
|
||||
OBJ_RELEASE(openib_btl->device);
|
||||
}
|
||||
|
||||
if (NULL != openib_btl->qps) {
|
||||
free(openib_btl->qps);
|
||||
}
|
||||
|
@ -20,6 +20,8 @@
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
* Copyright (c) 2015-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -506,8 +508,6 @@ struct mca_btl_openib_module_t {
|
||||
int local_procs; /** number of local procs */
|
||||
|
||||
bool atomic_ops_be; /** atomic result is big endian */
|
||||
|
||||
bool allowed; /** is this port allowed */
|
||||
};
|
||||
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
* Copyright (c) 2014-2018 Research Organization for Information Science
|
||||
* and Technology (RIST). All rights reserved.
|
||||
* Copyright (c) 2014 Bull SAS. All rights reserved.
|
||||
* Copyright (c) 2019 Triad National Security, LLC. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -278,9 +279,6 @@ static int btl_openib_modex_send(void)
|
||||
);
|
||||
/* For each module, add in the size of the per-CPC data */
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
|
||||
continue;
|
||||
}
|
||||
for (j = 0;
|
||||
j < mca_btl_openib_component.openib_btls[i]->num_cpcs;
|
||||
++j) {
|
||||
@ -309,9 +307,6 @@ static int btl_openib_modex_send(void)
|
||||
/* Pack each of the modules */
|
||||
for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
|
||||
|
||||
if (! mca_btl_openib_component.openib_btls[i]->allowed) {
|
||||
continue;
|
||||
}
|
||||
/* Pack the modex common message struct. */
|
||||
size = modex_message_size;
|
||||
|
||||
@ -633,38 +628,26 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
* unless the user specifically requested to override this
|
||||
* policy. For ancient OFED, only allow if user has set
|
||||
* the MCA parameter.
|
||||
*
|
||||
* We emit a help message when Open MPI was configured without
|
||||
* UCX support and the port uses InfiniBand as its link
|
||||
* layer. If UCX support is available, we do not emit the help message,
|
||||
* since the UCX PML has higher priority than OB1 and this BTL will
|
||||
* not be used.
|
||||
*/
|
||||
if (! mca_btl_openib_component.allow_ib
|
||||
if (false == mca_btl_openib_component.allow_ib
|
||||
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
|
||||
&& IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer
|
||||
#endif
|
||||
) {
|
||||
openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
|
||||
if(NULL == openib_btl) {
|
||||
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
|
||||
return OPAL_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
memcpy(openib_btl, &mca_btl_openib_module,
|
||||
sizeof(mca_btl_openib_module));
|
||||
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
|
||||
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
|
||||
openib_btl->port_num = (uint8_t) port_num;
|
||||
openib_btl->allowed = false;
|
||||
openib_btl->device = NULL;
|
||||
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
|
||||
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
|
||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||
++device->btls;
|
||||
++mca_btl_openib_component.ib_num_btls;
|
||||
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
||||
mca_btl_openib_component.ib_num_btls >=
|
||||
mca_btl_openib_component.ib_max_btls) {
|
||||
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
|
||||
}
|
||||
return OPAL_SUCCESS;
|
||||
}
|
||||
|
||||
#if !HAVE_UCX
|
||||
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
|
||||
true, opal_process_info.nodename,
|
||||
ibv_get_device_name(device->ib_dev),
|
||||
port_num);
|
||||
#endif
|
||||
return OPAL_ERR_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* Ensure that the requested GID index (via the
|
||||
btl_openib_gid_index MCA param) is within the GID table
|
||||
@ -901,8 +884,6 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
|
||||
}
|
||||
}
|
||||
|
||||
openib_btl->allowed = true;
|
||||
|
||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
|
||||
++device->btls;
|
||||
@ -2999,29 +2980,27 @@ btl_openib_component_init(int *num_btl_modules,
|
||||
ib_selected = (mca_btl_base_selected_module_t*)item;
|
||||
openib_btl = (mca_btl_openib_module_t*)ib_selected->btl_module;
|
||||
|
||||
if (openib_btl->allowed) {
|
||||
/* Search for a CPC that can handle this port */
|
||||
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable openib modules or not. */
|
||||
if (OPAL_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OPAL_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
show_help in the lower layer */
|
||||
goto no_btls;
|
||||
}
|
||||
/* Search for a CPC that can handle this port */
|
||||
ret = opal_btl_openib_connect_base_select_for_local_port(openib_btl);
|
||||
/* If we get NOT_SUPPORTED, then no CPC was found for this
|
||||
port. But that's not a fatal error -- just keep going;
|
||||
let's see if we find any usable openib modules or not. */
|
||||
if (OPAL_ERR_NOT_SUPPORTED == ret) {
|
||||
continue;
|
||||
} else if (OPAL_SUCCESS != ret) {
|
||||
/* All others *are* fatal. Note that we already did a
|
||||
show_help in the lower layer */
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
|
||||
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
|
||||
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
|
||||
}
|
||||
if (mca_btl_openib_component.max_hw_msg_size > 0 &&
|
||||
(uint32_t)mca_btl_openib_component.max_hw_msg_size > openib_btl->ib_port_attr.max_msg_sz) {
|
||||
BTL_ERROR(("max_hw_msg_size (%" PRIu32 ") is larger than hw max message size (%" PRIu32 ")",
|
||||
mca_btl_openib_component.max_hw_msg_size, openib_btl->ib_port_attr.max_msg_sz));
|
||||
}
|
||||
|
||||
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
|
||||
goto no_btls;
|
||||
}
|
||||
if (finish_btl_init(openib_btl) != OPAL_SUCCESS) {
|
||||
goto no_btls;
|
||||
}
|
||||
|
||||
mca_btl_openib_component.openib_btls[i] = openib_btl;
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user