Fixing openib btl finalize flow. Bug fix for #1286.
This commit was SVN r18590.
Этот коммит содержится в:
родитель
a8b5dcb204
Коммит
379e00050c
@ -11,7 +11,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
@ -42,7 +42,6 @@
|
|||||||
#include "ompi/mca/mpool/base/base.h"
|
#include "ompi/mca/mpool/base/base.h"
|
||||||
#include "ompi/mca/mpool/mpool.h"
|
#include "ompi/mca/mpool/mpool.h"
|
||||||
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
#include "ompi/mca/mpool/rdma/mpool_rdma.h"
|
||||||
#include "ompi/runtime/params.h"
|
|
||||||
#include "orte/util/proc_info.h"
|
#include "orte/util/proc_info.h"
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
@ -912,80 +911,6 @@ mca_btl_base_descriptor_t* mca_btl_openib_prepare_dst(
|
|||||||
return &to_base_frag(frag)->base;
|
return &to_base_frag(frag)->base;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int mca_btl_finalize_hca(struct mca_btl_openib_hca_t *hca)
|
|
||||||
{
|
|
||||||
#if OMPI_HAVE_THREADS
|
|
||||||
int hca_to_remove;
|
|
||||||
#if OMPI_ENABLE_PROGRESS_THREADS
|
|
||||||
if(hca->progress) {
|
|
||||||
hca->progress = false;
|
|
||||||
if (pthread_cancel(hca->thread.t_handle)) {
|
|
||||||
BTL_ERROR(("Failed to cancel OpenIB progress thread"));
|
|
||||||
}
|
|
||||||
opal_thread_join(&hca->thread, NULL);
|
|
||||||
}
|
|
||||||
if (ibv_destroy_comp_channel(hca->ib_channel)) {
|
|
||||||
BTL_VERBOSE(("Failed to close comp_channel"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
/* signaling to async_tread to stop poll for this hca */
|
|
||||||
if(mca_btl_openib_component.use_async_event_thread) {
|
|
||||||
hca_to_remove = -(hca->ib_dev_context->async_fd);
|
|
||||||
if (write(mca_btl_openib_component.async_pipe[1], &hca_to_remove,
|
|
||||||
sizeof(int)) < 0){
|
|
||||||
BTL_ERROR(("Failed to write to pipe"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Release CQs */
|
|
||||||
if(hca->ib_cq[BTL_OPENIB_HP_CQ] != NULL) {
|
|
||||||
if (ibv_destroy_cq(hca->ib_cq[BTL_OPENIB_HP_CQ])) {
|
|
||||||
BTL_VERBOSE(("Failed to close HP CQ"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(hca->ib_cq[BTL_OPENIB_LP_CQ] != NULL) {
|
|
||||||
if (ibv_destroy_cq(hca->ib_cq[BTL_OPENIB_LP_CQ])) {
|
|
||||||
BTL_VERBOSE(("Failed to close LP CQ"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (OMPI_SUCCESS != mca_mpool_base_module_destroy(hca->mpool)) {
|
|
||||||
BTL_VERBOSE(("Failed to release mpool"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if HAVE_XRC
|
|
||||||
if (MCA_BTL_XRC_ENABLED) {
|
|
||||||
if (OMPI_SUCCESS != mca_btl_openib_close_xrc_domain(hca)) {
|
|
||||||
BTL_ERROR(("XRC Internal error. Failed to close xrc domain"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (ibv_dealloc_pd(hca->ib_pd)) {
|
|
||||||
BTL_VERBOSE(("Warning! Failed to release PD"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
if (ibv_close_device(hca->ib_dev_context)) {
|
|
||||||
if (ompi_mpi_leave_pinned || ompi_mpi_leave_pinned_pipeline) {
|
|
||||||
BTL_VERBOSE(("Warning! Failed to close HCA"));
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
} else {
|
|
||||||
BTL_ERROR(("Error! Failed to close HCA"));
|
|
||||||
return OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
OBJ_RELEASE(hca);
|
|
||||||
return OMPI_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
|
int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
|
||||||
{
|
{
|
||||||
mca_btl_openib_module_t* openib_btl;
|
mca_btl_openib_module_t* openib_btl;
|
||||||
@ -1054,15 +979,9 @@ int mca_btl_openib_finalize(struct mca_btl_base_module_t* btl)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Release pending lists */
|
/* Release HCA if it is no more users */
|
||||||
if(!(--openib_btl->hca->btls)) {
|
if(!(--openib_btl->hca->btls)) {
|
||||||
/* All btls for the HCA were closed
|
OBJ_RELEASE(openib_btl->hca);
|
||||||
* Now we can close the HCA
|
|
||||||
*/
|
|
||||||
if (OMPI_SUCCESS != mca_btl_finalize_hca(openib_btl->hca)) {
|
|
||||||
BTL_VERBOSE(("Failed to close HCA"));
|
|
||||||
rc = OMPI_ERROR;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#if OMPI_HAVE_THREADS
|
#if OMPI_HAVE_THREADS
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
@ -76,6 +76,7 @@
|
|||||||
#endif
|
#endif
|
||||||
#include "connect/base.h"
|
#include "connect/base.h"
|
||||||
#include "btl_openib_iwarp.h"
|
#include "btl_openib_iwarp.h"
|
||||||
|
#include "ompi/runtime/params.h"
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Local functions
|
* Local functions
|
||||||
@ -672,7 +673,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_hca_t *hca,
|
|||||||
}
|
}
|
||||||
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
|
||||||
opal_pointer_array_add(hca->hca_btls, (void*) openib_btl);
|
opal_pointer_array_add(hca->hca_btls, (void*) openib_btl);
|
||||||
hca->btls++;
|
++hca->btls;
|
||||||
++mca_btl_openib_component.ib_num_btls;
|
++mca_btl_openib_component.ib_num_btls;
|
||||||
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
if (-1 != mca_btl_openib_component.ib_max_btls &&
|
||||||
mca_btl_openib_component.ib_num_btls >=
|
mca_btl_openib_component.ib_num_btls >=
|
||||||
@ -717,6 +718,33 @@ static void hca_destruct(mca_btl_openib_hca_t *hca)
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
#if OMPI_HAVE_THREADS
|
||||||
|
#if OMPI_ENABLE_PROGRESS_THREADS
|
||||||
|
if(hca->progress) {
|
||||||
|
hca->progress = false;
|
||||||
|
if (pthread_cancel(hca->thread.t_handle)) {
|
||||||
|
BTL_ERROR(("Failed to cancel OpenIB progress thread"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
opal_thread_join(&hca->thread, NULL);
|
||||||
|
}
|
||||||
|
if (ibv_destroy_comp_channel(hca->ib_channel)) {
|
||||||
|
BTL_VERBOSE(("Failed to close comp_channel"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* signaling to async_tread to stop poll for this hca */
|
||||||
|
if(mca_btl_openib_component.use_async_event_thread) {
|
||||||
|
int hca_to_remove;
|
||||||
|
hca_to_remove = -(hca->ib_dev_context->async_fd);
|
||||||
|
if (write(mca_btl_openib_component.async_pipe[1], &hca_to_remove,
|
||||||
|
sizeof(int)) < 0){
|
||||||
|
BTL_ERROR(("Failed to write to pipe"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if(hca->eager_rdma_buffers) {
|
if(hca->eager_rdma_buffers) {
|
||||||
int i;
|
int i;
|
||||||
for(i = 0; i < hca->eager_rdma_buffers_count; i++)
|
for(i = 0; i < hca->eager_rdma_buffers_count; i++)
|
||||||
@ -724,8 +752,7 @@ static void hca_destruct(mca_btl_openib_hca_t *hca)
|
|||||||
OBJ_RELEASE(hca->eager_rdma_buffers[i]);
|
OBJ_RELEASE(hca->eager_rdma_buffers[i]);
|
||||||
free(hca->eager_rdma_buffers);
|
free(hca->eager_rdma_buffers);
|
||||||
}
|
}
|
||||||
OBJ_DESTRUCT(&hca->hca_lock);
|
|
||||||
OBJ_DESTRUCT(&hca->send_free_control);
|
|
||||||
if (NULL != hca->qps) {
|
if (NULL != hca->qps) {
|
||||||
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
for (i = 0; i < mca_btl_openib_component.num_qps; i++) {
|
||||||
OBJ_DESTRUCT(&hca->qps[i].send_free);
|
OBJ_DESTRUCT(&hca->qps[i].send_free);
|
||||||
@ -733,6 +760,58 @@ static void hca_destruct(mca_btl_openib_hca_t *hca)
|
|||||||
}
|
}
|
||||||
free(hca->qps);
|
free(hca->qps);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&hca->send_free_control);
|
||||||
|
|
||||||
|
/* Release CQs */
|
||||||
|
if(hca->ib_cq[BTL_OPENIB_HP_CQ] != NULL) {
|
||||||
|
if (ibv_destroy_cq(hca->ib_cq[BTL_OPENIB_HP_CQ])) {
|
||||||
|
BTL_VERBOSE(("Failed to close HP CQ"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(hca->ib_cq[BTL_OPENIB_LP_CQ] != NULL) {
|
||||||
|
if (ibv_destroy_cq(hca->ib_cq[BTL_OPENIB_LP_CQ])) {
|
||||||
|
BTL_VERBOSE(("Failed to close LP CQ"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (OMPI_SUCCESS != mca_mpool_base_module_destroy(hca->mpool)) {
|
||||||
|
BTL_VERBOSE(("Failed to release mpool"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if HAVE_XRC
|
||||||
|
if (MCA_BTL_XRC_ENABLED) {
|
||||||
|
if (OMPI_SUCCESS != mca_btl_openib_close_xrc_domain(hca)) {
|
||||||
|
BTL_VERBOSE(("XRC Internal error. Failed to close xrc domain"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (ibv_dealloc_pd(hca->ib_pd)) {
|
||||||
|
BTL_VERBOSE(("Warning! Failed to release PD"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
OBJ_DESTRUCT(&hca->hca_lock);
|
||||||
|
|
||||||
|
if (ibv_close_device(hca->ib_dev_context)) {
|
||||||
|
if (ompi_mpi_leave_pinned || ompi_mpi_leave_pinned_pipeline) {
|
||||||
|
BTL_VERBOSE(("Warning! Failed to close HCA"));
|
||||||
|
goto hca_error;
|
||||||
|
} else {
|
||||||
|
BTL_ERROR(("Error! Failed to close HCA"));
|
||||||
|
goto hca_error;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BTL_VERBOSE(("HCA was successfully released"));
|
||||||
|
return;
|
||||||
|
hca_error:
|
||||||
|
BTL_VERBOSE(("Failed to destroy HCA resources"));
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct,
|
OBJ_CLASS_INSTANCE(mca_btl_openib_hca_t, opal_object_t, hca_construct,
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user