Adding real fix for ticket #1693 - XRC + coalescing segfault.
This commit was SVN r20214.
Этот коммит содержится в:
родитель
5e1d2eec58
Коммит
2f7b66160b
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
@ -638,7 +638,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
||||
if(mca_btl_openib_component.use_message_coalescing &&
|
||||
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
|
||||
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||
sfrag = check_coalescing(&ep->qps[qp].qp->pending_frags[prio],
|
||||
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||
&ep->qps[qp].qp->lock, ep, size);
|
||||
|
||||
if(NULL == sfrag) {
|
||||
|
@ -11,7 +11,7 @@
|
||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||
* All rights reserved.
|
||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
@ -2733,14 +2733,12 @@ progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
for(i = 0; i < 2; i++) {
|
||||
while(qp->sd_wqe > 0) {
|
||||
mca_btl_base_endpoint_t *ep;
|
||||
OPAL_THREAD_LOCK(&qp->lock);
|
||||
frag = opal_list_remove_first(&qp->pending_frags[i]);
|
||||
OPAL_THREAD_UNLOCK(&qp->lock);
|
||||
mca_btl_base_endpoint_t *tmp_ep;
|
||||
frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]);
|
||||
if(NULL == frag)
|
||||
break;
|
||||
ep = to_com_frag(frag)->endpoint;
|
||||
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
||||
tmp_ep = to_com_frag(frag)->endpoint;
|
||||
mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag));
|
||||
}
|
||||
}
|
||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||
|
@ -14,7 +14,7 @@
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2006-2008 Mellanox Technologies, Inc. All rights reserved.
|
||||
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
@ -120,10 +120,8 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
|
||||
|
||||
if(qp_get_wqe(ep, qp) < 0) {
|
||||
qp_put_wqe(ep, qp);
|
||||
OPAL_THREAD_LOCK(&ep->qps[qp].qp->lock);
|
||||
opal_list_append(&ep->qps[qp].qp->pending_frags[prio],
|
||||
opal_list_append(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||
(opal_list_item_t *)frag);
|
||||
OPAL_THREAD_UNLOCK(&ep->qps[qp].qp->lock);
|
||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||
}
|
||||
|
||||
@ -281,8 +279,6 @@ static mca_btl_openib_qp_t *endpoint_alloc_qp(void)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
OBJ_CONSTRUCT(&qp->pending_frags[0], opal_list_t);
|
||||
OBJ_CONSTRUCT(&qp->pending_frags[1], opal_list_t);
|
||||
OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
|
||||
|
||||
return qp;
|
||||
@ -347,8 +343,12 @@ static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
|
||||
ep_qp->rd_credit_send_lock = 0;
|
||||
ep_qp->credit_frag = NULL;
|
||||
|
||||
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[0], opal_list_t);
|
||||
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[1], opal_list_t);
|
||||
|
||||
OBJ_CONSTRUCT(&ep_qp->pending_frags[0], opal_list_t);
|
||||
OBJ_CONSTRUCT(&ep_qp->pending_frags[1], opal_list_t);
|
||||
|
||||
switch(BTL_OPENIB_QP_TYPE(qp)) {
|
||||
case MCA_BTL_OPENIB_PP_QP:
|
||||
endpoint_init_qp_pp(ep_qp, qp);
|
||||
@ -500,16 +500,17 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]);
|
||||
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&endpoint->qps[qp].no_wqe_pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&endpoint->qps[qp].no_wqe_pending_frags[1]);
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);
|
||||
|
||||
|
||||
if(--endpoint->qps[qp].qp->users != 0)
|
||||
continue;
|
||||
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&endpoint->qps[qp].qp->pending_frags[0]);
|
||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||
&endpoint->qps[qp].qp->pending_frags[1]);
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[0]);
|
||||
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[1]);
|
||||
|
||||
if(endpoint->qps[qp].qp->lcl_qp != NULL)
|
||||
if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
|
||||
BTL_ERROR(("Failed to destroy QP:%d\n", qp));
|
||||
|
@ -13,7 +13,7 @@
|
||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||
* reserved.
|
||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||
* $COPYRIGHT$
|
||||
*
|
||||
* Additional copyrights may follow
|
||||
@ -126,8 +126,6 @@ typedef struct mca_btl_openib_qp_t {
|
||||
struct ibv_qp *lcl_qp;
|
||||
uint32_t lcl_psn;
|
||||
int32_t sd_wqe; /**< number of available send wqe entries */
|
||||
opal_list_t pending_frags[2]; /**< put fragments here if there is no wqe
|
||||
available */
|
||||
int users;
|
||||
opal_mutex_t lock;
|
||||
} mca_btl_openib_qp_t;
|
||||
@ -136,6 +134,8 @@ typedef struct mca_btl_openib_endpoint_qp_t {
|
||||
mca_btl_openib_qp_t *qp;
|
||||
opal_list_t pending_frags[2]; /**< put fragment here if there is no credits
|
||||
available */
|
||||
opal_list_t no_wqe_pending_frags[2]; /**< put fragments here if there is no wqe
|
||||
available */
|
||||
int32_t rd_credit_send_lock; /**< Lock credit send fragment */
|
||||
mca_btl_openib_send_control_frag_t *credit_frag;
|
||||
size_t ib_inline_max; /**< max size of inline send*/
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||
*
|
||||
* $COPYRIGHT$
|
||||
@ -409,7 +409,7 @@ static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
|
||||
if (qp_init_attr.cap.max_inline_data < req_inline) {
|
||||
endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
|
||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
||||
"inline truncated", true, orte_process_info.nodename,
|
||||
"inline truncated", orte_process_info.nodename,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||
req_inline, qp_init_attr.cap.max_inline_data);
|
||||
} else {
|
||||
@ -715,8 +715,8 @@ static void xoob_restart_connect(mca_btl_base_endpoint_t *endpoint)
|
||||
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
|
||||
/* destroy the qp */
|
||||
/* the receiver side was already closed, so all pending lists must be empty! */
|
||||
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[0]));
|
||||
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[1]));
|
||||
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[0]));
|
||||
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[1]));
|
||||
if(ibv_destroy_qp(endpoint->qps[0].qp->lcl_qp))
|
||||
BTL_ERROR(("Failed to destroy QP"));
|
||||
case MCA_BTL_IB_ADDR_CLOSED:
|
||||
@ -954,13 +954,6 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
||||
ibv_get_device_name(openib_btl->device->ib_dev));
|
||||
return OMPI_ERR_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
/* Print warning and switch off coalescing mode (ticket #1693)*/
|
||||
if (mca_btl_openib_component.use_message_coalescing) {
|
||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
||||
"bug #1693", true, orte_process_info.nodename);
|
||||
mca_btl_openib_component.use_message_coalescing = 0;
|
||||
}
|
||||
|
||||
*cpc = malloc(sizeof(ompi_btl_openib_connect_base_module_t));
|
||||
if (NULL == *cpc) {
|
||||
|
@ -38,19 +38,3 @@ a smaller inline data value than was requested.
|
||||
Local device: %s
|
||||
Requested value: %d
|
||||
Value used by device: %d
|
||||
#
|
||||
[bug #1693]
|
||||
WARNING: ConnectX XRC support was enabled together with coalescing
|
||||
mode. This combination is not supported in this version of Open MPI
|
||||
because it may cause random seg faults (please see Open MPI bug ticket
|
||||
#1693 https://svn.open-mpi.org/trac/ompi/ticket/1693 for more
|
||||
details). Message coalescing has been disabled in this job to prevent
|
||||
catastrophic failure.
|
||||
|
||||
You can silence this warning either by disabling XRC (i.e., not specifying
|
||||
an "X" queue in btl_openib_receive_queues) or disabling message
|
||||
coalescing by setting the btl_openib_use_message_coalescing MCA
|
||||
parameter to 0.
|
||||
|
||||
Local host: %s
|
||||
#
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user