1
1

Adding real fix for ticket #1693 - XRC + coalescing segfault.

This commit was SVN r20214.
Этот коммит содержится в:
Pavel Shamis 2009-01-07 14:10:58 +00:00
родитель 5e1d2eec58
Коммит 2f7b66160b
6 изменённых файлов: 28 добавлений и 52 удалений

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
@ -638,7 +638,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
if(mca_btl_openib_component.use_message_coalescing &&
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
sfrag = check_coalescing(&ep->qps[qp].qp->pending_frags[prio],
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
&ep->qps[qp].qp->lock, ep, size);
if(NULL == sfrag) {

Просмотреть файл

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
@ -2733,14 +2733,12 @@ progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
OPAL_THREAD_LOCK(&ep->endpoint_lock);
for(i = 0; i < 2; i++) {
while(qp->sd_wqe > 0) {
mca_btl_base_endpoint_t *ep;
OPAL_THREAD_LOCK(&qp->lock);
frag = opal_list_remove_first(&qp->pending_frags[i]);
OPAL_THREAD_UNLOCK(&qp->lock);
mca_btl_base_endpoint_t *tmp_ep;
frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]);
if(NULL == frag)
break;
ep = to_com_frag(frag)->endpoint;
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
tmp_ep = to_com_frag(frag)->endpoint;
mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag));
}
}
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2006-2008 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
*
* $COPYRIGHT$
*
@ -120,10 +120,8 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
if(qp_get_wqe(ep, qp) < 0) {
qp_put_wqe(ep, qp);
OPAL_THREAD_LOCK(&ep->qps[qp].qp->lock);
opal_list_append(&ep->qps[qp].qp->pending_frags[prio],
opal_list_append(&ep->qps[qp].no_wqe_pending_frags[prio],
(opal_list_item_t *)frag);
OPAL_THREAD_UNLOCK(&ep->qps[qp].qp->lock);
return OMPI_ERR_OUT_OF_RESOURCE;
}
@ -281,8 +279,6 @@ static mca_btl_openib_qp_t *endpoint_alloc_qp(void)
return NULL;
}
OBJ_CONSTRUCT(&qp->pending_frags[0], opal_list_t);
OBJ_CONSTRUCT(&qp->pending_frags[1], opal_list_t);
OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
return qp;
@ -347,8 +343,12 @@ static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
ep_qp->rd_credit_send_lock = 0;
ep_qp->credit_frag = NULL;
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[0], opal_list_t);
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[1], opal_list_t);
OBJ_CONSTRUCT(&ep_qp->pending_frags[0], opal_list_t);
OBJ_CONSTRUCT(&ep_qp->pending_frags[1], opal_list_t);
switch(BTL_OPENIB_QP_TYPE(qp)) {
case MCA_BTL_OPENIB_PP_QP:
endpoint_init_qp_pp(ep_qp, qp);
@ -500,16 +500,17 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&endpoint->qps[qp].no_wqe_pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&endpoint->qps[qp].no_wqe_pending_frags[1]);
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);
if(--endpoint->qps[qp].qp->users != 0)
continue;
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&endpoint->qps[qp].qp->pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&endpoint->qps[qp].qp->pending_frags[1]);
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[0]);
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[1]);
if(endpoint->qps[qp].qp->lcl_qp != NULL)
if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
BTL_ERROR(("Failed to destroy QP:%d\n", qp));

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2006-2007 Voltaire All rights reserved.
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -126,8 +126,6 @@ typedef struct mca_btl_openib_qp_t {
struct ibv_qp *lcl_qp;
uint32_t lcl_psn;
int32_t sd_wqe; /**< number of available send wqe entries */
opal_list_t pending_frags[2]; /**< put fragments here if there is no wqe
available */
int users;
opal_mutex_t lock;
} mca_btl_openib_qp_t;
@ -136,6 +134,8 @@ typedef struct mca_btl_openib_endpoint_qp_t {
mca_btl_openib_qp_t *qp;
opal_list_t pending_frags[2]; /**< put fragment here if there is no credits
available */
opal_list_t no_wqe_pending_frags[2]; /**< put fragments here if there is no wqe
available */
int32_t rd_credit_send_lock; /**< Lock credit send fragment */
mca_btl_openib_send_control_frag_t *credit_frag;
size_t ib_inline_max; /**< max size of inline send*/

Просмотреть файл

@ -1,5 +1,5 @@
/*
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
@ -409,7 +409,7 @@ static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
if (qp_init_attr.cap.max_inline_data < req_inline) {
endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
"inline truncated", true, orte_process_info.nodename,
"inline truncated", orte_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev),
req_inline, qp_init_attr.cap.max_inline_data);
} else {
@ -715,8 +715,8 @@ static void xoob_restart_connect(mca_btl_base_endpoint_t *endpoint)
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
/* destroy the qp */
/* the receiver side was already closed, so all pending lists must be clean! */
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[0]));
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[1]));
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[0]));
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[1]));
if(ibv_destroy_qp(endpoint->qps[0].qp->lcl_qp))
BTL_ERROR(("Failed to destroy QP"));
case MCA_BTL_IB_ADDR_CLOSED:
@ -954,13 +954,6 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
ibv_get_device_name(openib_btl->device->ib_dev));
return OMPI_ERR_NOT_SUPPORTED;
}
/* Print warning and switch off coalescing mode (ticket #1693)*/
if (mca_btl_openib_component.use_message_coalescing) {
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
"bug #1693", true, orte_process_info.nodename);
mca_btl_openib_component.use_message_coalescing = 0;
}
*cpc = malloc(sizeof(ompi_btl_openib_connect_base_module_t));
if (NULL == *cpc) {

Просмотреть файл

@ -38,19 +38,3 @@ a smaller inline data value than was requested.
Local device: %s
Requested value: %d
Value used by device: %d
#
[bug #1693]
WARNING: ConnectX XRC support was enabled together with coalescing
mode. This combination is not supported in this version of Open MPI
because it may cause random seg faults (please see Open MPI bug ticket
#1693 https://svn.open-mpi.org/trac/ompi/ticket/1693 for more
details). Message coalescing has been disabled in this job to prevent
catastrophic failure.
You can silence this warning either by disabling XRC (i.e., not specifying
an "X" queue in btl_openib_receive_queues) or disabling message
coalescing by setting the btl_openib_use_message_coalescing MCA
parameter to 0.
Local host: %s
#