Add the real fix for ticket #1693: segfault when XRC is used together with message coalescing.
This commit was SVN r20214.
This commit is contained in:
parent
5e1d2eec58
commit
2f7b66160b
@ -11,7 +11,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
@ -638,7 +638,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
|
|||||||
if(mca_btl_openib_component.use_message_coalescing &&
|
if(mca_btl_openib_component.use_message_coalescing &&
|
||||||
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
|
(flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) {
|
||||||
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
|
||||||
sfrag = check_coalescing(&ep->qps[qp].qp->pending_frags[prio],
|
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||||
&ep->qps[qp].qp->lock, ep, size);
|
&ep->qps[qp].qp->lock, ep, size);
|
||||||
|
|
||||||
if(NULL == sfrag) {
|
if(NULL == sfrag) {
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
* Copyright (c) 2004-2005 The Regents of the University of California.
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2006-2008 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2006-2009 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
@ -2733,14 +2733,12 @@ progress_pending_frags_wqe(mca_btl_base_endpoint_t *ep, const int qpn)
|
|||||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||||
for(i = 0; i < 2; i++) {
|
for(i = 0; i < 2; i++) {
|
||||||
while(qp->sd_wqe > 0) {
|
while(qp->sd_wqe > 0) {
|
||||||
mca_btl_base_endpoint_t *ep;
|
mca_btl_base_endpoint_t *tmp_ep;
|
||||||
OPAL_THREAD_LOCK(&qp->lock);
|
frag = opal_list_remove_first(&ep->qps[qpn].no_wqe_pending_frags[i]);
|
||||||
frag = opal_list_remove_first(&qp->pending_frags[i]);
|
|
||||||
OPAL_THREAD_UNLOCK(&qp->lock);
|
|
||||||
if(NULL == frag)
|
if(NULL == frag)
|
||||||
break;
|
break;
|
||||||
ep = to_com_frag(frag)->endpoint;
|
tmp_ep = to_com_frag(frag)->endpoint;
|
||||||
mca_btl_openib_endpoint_post_send(ep, to_send_frag(frag));
|
mca_btl_openib_endpoint_post_send(tmp_ep, to_send_frag(frag));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
OPAL_THREAD_UNLOCK(&ep->endpoint_lock);
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
* Copyright (c) 2006-2008 Mellanox Technologies, Inc. All rights reserved.
|
* Copyright (c) 2006-2009 Mellanox Technologies, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -120,10 +120,8 @@ static inline int acruire_wqe(mca_btl_openib_endpoint_t *ep,
|
|||||||
|
|
||||||
if(qp_get_wqe(ep, qp) < 0) {
|
if(qp_get_wqe(ep, qp) < 0) {
|
||||||
qp_put_wqe(ep, qp);
|
qp_put_wqe(ep, qp);
|
||||||
OPAL_THREAD_LOCK(&ep->qps[qp].qp->lock);
|
opal_list_append(&ep->qps[qp].no_wqe_pending_frags[prio],
|
||||||
opal_list_append(&ep->qps[qp].qp->pending_frags[prio],
|
|
||||||
(opal_list_item_t *)frag);
|
(opal_list_item_t *)frag);
|
||||||
OPAL_THREAD_UNLOCK(&ep->qps[qp].qp->lock);
|
|
||||||
return OMPI_ERR_OUT_OF_RESOURCE;
|
return OMPI_ERR_OUT_OF_RESOURCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -281,8 +279,6 @@ static mca_btl_openib_qp_t *endpoint_alloc_qp(void)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&qp->pending_frags[0], opal_list_t);
|
|
||||||
OBJ_CONSTRUCT(&qp->pending_frags[1], opal_list_t);
|
|
||||||
OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
|
OBJ_CONSTRUCT(&qp->lock, opal_mutex_t);
|
||||||
|
|
||||||
return qp;
|
return qp;
|
||||||
@ -347,8 +343,12 @@ static void endpoint_init_qp(mca_btl_base_endpoint_t *ep, const int qp)
|
|||||||
ep_qp->rd_credit_send_lock = 0;
|
ep_qp->rd_credit_send_lock = 0;
|
||||||
ep_qp->credit_frag = NULL;
|
ep_qp->credit_frag = NULL;
|
||||||
|
|
||||||
|
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[0], opal_list_t);
|
||||||
|
OBJ_CONSTRUCT(&ep_qp->no_wqe_pending_frags[1], opal_list_t);
|
||||||
|
|
||||||
OBJ_CONSTRUCT(&ep_qp->pending_frags[0], opal_list_t);
|
OBJ_CONSTRUCT(&ep_qp->pending_frags[0], opal_list_t);
|
||||||
OBJ_CONSTRUCT(&ep_qp->pending_frags[1], opal_list_t);
|
OBJ_CONSTRUCT(&ep_qp->pending_frags[1], opal_list_t);
|
||||||
|
|
||||||
switch(BTL_OPENIB_QP_TYPE(qp)) {
|
switch(BTL_OPENIB_QP_TYPE(qp)) {
|
||||||
case MCA_BTL_OPENIB_PP_QP:
|
case MCA_BTL_OPENIB_PP_QP:
|
||||||
endpoint_init_qp_pp(ep_qp, qp);
|
endpoint_init_qp_pp(ep_qp, qp);
|
||||||
@ -500,16 +500,17 @@ static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
|
|||||||
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
|
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[0]);
|
||||||
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]);
|
OBJ_DESTRUCT(&endpoint->qps[qp].pending_frags[1]);
|
||||||
|
|
||||||
|
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||||
|
&endpoint->qps[qp].no_wqe_pending_frags[0]);
|
||||||
|
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
||||||
|
&endpoint->qps[qp].no_wqe_pending_frags[1]);
|
||||||
|
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
|
||||||
|
OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);
|
||||||
|
|
||||||
|
|
||||||
if(--endpoint->qps[qp].qp->users != 0)
|
if(--endpoint->qps[qp].qp->users != 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
|
||||||
&endpoint->qps[qp].qp->pending_frags[0]);
|
|
||||||
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
|
|
||||||
&endpoint->qps[qp].qp->pending_frags[1]);
|
|
||||||
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[0]);
|
|
||||||
OBJ_DESTRUCT(&endpoint->qps[qp].qp->pending_frags[1]);
|
|
||||||
|
|
||||||
if(endpoint->qps[qp].qp->lcl_qp != NULL)
|
if(endpoint->qps[qp].qp->lcl_qp != NULL)
|
||||||
if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
|
if(ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
|
||||||
BTL_ERROR(("Failed to destroy QP:%d\n", qp));
|
BTL_ERROR(("Failed to destroy QP:%d\n", qp));
|
||||||
|
@ -13,7 +13,7 @@
|
|||||||
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006-2007 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
* Copyright (c) 2006-2007 Voltaire All rights reserved.
|
||||||
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
* Additional copyrights may follow
|
* Additional copyrights may follow
|
||||||
@ -126,8 +126,6 @@ typedef struct mca_btl_openib_qp_t {
|
|||||||
struct ibv_qp *lcl_qp;
|
struct ibv_qp *lcl_qp;
|
||||||
uint32_t lcl_psn;
|
uint32_t lcl_psn;
|
||||||
int32_t sd_wqe; /**< number of available send wqe entries */
|
int32_t sd_wqe; /**< number of available send wqe entries */
|
||||||
opal_list_t pending_frags[2]; /**< put fragments here if there is no wqe
|
|
||||||
available */
|
|
||||||
int users;
|
int users;
|
||||||
opal_mutex_t lock;
|
opal_mutex_t lock;
|
||||||
} mca_btl_openib_qp_t;
|
} mca_btl_openib_qp_t;
|
||||||
@ -136,6 +134,8 @@ typedef struct mca_btl_openib_endpoint_qp_t {
|
|||||||
mca_btl_openib_qp_t *qp;
|
mca_btl_openib_qp_t *qp;
|
||||||
opal_list_t pending_frags[2]; /**< put fragment here if there is no credits
|
opal_list_t pending_frags[2]; /**< put fragment here if there is no credits
|
||||||
available */
|
available */
|
||||||
|
opal_list_t no_wqe_pending_frags[2]; /**< put fragments here if there is no wqe
|
||||||
|
available */
|
||||||
int32_t rd_credit_send_lock; /**< Lock credit send fragment */
|
int32_t rd_credit_send_lock; /**< Lock credit send fragment */
|
||||||
mca_btl_openib_send_control_frag_t *credit_frag;
|
mca_btl_openib_send_control_frag_t *credit_frag;
|
||||||
size_t ib_inline_max; /**< max size of inline send*/
|
size_t ib_inline_max; /**< max size of inline send*/
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2008 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
@ -409,7 +409,7 @@ static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
|
|||||||
if (qp_init_attr.cap.max_inline_data < req_inline) {
|
if (qp_init_attr.cap.max_inline_data < req_inline) {
|
||||||
endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
|
endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
|
||||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
||||||
"inline truncated", true, orte_process_info.nodename,
|
"inline truncated", orte_process_info.nodename,
|
||||||
ibv_get_device_name(openib_btl->device->ib_dev),
|
ibv_get_device_name(openib_btl->device->ib_dev),
|
||||||
req_inline, qp_init_attr.cap.max_inline_data);
|
req_inline, qp_init_attr.cap.max_inline_data);
|
||||||
} else {
|
} else {
|
||||||
@ -715,8 +715,8 @@ static void xoob_restart_connect(mca_btl_base_endpoint_t *endpoint)
|
|||||||
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
|
endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CLOSED;
|
||||||
/* destroy the qp */
|
/* destroy the qp */
|
||||||
/* the receiver side was already closed, so all pending lists must be empty! */
|
/* the receiver side was already closed, so all pending lists must be empty! */
|
||||||
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[0]));
|
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[0]));
|
||||||
assert (opal_list_is_empty(&endpoint->ib_addr->qp->pending_frags[1]));
|
assert (opal_list_is_empty(&endpoint->qps->no_wqe_pending_frags[1]));
|
||||||
if(ibv_destroy_qp(endpoint->qps[0].qp->lcl_qp))
|
if(ibv_destroy_qp(endpoint->qps[0].qp->lcl_qp))
|
||||||
BTL_ERROR(("Failed to destroy QP"));
|
BTL_ERROR(("Failed to destroy QP"));
|
||||||
case MCA_BTL_IB_ADDR_CLOSED:
|
case MCA_BTL_IB_ADDR_CLOSED:
|
||||||
@ -955,13 +955,6 @@ static int xoob_component_query(mca_btl_openib_module_t *openib_btl,
|
|||||||
return OMPI_ERR_NOT_SUPPORTED;
|
return OMPI_ERR_NOT_SUPPORTED;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Print warning and switch off coalescing mode (ticket #1693)*/
|
|
||||||
if (mca_btl_openib_component.use_message_coalescing) {
|
|
||||||
orte_show_help("help-mpi-btl-openib-cpc-base.txt",
|
|
||||||
"bug #1693", true, orte_process_info.nodename);
|
|
||||||
mca_btl_openib_component.use_message_coalescing = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
*cpc = malloc(sizeof(ompi_btl_openib_connect_base_module_t));
|
*cpc = malloc(sizeof(ompi_btl_openib_connect_base_module_t));
|
||||||
if (NULL == *cpc) {
|
if (NULL == *cpc) {
|
||||||
opal_output_verbose(5, mca_btl_base_output,
|
opal_output_verbose(5, mca_btl_base_output,
|
||||||
|
@ -38,19 +38,3 @@ a smaller inline data value than was requested.
|
|||||||
Local device: %s
|
Local device: %s
|
||||||
Requested value: %d
|
Requested value: %d
|
||||||
Value used by device: %d
|
Value used by device: %d
|
||||||
#
|
|
||||||
[bug #1693]
|
|
||||||
WARNING: ConnectX XRC support was enabled together with coalescing
|
|
||||||
mode. This combination is not supported in this version of Open MPI
|
|
||||||
because it may cause random seg faults (please see Open MPI bug ticket
|
|
||||||
#1693 https://svn.open-mpi.org/trac/ompi/ticket/1693 for more
|
|
||||||
details). Message coalescing has been disabled in this job to prevent
|
|
||||||
catastrophic failure.
|
|
||||||
|
|
||||||
You can silence this warning either by disabling XRC (i.e., not specifying
|
|
||||||
an "X" queue in btl_openib_receive_queues) or disabling message
|
|
||||||
coalescing by setting the btl_openib_use_message_coalescing MCA
|
|
||||||
parameter to 0.
|
|
||||||
|
|
||||||
Local host: %s
|
|
||||||
#
|
|
||||||
|
Loading…
x
Reference in new issue
Block a user