The ib_procs list in the openib BTL is accessed without holding ib_lock in some code paths, which causes races when running multithreaded. This patch protects those accesses to the ib_procs list with ib_lock.
Fixes trac:2149. cmr:v1.4. This commit was SVN r22682. The following Trac tickets were found above: Ticket 2149 --> https://svn.open-mpi.org/trac/ompi/ticket/2149
Этот коммит содержится в:
родитель
6828122069
Коммит
322e73d8c4
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved.
|
||||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||||
|
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -1816,6 +1817,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
|||||||
/* JMS: optimization target -- can we send something in private
|
/* JMS: optimization target -- can we send something in private
|
||||||
data to find the proc directly instead of having to search
|
data to find the proc directly instead of having to search
|
||||||
through *all* procs? */
|
through *all* procs? */
|
||||||
|
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||||
for (found = false, ib_proc = (mca_btl_openib_proc_t*)
|
for (found = false, ib_proc = (mca_btl_openib_proc_t*)
|
||||||
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
||||||
!found &&
|
!found &&
|
||||||
@ -1850,6 +1852,7 @@ static int request_received(ibcm_listen_cm_id_t *cmh,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||||
if (!found) {
|
if (!found) {
|
||||||
BTL_VERBOSE(("could not find match for calling endpoint!"));
|
BTL_VERBOSE(("could not find match for calling endpoint!"));
|
||||||
rc = OMPI_ERR_NOT_FOUND;
|
rc = OMPI_ERR_NOT_FOUND;
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
* Copyright (c) 2006 Los Alamos National Security, LLC. All rights
|
||||||
* reserved.
|
* reserved.
|
||||||
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
|
||||||
|
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -728,6 +729,9 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|||||||
master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
|
master = orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME,
|
||||||
process_name) > 0 ? true : false;
|
process_name) > 0 ? true : false;
|
||||||
|
|
||||||
|
/* Need to protect the ib_procs list */
|
||||||
|
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||||
|
|
||||||
for (ib_proc = (mca_btl_openib_proc_t*)
|
for (ib_proc = (mca_btl_openib_proc_t*)
|
||||||
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
||||||
ib_proc != (mca_btl_openib_proc_t*)
|
ib_proc != (mca_btl_openib_proc_t*)
|
||||||
@ -780,6 +784,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|||||||
just ignore this connection request */
|
just ignore this connection request */
|
||||||
if (found && !master &&
|
if (found && !master &&
|
||||||
MCA_BTL_IB_CLOSED != ib_endpoint->endpoint_state) {
|
MCA_BTL_IB_CLOSED != ib_endpoint->endpoint_state) {
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -787,6 +792,7 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|||||||
if (!found) {
|
if (!found) {
|
||||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||||
mca_btl_openib_endpoint_invoke_error(NULL);
|
mca_btl_openib_endpoint_invoke_error(NULL);
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -869,4 +875,5 @@ static void rml_recv_cb(int status, orte_process_name_t* process_name,
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
* Copyright (c) 2007-2009 Mellanox Technologies. All rights reserved.
|
||||||
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
|
||||||
|
* Copyright (c) 2009 IBM Corporation. All rights reserved.
|
||||||
*
|
*
|
||||||
* $COPYRIGHT$
|
* $COPYRIGHT$
|
||||||
*
|
*
|
||||||
@ -658,6 +659,7 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
|
|||||||
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
|
"jobid %d, vpid %d, sid %" PRIx64 ", lid %d",
|
||||||
process_name->jobid, process_name->vpid, subnet_id, lid));
|
process_name->jobid, process_name->vpid, subnet_id, lid));
|
||||||
/* find ibproc */
|
/* find ibproc */
|
||||||
|
OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
|
||||||
for (ib_proc = (mca_btl_openib_proc_t*)
|
for (ib_proc = (mca_btl_openib_proc_t*)
|
||||||
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
opal_list_get_first(&mca_btl_openib_component.ib_procs);
|
||||||
ib_proc != (mca_btl_openib_proc_t*)
|
ib_proc != (mca_btl_openib_proc_t*)
|
||||||
@ -696,6 +698,7 @@ static mca_btl_openib_endpoint_t* xoob_find_endpoint(orte_process_name_t* proces
|
|||||||
} else {
|
} else {
|
||||||
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
BTL_ERROR(("can't find suitable endpoint for this peer\n"));
|
||||||
}
|
}
|
||||||
|
OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
|
||||||
return ib_endpoint;
|
return ib_endpoint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user