1
1

btl/tcp: fix add_procs race condition

This commit fixes a race between a thread calling the tcp btl's
add_procs and a thread processing an incomming connection. The race
occured because the add_procs thread adds a newly created proc object
to the hash table *before* the object is fully initialized. The
connection thread then attempts to use the object before the endpoints
array on the object has beeen allocation. The fix is to only add the
proc to the hash table after it has been completely initialized.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-04-27 10:21:59 -06:00
родитель 01c90d4e71
Коммит 03f4a854cb

Просмотреть файл

@ -14,7 +14,7 @@
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
@ -122,16 +122,16 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
return btl_proc;
}
do {
btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
if(NULL == btl_proc)
return NULL;
btl_proc->proc_opal = proc;
OBJ_RETAIN(btl_proc->proc_opal);
if(NULL == btl_proc) {
rc = OPAL_ERR_OUT_OF_RESOURCE;
break;
}
/* add to hash table of all proc instance */
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
proc->proc_name, btl_proc);
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
btl_proc->proc_opal = proc;
OBJ_RETAIN(btl_proc->proc_opal);
/* lookup tcp parameters exported by this proc */
OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
@ -139,14 +139,16 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
if(rc != OPAL_SUCCESS) {
if(OPAL_ERR_NOT_FOUND != rc)
BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
OBJ_RELEASE(btl_proc);
return NULL;
break;
}
if(0 != (size % sizeof(mca_btl_tcp_addr_t))) {
BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
(unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t)));
return NULL;
rc = OPAL_ERROR;
break;
}
btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t);
/* allocate space for endpoint array - one for each exported address */
@ -154,20 +156,19 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
malloc((1 + btl_proc->proc_addr_count) *
sizeof(mca_btl_base_endpoint_t*));
if(NULL == btl_proc->proc_endpoints) {
OBJ_RELEASE(btl_proc);
return NULL;
rc = OPAL_ERR_OUT_OF_RESOURCE;
break;
}
if(NULL == mca_btl_tcp_component.tcp_local && (proc == opal_proc_local_get())) {
mca_btl_tcp_component.tcp_local = btl_proc;
}
{
/* convert the OPAL addr_family field to OS constants,
* so we can check for AF_INET (or AF_INET6) and don't have
* to deal with byte ordering anymore.
*/
unsigned int i;
for (i = 0; i < btl_proc->proc_addr_count; i++) {
for (unsigned int i = 0; i < btl_proc->proc_addr_count; i++) {
if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
btl_proc->proc_addrs[i].addr_family = AF_INET;
}
@ -177,7 +178,21 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
}
#endif
}
} while (0);
if (OPAL_SUCCESS == rc) {
/* add to hash table of all proc instance. */
opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
proc->proc_name, btl_proc);
} else {
if (btl_proc) {
OBJ_RELEASE(btl_proc);
btl_proc = NULL;
}
}
OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
return btl_proc;
}