1
1

coll/libnbc: fix race condition with multi-threaded apps

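Protect the mca_coll_libnbc_component.active_requests list with
the new mca_coll_libnbc_component.lock mutex: NBC_Start() appends to
the list while ompi_coll_libnbc_progress() iterates over it and
removes completed requests.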

Thanks to Jie Hu for the report.

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Этот коммит содержится в:
Gilles Gouaillardet 2016-11-21 10:21:47 +09:00
родитель 19bdd1d626
Коммит 2c94a3a6f3
3 изменённых файла: 12 добавлений и 1 удаление

Просмотреть файл

@ -13,7 +13,7 @@
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
* reserved. * reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science * Copyright (c) 2014-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved. * and Technology (RIST). All rights reserved.
* Copyright (c) 2016 IBM Corporation. All rights reserved. * Copyright (c) 2016 IBM Corporation. All rights reserved.
* $COPYRIGHT$ * $COPYRIGHT$
@ -76,6 +76,7 @@ struct ompi_coll_libnbc_component_t {
opal_list_t active_requests; opal_list_t active_requests;
int32_t active_comms; int32_t active_comms;
opal_atomic_lock_t progress_lock; opal_atomic_lock_t progress_lock;
opal_mutex_t lock;
}; };
typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t; typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t;

Просмотреть файл

@ -91,6 +91,7 @@ libnbc_open(void)
OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, opal_free_list_t); OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, opal_free_list_t);
OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t); OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t);
OBJ_CONSTRUCT(&mca_coll_libnbc_component.lock, opal_mutex_t);
ret = opal_free_list_init (&mca_coll_libnbc_component.requests, ret = opal_free_list_init (&mca_coll_libnbc_component.requests,
sizeof(ompi_coll_libnbc_request_t), 8, sizeof(ompi_coll_libnbc_request_t), 8,
OBJ_CLASS(ompi_coll_libnbc_request_t), OBJ_CLASS(ompi_coll_libnbc_request_t),
@ -115,6 +116,7 @@ libnbc_close(void)
OBJ_DESTRUCT(&mca_coll_libnbc_component.requests); OBJ_DESTRUCT(&mca_coll_libnbc_component.requests);
OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests); OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests);
OBJ_DESTRUCT(&mca_coll_libnbc_component.lock);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }
@ -263,13 +265,17 @@ ompi_coll_libnbc_progress(void)
if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0; if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests, OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
ompi_coll_libnbc_request_t) { ompi_coll_libnbc_request_t) {
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
res = NBC_Progress(request); res = NBC_Progress(request);
if( NBC_CONTINUE != res ) { if( NBC_CONTINUE != res ) {
/* done, remove and complete */ /* done, remove and complete */
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
opal_list_remove_item(&mca_coll_libnbc_component.active_requests, opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
&request->super.super.super); &request->super.super.super);
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) { if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) {
request->super.req_status.MPI_ERROR = OMPI_SUCCESS; request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
@ -281,7 +287,9 @@ ompi_coll_libnbc_progress(void)
ompi_request_complete(&request->super, true); ompi_request_complete(&request->super, true);
OPAL_THREAD_UNLOCK(&ompi_request_lock); OPAL_THREAD_UNLOCK(&ompi_request_lock);
} }
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
} }
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock); opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock);

Просмотреть файл

@ -702,7 +702,9 @@ int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) {
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res; return res;
} }
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super)); opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super));
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
return OMPI_SUCCESS; return OMPI_SUCCESS;
} }