btl/openib: fix rdma hang
This commit attempts to fix a hang in rdmacm endpoint finalize. It fixes the path where no rdmacm client is found for an endpoint: in that case there are no disconnect callbacks to wait for, so the wait is now skipped.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
f18d6606da
Коммит
960fcd292c
@ -1173,7 +1173,7 @@ static void *call_disconnect_callback(int fd, int flags, void *v)
|
|||||||
*/
|
*/
|
||||||
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
||||||
{
|
{
|
||||||
rdmacm_contents_t *contents;
|
rdmacm_contents_t *contents = NULL, *item;
|
||||||
opal_event_t event;
|
opal_event_t event;
|
||||||
|
|
||||||
BTL_VERBOSE(("Start disconnecting..."));
|
BTL_VERBOSE(("Start disconnecting..."));
|
||||||
@ -1193,8 +1193,9 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
|||||||
* main thread and service thread.
|
* main thread and service thread.
|
||||||
*/
|
*/
|
||||||
opal_mutex_lock(&client_list_lock);
|
opal_mutex_lock(&client_list_lock);
|
||||||
OPAL_LIST_FOREACH(contents, &client_list, rdmacm_contents_t) {
|
OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
|
||||||
if (endpoint == contents->endpoint) {
|
if (endpoint == item->endpoint) {
|
||||||
|
contents = item;
|
||||||
opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
|
opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
|
||||||
contents->on_client_list = false;
|
contents->on_client_list = false;
|
||||||
|
|
||||||
@ -1223,12 +1224,14 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
|
|||||||
opal_atomic_wmb();
|
opal_atomic_wmb();
|
||||||
opal_mutex_unlock(&client_list_lock);
|
opal_mutex_unlock(&client_list_lock);
|
||||||
|
|
||||||
|
if (NULL != contents) {
|
||||||
/* Now wait for all the disconnect callbacks to occur */
|
/* Now wait for all the disconnect callbacks to occur */
|
||||||
pthread_mutex_lock(&rdmacm_disconnect_lock);
|
pthread_mutex_lock(&rdmacm_disconnect_lock);
|
||||||
while (opal_list_get_size (&contents->ids)) {
|
while (opal_list_get_size (&contents->ids)) {
|
||||||
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
|
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&rdmacm_disconnect_lock);
|
pthread_mutex_unlock(&rdmacm_disconnect_lock);
|
||||||
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
|
OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user