1
1
This commit attempts to fix a hang in rdmacm endpoint finalization. It fixes
the path where no rdmacm client is found for an endpoint: in that case the
wait for disconnect callbacks is now skipped.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-06-29 11:11:35 -06:00 коммит произвёл Nathan Hjelm
родитель f18d6606da
Коммит 960fcd292c

Просмотреть файл

@@ -1173,7 +1173,7 @@ static void *call_disconnect_callback(int fd, int flags, void *v)
*/ */
static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint) static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
{ {
rdmacm_contents_t *contents; rdmacm_contents_t *contents = NULL, *item;
opal_event_t event; opal_event_t event;
BTL_VERBOSE(("Start disconnecting...")); BTL_VERBOSE(("Start disconnecting..."));
@@ -1193,8 +1193,9 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
* main thread and service thread. * main thread and service thread.
*/ */
opal_mutex_lock(&client_list_lock); opal_mutex_lock(&client_list_lock);
OPAL_LIST_FOREACH(contents, &client_list, rdmacm_contents_t) { OPAL_LIST_FOREACH(item, &client_list, rdmacm_contents_t) {
if (endpoint == contents->endpoint) { if (endpoint == item->endpoint) {
contents = item;
opal_list_remove_item(&client_list, (opal_list_item_t *) contents); opal_list_remove_item(&client_list, (opal_list_item_t *) contents);
contents->on_client_list = false; contents->on_client_list = false;
@@ -1223,12 +1224,14 @@ static int rdmacm_endpoint_finalize(struct mca_btl_base_endpoint_t *endpoint)
opal_atomic_wmb(); opal_atomic_wmb();
opal_mutex_unlock(&client_list_lock); opal_mutex_unlock(&client_list_lock);
if (NULL != contents) {
/* Now wait for all the disconnect callbacks to occur */ /* Now wait for all the disconnect callbacks to occur */
pthread_mutex_lock(&rdmacm_disconnect_lock); pthread_mutex_lock(&rdmacm_disconnect_lock);
while (opal_list_get_size (&contents->ids)) { while (opal_list_get_size (&contents->ids)) {
pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock); pthread_cond_wait (&rdmacm_disconnect_cond, &rdmacm_disconnect_lock);
} }
pthread_mutex_unlock(&rdmacm_disconnect_lock); pthread_mutex_unlock(&rdmacm_disconnect_lock);
}
OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing")); OPAL_OUTPUT((-1, "MAIN Endpoint finished finalizing"));
return OPAL_SUCCESS; return OPAL_SUCCESS;