1
1

Merge pull request #1837 from hjelmn/rdmacm_fix

btl/openib: fix rdmacm locking bug
Этот коммит содержится в:
Nathan Hjelm 2016-06-30 16:55:18 -06:00 коммит произвёл GitHub
родитель cc2b3e0c3f 01d6da31af
Коммит 2cf0e5d7cc
4 изменённых файлов: 9 добавлений и 2 удалений

Просмотреть файл

@ -496,6 +496,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
trigger credit management (because the rd_credits will
still be negative), and Bad Things will happen. */
if (ep->endpoint_posted_recvs) {
/* need to hold the lock for both send_cts and connected */
OPAL_THREAD_LOCK(&ep->endpoint_lock);
if (!ep->endpoint_cts_sent) {
mca_btl_openib_endpoint_send_cts(ep);
}

Просмотреть файл

@ -537,13 +537,11 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
/* Send the fragment */
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
BTL_ERROR(("Failed to post CTS send"));
mca_btl_openib_endpoint_invoke_error(endpoint);
}
endpoint->endpoint_cts_sent = true;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
/*
@ -588,6 +586,9 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
mca_btl_openib_endpoint_connected(endpoint);
} else {
/* the caller holds the lock and expects us to drop it */
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
}

Просмотреть файл

@ -342,8 +342,11 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*);
/* the caller must hold the endpoint lock (taken with OPAL_THREAD_LOCK) when calling either send_cts or cpc_complete */
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint);
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*);
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*);
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*,
mca_btl_base_endpoint_t*,

Просмотреть файл

@ -1246,6 +1246,7 @@ static void *local_endpoint_cpc_complete(void *context)
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
mca_btl_openib_endpoint_cpc_complete(endpoint);
return NULL;