1
1

btl/openib: fix rdmacm locking bug

This commit fixes a long-standing bug in rdmacm. The thread that calls
mca_btl_openib_endpoint_cpc_complete is required to hold the endpoint
lock. This was not the case for rdmacm, which caused debug builds to
abort. This change also required changing
mca_btl_openib_endpoint_send_cts so that the caller must hold the
endpoint lock when calling it.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
Nathan Hjelm 2016-06-30 12:21:47 -06:00
родитель cc2b3e0c3f
Коммит 01d6da31af
4 изменённых файлов: 9 добавлений и 2 удалений

Просмотреть файл

@ -496,6 +496,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
trigger credit management (because the rd_credits will
still be negative), and Bad Things will happen. */
if (ep->endpoint_posted_recvs) {
/* need to hold the lock for both send_cts and connected */
OPAL_THREAD_LOCK(&ep->endpoint_lock);
if (!ep->endpoint_cts_sent) {
mca_btl_openib_endpoint_send_cts(ep);
}

Просмотреть файл

@ -537,13 +537,11 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
/* Send the fragment */
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
BTL_ERROR(("Failed to post CTS send"));
mca_btl_openib_endpoint_invoke_error(endpoint);
}
endpoint->endpoint_cts_sent = true;
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
/*
@ -588,6 +586,9 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
mca_btl_openib_endpoint_connected(endpoint);
} else {
/* the caller holds the lock and expects us to drop it */
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
}
}

Просмотреть файл

@ -342,8 +342,11 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*);
/* the endpoint lock must be held with OPAL_THREAD_LOCK for both CTS and cpc complete */
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint);
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*);
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*);
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*,
mca_btl_base_endpoint_t*,

Просмотреть файл

@ -1246,6 +1246,7 @@ static void *local_endpoint_cpc_complete(void *context)
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
mca_btl_openib_endpoint_cpc_complete(endpoint);
return NULL;