btl/openib: fix rdmacm locking bug
This commit fixes a long-standing bug in rdmacm. The thread that calls mca_btl_openib_endpoint_cpc_complete is required to hold the endpoint lock. This was not the case for rdmacm, which caused debug builds to abort. Fixing this also required changing mca_btl_openib_endpoint_send_cts so that the caller must already hold the endpoint lock when calling it. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
Этот коммит содержится в:
родитель
cc2b3e0c3f
Коммит
01d6da31af
@ -496,6 +496,8 @@ static void btl_openib_control(mca_btl_base_module_t* btl,
|
||||
trigger credit management (because the rd_credits will
|
||||
still be negative), and Bad Things will happen. */
|
||||
if (ep->endpoint_posted_recvs) {
|
||||
/* need to hold the lock for both send_cts and connected */
|
||||
OPAL_THREAD_LOCK(&ep->endpoint_lock);
|
||||
if (!ep->endpoint_cts_sent) {
|
||||
mca_btl_openib_endpoint_send_cts(ep);
|
||||
}
|
||||
|
@ -537,13 +537,11 @@ void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint)
|
||||
ctl_hdr->type = MCA_BTL_OPENIB_CONTROL_CTS;
|
||||
|
||||
/* Send the fragment */
|
||||
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
|
||||
if (OPAL_SUCCESS != mca_btl_openib_endpoint_post_send(endpoint, sc_frag)) {
|
||||
BTL_ERROR(("Failed to post CTS send"));
|
||||
mca_btl_openib_endpoint_invoke_error(endpoint);
|
||||
}
|
||||
endpoint->endpoint_cts_sent = true;
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -588,6 +586,9 @@ void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t *endpoint)
|
||||
OPAL_OUTPUT((-1, "cpc_complete to %s -- already got CTS, so marking endpoint as complete",
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
|
||||
mca_btl_openib_endpoint_connected(endpoint);
|
||||
} else {
|
||||
/* the caller holds the lock and expects us to drop it */
|
||||
OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -342,8 +342,11 @@ int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t*,
|
||||
void mca_btl_openib_endpoint_send_credits(mca_btl_base_endpoint_t*, const int);
|
||||
void mca_btl_openib_endpoint_connect_eager_rdma(mca_btl_openib_endpoint_t*);
|
||||
int mca_btl_openib_endpoint_post_recvs(mca_btl_openib_endpoint_t*);
|
||||
|
||||
/* the endpoint lock must be held with OPAL_THREAD_LOCK for both CTS and cpc complete */
|
||||
void mca_btl_openib_endpoint_send_cts(mca_btl_openib_endpoint_t *endpoint);
|
||||
void mca_btl_openib_endpoint_cpc_complete(mca_btl_openib_endpoint_t*);
|
||||
|
||||
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t*);
|
||||
void mca_btl_openib_endpoint_init(mca_btl_openib_module_t*,
|
||||
mca_btl_base_endpoint_t*,
|
||||
|
@ -1246,6 +1246,7 @@ static void *local_endpoint_cpc_complete(void *context)
|
||||
|
||||
OPAL_OUTPUT((-1, "MAIN local_endpoint_cpc_complete to %s",
|
||||
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal)));
|
||||
OPAL_THREAD_LOCK(&endpoint->endpoint_lock);
|
||||
mca_btl_openib_endpoint_cpc_complete(endpoint);
|
||||
|
||||
return NULL;
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user