1
1

usnic: fix endpoint destruction on the trunk

Fixes an assertion failure in --enable-debug builds and SEGVs in normal
builds.

I'm not 100% sure I like this model, but it at least seems to be
consistent.  Some variation on this scheme will need to be adapted to
the trunk, where usnic_del_procs() is called by the PML instead of
internally in usnic_finalize().

A related bug (but with different mechanics) is #4832.

This commit was SVN r32424.
Этот коммит содержится в:
Dave Goodell 2014-08-04 21:30:21 +00:00
родитель 490c484f8c
Коммит 13b104bdef
5 изменённых файлов: 21 добавлений и 12 удалений

Просмотреть файл

@ -116,12 +116,6 @@ static void endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
so it should be safe to unconditionally destruct the ack_li */ so it should be safe to unconditionally destruct the ack_li */
OBJ_DESTRUCT(&(endpoint->endpoint_ack_li)); OBJ_DESTRUCT(&(endpoint->endpoint_ack_li));
/* Remove this endpoint from module->all_endpoints list, then
destruct the list_item_t */
opal_mutex_lock(&endpoint->endpoint_module->all_endpoints_lock);
opal_list_remove_item(&endpoint->endpoint_module->all_endpoints,
&endpoint->endpoint_endpoint_li);
opal_mutex_unlock(&endpoint->endpoint_module->all_endpoints_lock);
OBJ_DESTRUCT(&(endpoint->endpoint_endpoint_li)); OBJ_DESTRUCT(&(endpoint->endpoint_endpoint_li));
if (endpoint->endpoint_hotel.rooms != NULL) { if (endpoint->endpoint_hotel.rooms != NULL) {
@ -179,3 +173,12 @@ opal_btl_usnic_flush_endpoint(
/* Now, ACK everything that is pending */ /* Now, ACK everything that is pending */
opal_btl_usnic_handle_ack(endpoint, endpoint->endpoint_next_seq_to_send-1); opal_btl_usnic_handle_ack(endpoint, endpoint->endpoint_next_seq_to_send-1);
} }
/*
 * Unlink an endpoint from its module's all_endpoints list and then drop
 * one reference on it.
 *
 * module:   module whose all_endpoints list (guarded by
 *           all_endpoints_lock) the endpoint is removed from.
 * endpoint: endpoint to unlink (via its endpoint_endpoint_li list item)
 *           and release.
 *
 * NOTE(review): this assumes the endpoint is currently linked on
 * module->all_endpoints -- confirm for every caller.
 * NOTE(review): the release may destroy the endpoint, so callers
 * presumably must not touch it afterwards -- confirm against OBJ_RELEASE.
 */
void opal_btl_usnic_release_endpoint(opal_btl_usnic_module_t *module,
opal_btl_usnic_endpoint_t *endpoint)
{
/* The list is only modified while all_endpoints_lock is held. */
opal_mutex_lock(&module->all_endpoints_lock);
opal_list_remove_item(&module->all_endpoints, &endpoint->endpoint_endpoint_li);
opal_mutex_unlock(&module->all_endpoints_lock);
/* Release only after the item is unlinked and the lock is dropped. */
OBJ_RELEASE(endpoint);
}

Просмотреть файл

@ -186,5 +186,11 @@ void
opal_btl_usnic_flush_endpoint( opal_btl_usnic_flush_endpoint(
opal_btl_usnic_endpoint_t *endpoint); opal_btl_usnic_endpoint_t *endpoint);
/* Release the given endpoint and remove it from the all_endpoints list. The
 * reference that is released was logically held by the all_endpoints list
 * (this is not a generic release function).
 *
 * module:   the module whose all_endpoints list the endpoint is removed
 *           from; the removal is done under that module's
 *           all_endpoints_lock.
 * endpoint: the endpoint to unlink and release.
 *
 * NOTE(review): callers presumably must not use the endpoint after this
 * call, since the released reference may have been the last one -- confirm. */
void opal_btl_usnic_release_endpoint(struct opal_btl_usnic_module_t *module,
opal_btl_usnic_endpoint_t *endpoint);
END_C_DECLS END_C_DECLS
#endif #endif

Просмотреть файл

@ -263,7 +263,7 @@ static int add_procs_create_ahs(opal_btl_usnic_module_t *module,
EHOSTUNREACH == errno) { EHOSTUNREACH == errno) {
add_procs_warn_ah_fail(module, endpoints[i]); add_procs_warn_ah_fail(module, endpoints[i]);
OBJ_RELEASE(endpoints[i]); opal_btl_usnic_release_endpoint(module, endpoints[i]);
endpoints[i] = NULL; endpoints[i] = NULL;
--num_ah_left; --num_ah_left;
} }
@ -317,7 +317,7 @@ static int add_procs_create_ahs(opal_btl_usnic_module_t *module,
if (NULL != endpoints[i]) { if (NULL != endpoints[i]) {
if (OPAL_SUCCESS != ret || if (OPAL_SUCCESS != ret ||
NULL == endpoints[i]->endpoint_remote_ah) { NULL == endpoints[i]->endpoint_remote_ah) {
OBJ_RELEASE(endpoints[i]); opal_btl_usnic_release_endpoint(module, endpoints[i]);
endpoints[i] = NULL; endpoints[i] = NULL;
} else { } else {
++num_created; ++num_created;
@ -402,7 +402,7 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
reachable. */ reachable. */
for (size_t i = 0; i < nprocs; ++i) { for (size_t i = 0; i < nprocs; ++i) {
if (NULL != endpoints[i]) { if (NULL != endpoints[i]) {
OBJ_RELEASE(endpoints[i]); opal_btl_usnic_release_endpoint(module, endpoints[i]);
endpoints[i] = NULL; endpoints[i] = NULL;
} }
} }
@ -451,7 +451,7 @@ static int usnic_del_procs(struct mca_btl_base_module_t *base_module,
} }
/* We're all done with this endpoint */ /* We're all done with this endpoint */
OBJ_RELEASE(endpoint); opal_btl_usnic_release_endpoint(module, endpoint);
break; /* done once we found match */ break; /* done once we found match */
} }

Просмотреть файл

@ -362,7 +362,7 @@ void opal_btl_usnic_recv_call(opal_btl_usnic_module_t *module,
/* if endpoint exiting, and all ACKs received, release the endpoint */ /* if endpoint exiting, and all ACKs received, release the endpoint */
if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) { if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) {
OBJ_RELEASE(endpoint); opal_btl_usnic_release_endpoint(module, endpoint);
} }
repost_no_endpoint: repost_no_endpoint:
++module->stats.num_recv_reposts; ++module->stats.num_recv_reposts;

Просмотреть файл

@ -334,7 +334,7 @@ opal_btl_usnic_recv_frag_bookkeeping(
repost: repost:
/* if endpoint exiting, and all ACKs received, release the endpoint */ /* if endpoint exiting, and all ACKs received, release the endpoint */
if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) { if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) {
OBJ_RELEASE(endpoint); opal_btl_usnic_release_endpoint(module, endpoint);
} }
++module->stats.num_recv_reposts; ++module->stats.num_recv_reposts;