From 13b104bdef3f08019990f1aa0b9f2e52536eac9e Mon Sep 17 00:00:00 2001 From: Dave Goodell Date: Mon, 4 Aug 2014 21:30:21 +0000 Subject: [PATCH] usnic: fix endpoint destruction on the trunk Fixes an assertion failure in --enable-debug builds and SEGVs in normal builds. I'm not 100% sure I like this model, but it at least seems to be consistent. Some variation on this scheme will need to be adapted to the trunk, where usnic_del_procs() is called by the PML instead of internally in usnic_finalize(). A related bug (but with different mechanics) is #4832. This commit was SVN r32424. --- opal/mca/btl/usnic/btl_usnic_endpoint.c | 15 +++++++++------ opal/mca/btl/usnic/btl_usnic_endpoint.h | 6 ++++++ opal/mca/btl/usnic/btl_usnic_module.c | 8 ++++---- opal/mca/btl/usnic/btl_usnic_recv.c | 2 +- opal/mca/btl/usnic/btl_usnic_recv.h | 2 +- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_endpoint.c b/opal/mca/btl/usnic/btl_usnic_endpoint.c index da80f122b7..190b8657b3 100644 --- a/opal/mca/btl/usnic/btl_usnic_endpoint.c +++ b/opal/mca/btl/usnic/btl_usnic_endpoint.c @@ -116,12 +116,6 @@ static void endpoint_destruct(mca_btl_base_endpoint_t* endpoint) so it should be safe to unconditionally destruct the ack_li */ OBJ_DESTRUCT(&(endpoint->endpoint_ack_li)); - /* Remove this endpoint from module->all_endpoints list, then - destruct the list_item_t */ - opal_mutex_lock(&endpoint->endpoint_module->all_endpoints_lock); - opal_list_remove_item(&endpoint->endpoint_module->all_endpoints, - &endpoint->endpoint_endpoint_li); - opal_mutex_unlock(&endpoint->endpoint_module->all_endpoints_lock); OBJ_DESTRUCT(&(endpoint->endpoint_endpoint_li)); if (endpoint->endpoint_hotel.rooms != NULL) { @@ -179,3 +173,12 @@ opal_btl_usnic_flush_endpoint( /* Now, ACK everything that is pending */ opal_btl_usnic_handle_ack(endpoint, endpoint->endpoint_next_seq_to_send-1); } + +void opal_btl_usnic_release_endpoint(opal_btl_usnic_module_t *module, + opal_btl_usnic_endpoint_t *endpoint) +{ + opal_mutex_lock(&module->all_endpoints_lock); + opal_list_remove_item(&module->all_endpoints, &endpoint->endpoint_endpoint_li); + opal_mutex_unlock(&module->all_endpoints_lock); + OBJ_RELEASE(endpoint); +} diff --git a/opal/mca/btl/usnic/btl_usnic_endpoint.h b/opal/mca/btl/usnic/btl_usnic_endpoint.h index 2d40878ef2..bc292e1248 100644 --- a/opal/mca/btl/usnic/btl_usnic_endpoint.h +++ b/opal/mca/btl/usnic/btl_usnic_endpoint.h @@ -186,5 +186,11 @@ void opal_btl_usnic_flush_endpoint( opal_btl_usnic_endpoint_t *endpoint); +/* Release the given endpoint and remove it from the all_endpoints list. The + * reference that is released was logically held by the all_endpoints list + * (this is not a generic release function). */ +void opal_btl_usnic_release_endpoint(struct opal_btl_usnic_module_t *module, + opal_btl_usnic_endpoint_t *endpoint); + END_C_DECLS #endif diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 268f5c500a..3e3be16858 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -263,7 +263,7 @@ static int add_procs_create_ahs(opal_btl_usnic_module_t *module, EHOSTUNREACH == errno) { add_procs_warn_ah_fail(module, endpoints[i]); - OBJ_RELEASE(endpoints[i]); + opal_btl_usnic_release_endpoint(module, endpoints[i]); endpoints[i] = NULL; --num_ah_left; } @@ -317,7 +317,7 @@ static int add_procs_create_ahs(opal_btl_usnic_module_t *module, if (NULL != endpoints[i]) { if (OPAL_SUCCESS != ret || NULL == endpoints[i]->endpoint_remote_ah) { - OBJ_RELEASE(endpoints[i]); + opal_btl_usnic_release_endpoint(module, endpoints[i]); endpoints[i] = NULL; } else { ++num_created; @@ -402,7 +402,7 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, reachable. */ for (size_t i = 0; i < nprocs; ++i) { if (NULL != endpoints[i]) { - OBJ_RELEASE(endpoints[i]); + opal_btl_usnic_release_endpoint(module, endpoints[i]); endpoints[i] = NULL; } } @@ -451,7 +451,7 @@ static int usnic_del_procs(struct mca_btl_base_module_t *base_module, } /* We're all done with this endpoint */ - OBJ_RELEASE(endpoint); + opal_btl_usnic_release_endpoint(module, endpoint); break; /* done once we found match */ } diff --git a/opal/mca/btl/usnic/btl_usnic_recv.c b/opal/mca/btl/usnic/btl_usnic_recv.c index 2e25a1db31..64d39338d0 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.c +++ b/opal/mca/btl/usnic/btl_usnic_recv.c @@ -362,7 +362,7 @@ void opal_btl_usnic_recv_call(opal_btl_usnic_module_t *module, /* if endpoint exiting, and all ACKs received, release the endpoint */ if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) { - OBJ_RELEASE(endpoint); + opal_btl_usnic_release_endpoint(module, endpoint); } repost_no_endpoint: ++module->stats.num_recv_reposts; diff --git a/opal/mca/btl/usnic/btl_usnic_recv.h b/opal/mca/btl/usnic/btl_usnic_recv.h index 2692ac362a..b3912e6f91 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.h +++ b/opal/mca/btl/usnic/btl_usnic_recv.h @@ -334,7 +334,7 @@ opal_btl_usnic_recv_frag_bookkeeping( repost: /* if endpoint exiting, and all ACKs received, release the endpoint */ if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) { - OBJ_RELEASE(endpoint); + opal_btl_usnic_release_endpoint(module, endpoint); } ++module->stats.num_recv_reposts;