From e10afcd354b8ef69bb3b6bfae73ef3ab84ee05c0 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm <hjelmn@lanl.gov> Date: Wed, 21 Oct 2015 12:54:14 -0600 Subject: [PATCH] udcm: fix bugs This commit fixes the following bugs: - On send failure release newly allocated message. - In the destructor for udcm_message_sent_t always remove the send timeout event from the event base. Failure to do this can lead to memory corruption since the destructor may be called from an event callback. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov> --- .../openib/connect/btl_openib_connect_udcm.c | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c index f2cde4b3a2..c29df267d7 100644 --- a/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c +++ b/opal/mca/btl/openib/connect/btl_openib_connect_udcm.c @@ -316,7 +316,7 @@ static void *udcm_cq_event_dispatch(int fd, int flags, void *context); static void *udcm_message_callback (int fd, int flags, void *context); static void udcm_set_message_timeout (udcm_message_sent_t *message); -static void udcm_cancel_message_timeout (udcm_message_sent_t *message); +static void udcm_free_message (udcm_message_sent_t *message); static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl); @@ -1660,7 +1660,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep, if (0 != (rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0))) { BTL_VERBOSE(("error posting REQ")); - udcm_cancel_message_timeout (msg); + udcm_free_message (msg); return rc; } @@ -1683,7 +1683,7 @@ static int udcm_send_complete (mca_btl_base_endpoint_t *lcl_ep, if (0 != rc) { BTL_VERBOSE(("error posting complete")); - udcm_cancel_message_timeout (msg); + udcm_free_message (msg); return rc; } @@ -1709,7 +1709,7 @@ static int udcm_send_reject (mca_btl_base_endpoint_t *lcl_ep, if (0 != rc) { BTL_VERBOSE(("error posting rejection")); - udcm_cancel_message_timeout (msg); + udcm_free_message (msg); return rc; } @@ -2185,10 +2185,8 @@ static void udcm_sent_message_destructor (udcm_message_sent_t *message) free (message->data); } - if (message->event_active) { - opal_event_evtimer_del (&message->event); - message->event_active = false; - } + opal_event_evtimer_del (&message->event); + message->event_active = false; } /* mark: message timeout code */ @@ -2266,21 +2264,22 @@ static void udcm_set_message_timeout (udcm_message_sent_t *message) opal_mutex_unlock (&m->cm_timeout_lock); } -static void udcm_cancel_message_timeout (udcm_message_sent_t *message) +static void udcm_free_message (udcm_message_sent_t *message) { udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint); - BTL_VERBOSE(("cancelling timeout for message %p", (void *) message)); + BTL_VERBOSE(("releasing message %p", (void *) message)); opal_mutex_lock (&m->cm_timeout_lock); - opal_list_remove_item (&m->flying_messages, &message->super); - - /* start the event */ - opal_event_evtimer_del (&message->event); - message->event_active = false; + if (message->event_active) { + opal_list_remove_item (&m->flying_messages, &message->super); + message->event_active = false; + } opal_mutex_unlock (&m->cm_timeout_lock); + + OBJ_RELEASE(message); } /* mark: xrc connection support */ @@ -2798,7 +2797,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_ if (0 != (rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0))) { BTL_VERBOSE(("error posting XREQ")); - udcm_cancel_message_timeout (msg); + udcm_free_message (msg); return rc; } @@ -2851,7 +2850,7 @@ static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_bas if (0 != rc) { BTL_VERBOSE(("error posting complete")); - udcm_cancel_message_timeout (msg); + udcm_free_message (msg); return rc; }