1
1

udcm: fix race between ack arrival and message send, and a potential hang in udcm finalize.

Closes trac:4290

cmr=v1.7.5:reviewer=miked

This commit was SVN r30854.

The following Trac tickets were found above:
  Ticket 4290 --> https://svn.open-mpi.org/trac/ompi/ticket/4290
Этот коммит содержится в:
Nathan Hjelm 2014-02-26 15:33:27 +00:00
родитель 30b61a3333
Коммит dfe4a504e4

Просмотреть файл

@ -3,8 +3,8 @@
* Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2009 Mellanox Technologies. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2011-2013 Los Alamos National Security, LLC. All
* rights reserved.
* Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
*
@ -299,6 +299,7 @@ static void *udcm_cq_event_dispatch(int fd, int flags, void *context);
static void *udcm_message_callback (void *context);
static void udcm_set_message_timeout (udcm_message_sent_t *message);
static void udcm_cancel_message_timeout (udcm_message_sent_t *message);
static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl);
@ -656,14 +657,15 @@ udcm_module_start_connect(ompi_btl_openib_connect_base_module_t *cpc,
#endif
opal_mutex_lock (&udep->udep_lock);
if (MCA_BTL_IB_CLOSED != lcl_ep->endpoint_state) {
opal_mutex_unlock (&udep->udep_lock);
BTL_VERBOSE(("already ongoing %p. state = %d",
(void *) lcl_ep, lcl_ep->endpoint_state));
return OMPI_SUCCESS;
}
opal_mutex_lock (&udep->udep_lock);
do {
opal_atomic_wmb ();
@ -1525,11 +1527,13 @@ static int udcm_new_message (mca_btl_base_endpoint_t *lcl_ep,
message->endpoint = lcl_ep;
udcm_set_message_timeout (message);
opal_atomic_wmb ();
*msgp = message;
BTL_VERBOSE(("created message with type %d", type));
BTL_VERBOSE(("created message %p with type %d", (void *) message, type));
return OMPI_SUCCESS;
}
@ -1567,11 +1571,12 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
if (0 != (rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0))) {
BTL_VERBOSE(("error posting REQ"));
udcm_cancel_message_timeout (msg);
return rc;
}
udcm_set_message_timeout (msg);
return 0;
}
@ -1589,11 +1594,12 @@ static int udcm_send_complete (mca_btl_base_endpoint_t *lcl_ep,
rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0);
if (0 != rc) {
BTL_VERBOSE(("error posting complete"));
udcm_cancel_message_timeout (msg);
return rc;
}
udcm_set_message_timeout (msg);
return 0;
}
@ -1614,11 +1620,12 @@ static int udcm_send_reject (mca_btl_base_endpoint_t *lcl_ep,
rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0);
if (0 != rc) {
BTL_VERBOSE(("error posting rejection"));
udcm_cancel_message_timeout (msg);
return rc;
}
udcm_set_message_timeout (msg);
return 0;
}
@ -1638,6 +1645,7 @@ static int udcm_handle_ack (udcm_module_t *m, const uintptr_t ctx, const uint16_
const uint32_t rem_qp)
{
udcm_message_sent_t *msg, *next;
bool found = false;
opal_mutex_lock (&m->cm_timeout_lock);
@ -1651,6 +1659,7 @@ static int udcm_handle_ack (udcm_module_t *m, const uintptr_t ctx, const uint16_
}
BTL_VERBOSE(("found matching message"));
found = true;
/* found it */
opal_list_remove_item (&m->flying_messages, &msg->super);
@ -1659,6 +1668,10 @@ static int udcm_handle_ack (udcm_module_t *m, const uintptr_t ctx, const uint16_
break;
}
if (!found) {
BTL_VERBOSE(("message %p not found in the list of flying messages", (void *) ctx));
}
opal_mutex_unlock (&m->cm_timeout_lock);
return OMPI_SUCCESS;
@ -1809,7 +1822,7 @@ static int udcm_process_messages (struct ibv_cq *event_cq, udcm_module_t *m)
msg_num = (int)(wc[i].wr_id & (~UDCM_WR_DIR_MASK));
if (IBV_WC_SUCCESS != wc[i].status) {
BTL_VERBOSE(("recv work request for buffer %d failed, code = %d",
BTL_ERROR(("recv work request for buffer %d failed, code = %d",
msg_num, wc[i].status));
count = -1;
break;
@ -2122,14 +2135,14 @@ static void udcm_send_timeout (evutil_socket_t fd, short event, void *arg)
msg->tries++;
udcm_set_message_timeout (msg);
if (0 != udcm_post_send (lcl_ep, msg->data, msg->length, 0)) {
BTL_VERBOSE(("error reposting message"));
ompi_btl_openib_fd_run_in_main(mca_btl_openib_endpoint_invoke_error,
lcl_ep);
break;
}
udcm_set_message_timeout (msg);
} while (0);
}
@ -2137,6 +2150,8 @@ static void udcm_set_message_timeout (udcm_message_sent_t *message)
{
udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint);
BTL_VERBOSE(("activating timeout for message %p", (void *) message));
opal_mutex_lock (&m->cm_timeout_lock);
opal_list_append (&m->flying_messages, &message->super);
@ -2148,6 +2163,23 @@ static void udcm_set_message_timeout (udcm_message_sent_t *message)
opal_mutex_unlock (&m->cm_timeout_lock);
}
/*
 * Cancel the pending timeout for a sent message.
 *
 * Under the module's cm_timeout_lock this removes the message from the
 * module's flying_messages list, deletes its timer event, and clears
 * message->event_active.
 *
 * message: the sent-message record; its endpoint is used to find the
 *          owning udcm module.
 *
 * NOTE(review): presumably the message must currently be on
 * flying_messages (i.e. its timeout was previously set) -- confirm
 * against callers.
 */
static void udcm_cancel_message_timeout (udcm_message_sent_t *message)
{
udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint);
BTL_VERBOSE(("cancelling timeout for message %p", (void *) message));
/* the list update and the timer removal are done together under the timeout lock */
opal_mutex_lock (&m->cm_timeout_lock);
opal_list_remove_item (&m->flying_messages, &message->super);
/* delete the timer event and mark it inactive */
opal_event_evtimer_del (&message->event);
message->event_active = false;
opal_mutex_unlock (&m->cm_timeout_lock);
}
/* mark: xrc connection support */
/* XRC support functions */
@ -2560,11 +2592,12 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_
if (0 != (rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0))) {
BTL_VERBOSE(("error posting XREQ"));
udcm_cancel_message_timeout (msg);
return rc;
}
udcm_set_message_timeout (msg);
return 0;
}
@ -2597,11 +2630,12 @@ static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_bas
rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0);
if (0 != rc) {
BTL_VERBOSE(("error posting complete"));
udcm_cancel_message_timeout (msg);
return rc;
}
udcm_set_message_timeout (msg);
return 0;
}