From f37a77dd0848e1118d452e3399a9835a7b016cea Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 7 Apr 2006 18:13:35 +0000 Subject: [PATCH] * Fix potential deadlock when mpi threads are enabled and progress threads are not. See lengthy comment in the body of commit. This commit was SVN r9573. --- orte/mca/oob/tcp/oob_tcp_msg.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/orte/mca/oob/tcp/oob_tcp_msg.c b/orte/mca/oob/tcp/oob_tcp_msg.c index d36a3017f3..bfacc19697 100644 --- a/orte/mca/oob/tcp/oob_tcp_msg.c +++ b/orte/mca/oob/tcp/oob_tcp_msg.c @@ -86,8 +86,26 @@ int mca_oob_tcp_msg_wait(mca_oob_tcp_msg_t* msg, int* rc) #else /* wait for message to complete */ - while(msg->msg_complete == false) + while(msg->msg_complete == false) { + /* msg_wait() is used in the "barrier" at the completion of + MPI_FINALIZE, during which time BTLs may still need to + progress pending outgoing communication, so we need to + call opal_progress() here to make sure that communication + gets pushed out so others can enter finalize (and send us + the message we're here waiting for). However, if we're + in a callback from the event library that was triggered + from a call to opal_progress(), opal_progress() will + think another thread is already progressing the event + engine (in the case of mpi threads enabled) and not + progress the engine, meaning we'll never receive our + message. So we also need to progress the event library + explicitly. We use EVLOOP_NONBLOCK so that we can + progress both the registered callbacks and the event + library, as EVLOOP_ONCE may block for a short period of + time. 
*/ opal_progress(); + opal_event_loop(OPAL_EVLOOP_NONBLOCK); + } #endif /* return status */ @@ -133,7 +151,9 @@ int mca_oob_tcp_msg_timedwait(mca_oob_tcp_msg_t* msg, int* rc, struct timespec* while(msg->msg_complete == false && ((uint32_t)tv.tv_sec <= secs || ((uint32_t)tv.tv_sec == secs && (uint32_t)tv.tv_usec < usecs))) { + /* see comment in tcp_msg_wait, above */ opal_progress(); + opal_event_loop(OPAL_EVLOOP_NONBLOCK); gettimeofday(&tv,NULL); } #endif