From 48cca8ab83e81a907be0e54b0d89fa0f9a61dcc1 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Wed, 30 Sep 2020 14:55:43 +0000 Subject: [PATCH] oob/tcp: fix a race condition on stop_thread pipe This patch fix a race condition, which caused the main thread to sometimes write to a closed pipe. The following are details: Currently, during shut down, the main thread will do the following: 1. set listen_thread_action to false. 2. write to stop_thread pipe to tell the listener thread to exit. The listener thread do the following: 1. call select() to listen to a set of file descriptors with a maximum wait time. 2. check listen_thread_action. If it is false, close the stop_thread pipe. The main thread will write to closed pipe, when 1. listener's call to select() finished because maximum wait time reached. 2. main thread set listen_thread_action to false 3. listener thread check listen_thread_action and closed the pipe 4. main thread write to the closed pipe. This patch address the issue by having the main thread close the pipe after the listener thread has been joined. This way, main thread will both write and close the thread, so there is no conflict. Note This patch was opened directly against v4.1.x branch because the orte/mca/oob/tcp directory has been removed from master branch. Signed-off-by: Wei Zhang --- orte/mca/oob/tcp/oob_tcp_component.c | 3 +++ orte/mca/oob/tcp/oob_tcp_listener.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/orte/mca/oob/tcp/oob_tcp_component.c b/orte/mca/oob/tcp/oob_tcp_component.c index 4398b9bd48..e6982bafc0 100644 --- a/orte/mca/oob/tcp/oob_tcp_component.c +++ b/orte/mca/oob/tcp/oob_tcp_component.c @@ -695,6 +695,9 @@ static void component_shutdown(void) /* tell the thread to exit */ write(mca_oob_tcp_component.stop_thread[1], &i, sizeof(int)); opal_thread_join(&mca_oob_tcp_component.listen_thread, NULL); + + close(mca_oob_tcp_component.stop_thread[0]); + close(mca_oob_tcp_component.stop_thread[1]); } else { opal_output_verbose(2, orte_oob_base_framework.framework_output, "no hnp or not active"); diff --git a/orte/mca/oob/tcp/oob_tcp_listener.c b/orte/mca/oob/tcp/oob_tcp_listener.c index 41c4aeb51c..0c7855e600 100644 --- a/orte/mca/oob/tcp/oob_tcp_listener.c +++ b/orte/mca/oob/tcp/oob_tcp_listener.c @@ -705,8 +705,6 @@ static void* listen_thread(opal_object_t *obj) rc = select(max + 1, &readfds, NULL, NULL, &timeout); if (!mca_oob_tcp_component.listen_thread_active) { /* we've been asked to terminate */ - close(mca_oob_tcp_component.stop_thread[0]); - close(mca_oob_tcp_component.stop_thread[1]); return NULL; } if (rc < 0) {