From 1d8c0616875802a500bc2c6226e89ef72070eb66 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Sat, 8 Feb 2014 22:04:19 +0000 Subject: [PATCH] Fix a race condition that could result in assert failures during finalize. Ensure we shutdown the orte progress thread prior to finalizing the rml/oob frameworks so that no async operations are executing during destruct of the base-level lists and objects. cmr=v1.7.5:reviewer=jsquyres:subject=fix race condition in finalize This commit was SVN r30641. --- orte/mca/ess/base/ess_base_std_app.c | 65 +++++++++++++++++++++++----- orte/runtime/orte_finalize.c | 12 ----- orte/runtime/orte_init.c | 27 ++---------- 3 files changed, 59 insertions(+), 45 deletions(-) diff --git a/orte/mca/ess/base/ess_base_std_app.c b/orte/mca/ess/base/ess_base_std_app.c index 97cdfc32bd..a78aaf0b5d 100644 --- a/orte/mca/ess/base/ess_base_std_app.c +++ b/orte/mca/ess/base/ess_base_std_app.c @@ -67,6 +67,9 @@ #include "orte/mca/ess/base/base.h" +static void* orte_progress_thread_engine(opal_object_t *obj); +static bool progress_thread_running = false; + int orte_ess_base_app_setup(bool db_restrict_local) { int ret; @@ -92,6 +95,9 @@ int orte_ess_base_app_setup(bool db_restrict_local) } } + /* get a separate orte event base */ + orte_event_base = opal_event_base_create(); + /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); @@ -194,6 +200,17 @@ int orte_ess_base_app_setup(bool db_restrict_local) goto error; } + /* construct the thread object */ + OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t); + /* fork off a thread to progress it */ + orte_progress_thread.t_run = orte_progress_thread_engine; + progress_thread_running = true; + if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) { + error = "orte progress thread start"; + progress_thread_running = false; + goto error; + } + /* enable communication via the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); @@ -219,18 +236,13 @@ int orte_ess_base_app_setup(bool db_restrict_local) } /* Once the session directory location has been established, set - the opal_output env file location to be in the - proc-specific session directory. */ + the opal_output env file location to be in the + proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); } - /* setup the routed info - the selected routed component - * will know what to do. Some may put us in a blocking - * receive here so they can get ALL of the contact info - * from our peers. Others may just find the local daemon's - * contact info and immediately return. - */ + /* setup the routed info */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; @@ -285,7 +297,7 @@ int orte_ess_base_app_setup(bool db_restrict_local) error = "orte_dfs_base_open"; goto error; } - if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { + if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_select"; goto error; @@ -293,7 +305,13 @@ int orte_ess_base_app_setup(bool db_restrict_local) return ORTE_SUCCESS; -error: + error: + if (!progress_thread_running) { + /* can't send the help message, so ensure it + * comes out locally + */ + orte_show_help_finalize(); + } orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); @@ -319,9 +337,28 @@ int orte_ess_base_app_finalize(void) (void) mca_base_framework_close(&opal_db_base_framework); (void) mca_base_framework_close(&orte_dfs_base_framework); (void) mca_base_framework_close(&orte_routed_base_framework); + + if (progress_thread_running) { + /* we had to leave the progress thread running until + * we closed the routed framework as that closure + * sends a "sync" message to the local daemon. it + * is now safe to stop the progress thread + */ + orte_event_base_active = false; + /* break the event loop */ + opal_event_base_loopbreak(orte_event_base); + /* wait for thread to exit */ + opal_thread_join(&orte_progress_thread, NULL); + OBJ_DESTRUCT(&orte_progress_thread); + progress_thread_running = false; + } + (void) mca_base_framework_close(&orte_rml_base_framework); (void) mca_base_framework_close(&orte_oob_base_framework); + /* release the event base */ + opal_event_base_free(orte_event_base); + orte_session_dir_finalize(ORTE_PROC_MY_NAME); return ORTE_SUCCESS; @@ -413,3 +450,11 @@ void orte_ess_base_app_abort(int status, bool report) /* Now Exit */ exit(status); } + +static void* orte_progress_thread_engine(opal_object_t *obj) +{ + while (orte_event_base_active) { + opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); + } + return OPAL_THREAD_CANCELLED; +} diff --git a/orte/runtime/orte_finalize.c b/orte/runtime/orte_finalize.c index 093957ae65..7b0c659c99 100644 --- a/orte/runtime/orte_finalize.c +++ b/orte/runtime/orte_finalize.c @@ -64,18 +64,6 @@ int orte_finalize(void) /* close the ess itself */ (void) mca_base_framework_close(&orte_ess_base_framework); - if (ORTE_PROC_IS_APP) { - /* stop the progress thread */ - orte_event_base_active = false; - /* break the event loop */ - opal_event_base_loopbreak(orte_event_base); - /* wait for thread to exit */ - opal_thread_join(&orte_progress_thread, NULL); - OBJ_DESTRUCT(&orte_progress_thread); - /* release the event base */ - opal_event_base_free(orte_event_base); - } - /* cleanup the process info */ orte_proc_info_finalize(); diff --git a/orte/runtime/orte_init.c b/orte/runtime/orte_init.c index 85f91be94a..bc8bf0ba0c 100644 --- a/orte/runtime/orte_init.c +++ b/orte/runtime/orte_init.c @@ -68,7 +68,6 @@ orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCAR orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID}; -static void* orte_progress_thread_engine(opal_object_t *obj); #if OPAL_CC_USE_PRAGMA_IDENT #pragma ident ORTE_IDENT_STRING @@ -136,20 +135,11 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) goto error; } - if (ORTE_PROC_IS_APP) { - /* get a separate orte event base */ - orte_event_base = opal_event_base_create(); - /* construct the thread object */ - OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t); - /* fork off a thread to progress it */ - orte_progress_thread.t_run = orte_progress_thread_engine; - if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) { - error = "orte progress thread start"; - goto error; - } - } else { + if (!ORTE_PROC_IS_APP) { /* ORTE tools "block" in their own loop over the event - * base, so no progress thread is required + * base, so no progress thread is required - apps will + * start their progress thread in ess_base_std_app.c + * at the appropriate point */ orte_event_base = opal_event_base; } @@ -172,12 +162,3 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) return ret; } - - -static void* orte_progress_thread_engine(opal_object_t *obj) -{ - while (orte_event_base_active) { - opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE); - } - return OPAL_THREAD_CANCELLED; -}