1
1

Fix a race condition that could result in assert failures during finalize. Ensure we shutdown the orte progress thread prior to finalizing the rml/oob frameworks so that no async operations are executing during destruct of the base-level lists and objects.

cmr=v1.7.5:reviewer=jsquyres:subject=fix race condition in finalize

This commit was SVN r30641.
This commit is contained in:
Ralph Castain 2014-02-08 22:04:19 +00:00
parent 5b8e1180cf
commit 1d8c061687
3 changed files with 59 additions and 45 deletions

View File

@ -67,6 +67,9 @@
#include "orte/mca/ess/base/base.h"
static void* orte_progress_thread_engine(opal_object_t *obj);
static bool progress_thread_running = false;
int orte_ess_base_app_setup(bool db_restrict_local)
{
int ret;
@ -92,6 +95,9 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
}
/* get a separate orte event base */
orte_event_base = opal_event_base_create();
/* open and setup the state machine */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
@ -194,6 +200,17 @@ int orte_ess_base_app_setup(bool db_restrict_local)
goto error;
}
/* construct the thread object */
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
/* fork off a thread to progress it */
orte_progress_thread.t_run = orte_progress_thread_engine;
progress_thread_running = true;
if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
error = "orte progress thread start";
progress_thread_running = false;
goto error;
}
/* enable communication via the rml */
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
ORTE_ERROR_LOG(ret);
@ -219,18 +236,13 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
/* Once the session directory location has been established, set
the opal_output env file location to be in the
proc-specific session directory. */
the opal_output env file location to be in the
proc-specific session directory. */
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
"output-", NULL, NULL);
}
/* setup the routed info - the selected routed component
* will know what to do. Some may put us in a blocking
* receive here so they can get ALL of the contact info
* from our peers. Others may just find the local daemon's
* contact info and immediately return.
*/
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
@ -285,7 +297,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
error = "orte_dfs_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_dfs_base_select";
goto error;
@ -293,7 +305,13 @@ int orte_ess_base_app_setup(bool db_restrict_local)
return ORTE_SUCCESS;
error:
error:
if (!progress_thread_running) {
/* can't send the help message, so ensure it
* comes out locally
*/
orte_show_help_finalize();
}
orte_show_help("help-orte-runtime.txt",
"orte_init:startup:internal-failure",
true, error, ORTE_ERROR_NAME(ret), ret);
@ -319,9 +337,28 @@ int orte_ess_base_app_finalize(void)
(void) mca_base_framework_close(&opal_db_base_framework);
(void) mca_base_framework_close(&orte_dfs_base_framework);
(void) mca_base_framework_close(&orte_routed_base_framework);
if (progress_thread_running) {
/* we had to leave the progress thread running until
* we closed the routed framework as that closure
* sends a "sync" message to the local daemon. it
* is now safe to stop the progress thread
*/
orte_event_base_active = false;
/* break the event loop */
opal_event_base_loopbreak(orte_event_base);
/* wait for thread to exit */
opal_thread_join(&orte_progress_thread, NULL);
OBJ_DESTRUCT(&orte_progress_thread);
progress_thread_running = false;
}
(void) mca_base_framework_close(&orte_rml_base_framework);
(void) mca_base_framework_close(&orte_oob_base_framework);
/* release the event base */
opal_event_base_free(orte_event_base);
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
return ORTE_SUCCESS;
@ -413,3 +450,11 @@ void orte_ess_base_app_abort(int status, bool report)
/* Now Exit */
exit(status);
}
static void* orte_progress_thread_engine(opal_object_t *obj)
{
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
return OPAL_THREAD_CANCELLED;
}

View File

@ -64,18 +64,6 @@ int orte_finalize(void)
/* close the ess itself */
(void) mca_base_framework_close(&orte_ess_base_framework);
if (ORTE_PROC_IS_APP) {
/* stop the progress thread */
orte_event_base_active = false;
/* break the event loop */
opal_event_base_loopbreak(orte_event_base);
/* wait for thread to exit */
opal_thread_join(&orte_progress_thread, NULL);
OBJ_DESTRUCT(&orte_progress_thread);
/* release the event base */
opal_event_base_free(orte_event_base);
}
/* cleanup the process info */
orte_proc_info_finalize();

View File

@ -68,7 +68,6 @@ orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCAR
orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
static void* orte_progress_thread_engine(opal_object_t *obj);
#if OPAL_CC_USE_PRAGMA_IDENT
#pragma ident ORTE_IDENT_STRING
@ -136,20 +135,11 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
goto error;
}
if (ORTE_PROC_IS_APP) {
/* get a separate orte event base */
orte_event_base = opal_event_base_create();
/* construct the thread object */
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
/* fork off a thread to progress it */
orte_progress_thread.t_run = orte_progress_thread_engine;
if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
error = "orte progress thread start";
goto error;
}
} else {
if (!ORTE_PROC_IS_APP) {
/* ORTE tools "block" in their own loop over the event
* base, so no progress thread is required
* base, so no progress thread is required - apps will
* start their progress thread in ess_base_std_app.c
* at the appropriate point
*/
orte_event_base = opal_event_base;
}
@ -172,12 +162,3 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
return ret;
}
static void* orte_progress_thread_engine(opal_object_t *obj)
{
while (orte_event_base_active) {
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
}
return OPAL_THREAD_CANCELLED;
}