Fix a race condition that could result in assert failures during finalize. Ensure we shut down the orte progress thread prior to finalizing the rml/oob frameworks so that no async operations are executing during destruction of the base-level lists and objects.
cmr=v1.7.5:reviewer=jsquyres:subject=fix race condition in finalize
This commit was SVN r30641.
Этот коммит содержится в:
родитель
5b8e1180cf
Коммит
1d8c061687
@ -67,6 +67,9 @@
|
|||||||
|
|
||||||
#include "orte/mca/ess/base/base.h"
|
#include "orte/mca/ess/base/base.h"
|
||||||
|
|
||||||
|
static void* orte_progress_thread_engine(opal_object_t *obj);
|
||||||
|
static bool progress_thread_running = false;
|
||||||
|
|
||||||
int orte_ess_base_app_setup(bool db_restrict_local)
|
int orte_ess_base_app_setup(bool db_restrict_local)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret;
|
||||||
@ -92,6 +95,9 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* get a separate orte event base */
|
||||||
|
orte_event_base = opal_event_base_create();
|
||||||
|
|
||||||
/* open and setup the state machine */
|
/* open and setup the state machine */
|
||||||
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -194,6 +200,17 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* construct the thread object */
|
||||||
|
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
|
||||||
|
/* fork off a thread to progress it */
|
||||||
|
orte_progress_thread.t_run = orte_progress_thread_engine;
|
||||||
|
progress_thread_running = true;
|
||||||
|
if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
|
||||||
|
error = "orte progress thread start";
|
||||||
|
progress_thread_running = false;
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
/* enable communication via the rml */
|
/* enable communication via the rml */
|
||||||
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
@ -225,12 +242,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
"output-", NULL, NULL);
|
"output-", NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* setup the routed info - the selected routed component
|
/* setup the routed info */
|
||||||
* will know what to do. Some may put us in a blocking
|
|
||||||
* receive here so they can get ALL of the contact info
|
|
||||||
* from our peers. Others may just find the local daemon's
|
|
||||||
* contact info and immediately return.
|
|
||||||
*/
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
|
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
|
||||||
ORTE_ERROR_LOG(ret);
|
ORTE_ERROR_LOG(ret);
|
||||||
error = "orte_routed.init_routes";
|
error = "orte_routed.init_routes";
|
||||||
@ -293,7 +305,13 @@ int orte_ess_base_app_setup(bool db_restrict_local)
|
|||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
|
if (!progress_thread_running) {
|
||||||
|
/* can't send the help message, so ensure it
|
||||||
|
* comes out locally
|
||||||
|
*/
|
||||||
|
orte_show_help_finalize();
|
||||||
|
}
|
||||||
orte_show_help("help-orte-runtime.txt",
|
orte_show_help("help-orte-runtime.txt",
|
||||||
"orte_init:startup:internal-failure",
|
"orte_init:startup:internal-failure",
|
||||||
true, error, ORTE_ERROR_NAME(ret), ret);
|
true, error, ORTE_ERROR_NAME(ret), ret);
|
||||||
@ -319,9 +337,28 @@ int orte_ess_base_app_finalize(void)
|
|||||||
(void) mca_base_framework_close(&opal_db_base_framework);
|
(void) mca_base_framework_close(&opal_db_base_framework);
|
||||||
(void) mca_base_framework_close(&orte_dfs_base_framework);
|
(void) mca_base_framework_close(&orte_dfs_base_framework);
|
||||||
(void) mca_base_framework_close(&orte_routed_base_framework);
|
(void) mca_base_framework_close(&orte_routed_base_framework);
|
||||||
|
|
||||||
|
if (progress_thread_running) {
|
||||||
|
/* we had to leave the progress thread running until
|
||||||
|
* we closed the routed framework as that closure
|
||||||
|
* sends a "sync" message to the local daemon. it
|
||||||
|
* is now safe to stop the progress thread
|
||||||
|
*/
|
||||||
|
orte_event_base_active = false;
|
||||||
|
/* break the event loop */
|
||||||
|
opal_event_base_loopbreak(orte_event_base);
|
||||||
|
/* wait for thread to exit */
|
||||||
|
opal_thread_join(&orte_progress_thread, NULL);
|
||||||
|
OBJ_DESTRUCT(&orte_progress_thread);
|
||||||
|
progress_thread_running = false;
|
||||||
|
}
|
||||||
|
|
||||||
(void) mca_base_framework_close(&orte_rml_base_framework);
|
(void) mca_base_framework_close(&orte_rml_base_framework);
|
||||||
(void) mca_base_framework_close(&orte_oob_base_framework);
|
(void) mca_base_framework_close(&orte_oob_base_framework);
|
||||||
|
|
||||||
|
/* release the event base */
|
||||||
|
opal_event_base_free(orte_event_base);
|
||||||
|
|
||||||
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
|
||||||
|
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
@ -413,3 +450,11 @@ void orte_ess_base_app_abort(int status, bool report)
|
|||||||
/* Now Exit */
|
/* Now Exit */
|
||||||
exit(status);
|
exit(status);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void* orte_progress_thread_engine(opal_object_t *obj)
|
||||||
|
{
|
||||||
|
while (orte_event_base_active) {
|
||||||
|
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
||||||
|
}
|
||||||
|
return OPAL_THREAD_CANCELLED;
|
||||||
|
}
|
||||||
|
@ -64,18 +64,6 @@ int orte_finalize(void)
|
|||||||
/* close the ess itself */
|
/* close the ess itself */
|
||||||
(void) mca_base_framework_close(&orte_ess_base_framework);
|
(void) mca_base_framework_close(&orte_ess_base_framework);
|
||||||
|
|
||||||
if (ORTE_PROC_IS_APP) {
|
|
||||||
/* stop the progress thread */
|
|
||||||
orte_event_base_active = false;
|
|
||||||
/* break the event loop */
|
|
||||||
opal_event_base_loopbreak(orte_event_base);
|
|
||||||
/* wait for thread to exit */
|
|
||||||
opal_thread_join(&orte_progress_thread, NULL);
|
|
||||||
OBJ_DESTRUCT(&orte_progress_thread);
|
|
||||||
/* release the event base */
|
|
||||||
opal_event_base_free(orte_event_base);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* cleanup the process info */
|
/* cleanup the process info */
|
||||||
orte_proc_info_finalize();
|
orte_proc_info_finalize();
|
||||||
|
|
||||||
|
@ -68,7 +68,6 @@ orte_process_name_t orte_name_wildcard = {ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCAR
|
|||||||
|
|
||||||
orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
orte_process_name_t orte_name_invalid = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
|
||||||
|
|
||||||
static void* orte_progress_thread_engine(opal_object_t *obj);
|
|
||||||
|
|
||||||
#if OPAL_CC_USE_PRAGMA_IDENT
|
#if OPAL_CC_USE_PRAGMA_IDENT
|
||||||
#pragma ident ORTE_IDENT_STRING
|
#pragma ident ORTE_IDENT_STRING
|
||||||
@ -136,20 +135,11 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
|
|||||||
goto error;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ORTE_PROC_IS_APP) {
|
if (!ORTE_PROC_IS_APP) {
|
||||||
/* get a separate orte event base */
|
|
||||||
orte_event_base = opal_event_base_create();
|
|
||||||
/* construct the thread object */
|
|
||||||
OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
|
|
||||||
/* fork off a thread to progress it */
|
|
||||||
orte_progress_thread.t_run = orte_progress_thread_engine;
|
|
||||||
if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
|
|
||||||
error = "orte progress thread start";
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* ORTE tools "block" in their own loop over the event
|
/* ORTE tools "block" in their own loop over the event
|
||||||
* base, so no progress thread is required
|
* base, so no progress thread is required - apps will
|
||||||
|
* start their progress thread in ess_base_std_app.c
|
||||||
|
* at the appropriate point
|
||||||
*/
|
*/
|
||||||
orte_event_base = opal_event_base;
|
orte_event_base = opal_event_base;
|
||||||
}
|
}
|
||||||
@ -172,12 +162,3 @@ int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
|
|||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void* orte_progress_thread_engine(opal_object_t *obj)
|
|
||||||
{
|
|
||||||
while (orte_event_base_active) {
|
|
||||||
opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
|
|
||||||
}
|
|
||||||
return OPAL_THREAD_CANCELLED;
|
|
||||||
}
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user