Some minor updates to the recent locking-system changes. Remove obsolete locks. Ensure the trigger event objects are not destructed until the very end, to avoid possible problems due to race conditions. Route all orted abnormal-termination tests through the trigger.
This commit was SVN r19172.
Этот коммит содержится в:
родитель
bb90ad793a
Коммит
63c33a9c32
orte
@ -96,7 +96,6 @@ static opal_event_t *orted_exit_event;
|
||||
static void shutdown_callback(int fd, short flags, void *arg);
|
||||
static void shutdown_signal(int fd, short flags, void *arg);
|
||||
static void signal_callback(int fd, short event, void *arg);
|
||||
static void clean_fail(int fd, short flags, void *arg);
|
||||
|
||||
static struct {
|
||||
bool debug;
|
||||
@ -246,6 +245,9 @@ int orte_daemon(int argc, char *argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the exit triggers */
|
||||
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
|
||||
|
||||
/* save the environment for launch purposes. This MUST be
|
||||
* done so that we can pass it to any local procs we
|
||||
* spawn - otherwise, those local procs won't see any
|
||||
@ -334,7 +336,7 @@ int orte_daemon(int argc, char *argv[])
|
||||
* and have it kill us
|
||||
*/
|
||||
if (0 < orted_globals.fail_delay) {
|
||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, clean_fail);
|
||||
ORTE_TIMER_EVENT(orted_globals.fail_delay, shutdown_signal);
|
||||
|
||||
} else {
|
||||
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
@ -627,46 +629,14 @@ int orte_daemon(int argc, char *argv[])
|
||||
/* cleanup any lingering session directories */
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
ret = orte_finalize();
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void clean_fail(int fd, short flags, void *arg)
|
||||
{
|
||||
/* protect against multiple calls to exit */
|
||||
if (!opal_atomic_trylock(&orted_exit_lock)) { /* returns 1 if already locked */
|
||||
return;
|
||||
}
|
||||
|
||||
opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
orted_globals.abort ? "abort" : "abnormal termination");
|
||||
|
||||
/* cleanup */
|
||||
if (NULL != log_path) {
|
||||
unlink(log_path);
|
||||
}
|
||||
|
||||
/* make sure our local procs are dead - but don't update their state
|
||||
* on the HNP as this may be redundant
|
||||
*/
|
||||
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
|
||||
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
* the local session_dir tree and exit
|
||||
*/
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
|
||||
/* if we were ordered to abort, do so */
|
||||
if (orted_globals.abort) {
|
||||
abort();
|
||||
}
|
||||
|
||||
/* otherwise, exit with a non-zero status */
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
static void shutdown_signal(int fd, short flags, void *arg)
|
||||
{
|
||||
/* trigger the call to shutdown callback to protect
|
||||
@ -699,7 +669,29 @@ static void shutdown_callback(int fd, short flags, void *arg)
|
||||
*/
|
||||
orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false);
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
|
||||
/* if we were ordered to abort, do so */
|
||||
if (orted_globals.abort) {
|
||||
opal_output(0, "%s is executing clean abort", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
* the local session_dir tree and abort
|
||||
*/
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
abort();
|
||||
} else if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
|
||||
opal_output(0, "%s is executing clean abnormal termination", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
/* do -not- call finalize as this will send a message to the HNP
|
||||
* indicating clean termination! Instead, just forcibly cleanup
|
||||
* the local session_dir tree and exit
|
||||
*/
|
||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
|
||||
}
|
||||
|
||||
/* Finalize and clean up ourselves */
|
||||
ret = orte_finalize();
|
||||
exit(ret);
|
||||
}
|
||||
|
@ -25,12 +25,7 @@
|
||||
/* for everyone */
|
||||
opal_atomic_lock_t orte_finalize_lock;
|
||||
|
||||
/* for orteds */
|
||||
opal_atomic_lock_t orted_exit_lock;
|
||||
|
||||
/* for HNPs */
|
||||
opal_atomic_lock_t orte_job_complete_lock;
|
||||
opal_atomic_lock_t orte_terminate_lock;
|
||||
opal_atomic_lock_t orte_abort_inprogress_lock;
|
||||
|
||||
|
||||
@ -39,12 +34,7 @@ int orte_locks_init(void)
|
||||
/* for everyone */
|
||||
opal_atomic_init(&orte_finalize_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
|
||||
/* for orteds */
|
||||
opal_atomic_init(&orted_exit_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
|
||||
/* for HNPs */
|
||||
opal_atomic_init(&orte_job_complete_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_init(&orte_terminate_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
opal_atomic_init(&orte_abort_inprogress_lock, OPAL_ATOMIC_UNLOCKED);
|
||||
|
||||
return ORTE_SUCCESS;
|
||||
|
@ -34,12 +34,7 @@ BEGIN_C_DECLS
|
||||
/* for everyone */
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_finalize_lock;
|
||||
|
||||
/* for orteds */
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orted_exit_lock;
|
||||
|
||||
/* for HNPs */
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_job_complete_lock;
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_terminate_lock;
|
||||
ORTE_DECLSPEC extern opal_atomic_lock_t orte_abort_inprogress_lock;
|
||||
|
||||
|
||||
|
@ -493,9 +493,6 @@ int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
/* create the event */
|
||||
*event = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
|
||||
/* setup the trigger and its associated lock */
|
||||
OBJ_CONSTRUCT(trig, orte_trigger_event_t);
|
||||
|
||||
/* pass back the write end of the pipe */
|
||||
trig->channel = p[1];
|
||||
|
||||
@ -1086,9 +1083,6 @@ int orte_wait_event(opal_event_t **event, orte_trigger_event_t *trig,
|
||||
/* create the event */
|
||||
*event = (opal_event_t*)malloc(sizeof(opal_event_t));
|
||||
|
||||
/* setup the trigger and its associated lock */
|
||||
OBJ_CONSTRUCT(trig, orte_trigger_event_t);
|
||||
|
||||
/* pass back the write end of the pipe */
|
||||
trig->channel = p[1];
|
||||
|
||||
|
@ -364,6 +364,10 @@ int orterun(int argc, char *argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* setup the exit triggers */
|
||||
OBJ_CONSTRUCT(&orte_exit, orte_trigger_event_t);
|
||||
OBJ_CONSTRUCT(&orteds_exit, orte_trigger_event_t);
|
||||
|
||||
/* flag that I am the HNP */
|
||||
orte_process_info.hnp = true;
|
||||
|
||||
@ -631,9 +635,6 @@ static void job_completed(int trigpipe, short event, void *arg)
|
||||
free(abort_exit_event);
|
||||
}
|
||||
|
||||
/* cleanup the trigger */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
|
||||
exit_state = jdata->state;
|
||||
|
||||
if (ORTE_JOB_STATE_TERMINATED != exit_state) {
|
||||
@ -699,6 +700,10 @@ DONE:
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
OBJ_DESTRUCT(&orteds_exit);
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
exit(rc);
|
||||
@ -711,9 +716,6 @@ static void terminated(int trigpipe, short event, void *arg)
|
||||
orte_proc_t **procs;
|
||||
orte_vpid_t i;
|
||||
|
||||
/* cleanup the trigger */
|
||||
OBJ_DESTRUCT(&orteds_exit);
|
||||
|
||||
/* clear the event timer */
|
||||
if (NULL != timeout_ev) {
|
||||
opal_evtimer_del(timeout_ev);
|
||||
@ -780,6 +782,10 @@ finish:
|
||||
/* cleanup our data server */
|
||||
orte_data_server_finalize();
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
OBJ_DESTRUCT(&orteds_exit);
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
if (orte_debug_flag) {
|
||||
@ -995,7 +1001,10 @@ static void abort_exit_callback(int fd, short ign, void *arg)
|
||||
* during finalize
|
||||
*/
|
||||
OBJ_RELEASE(jdata);
|
||||
|
||||
/* cleanup the triggers */
|
||||
OBJ_DESTRUCT(&orte_exit);
|
||||
OBJ_DESTRUCT(&orteds_exit);
|
||||
|
||||
orte_finalize();
|
||||
free(orterun_basename);
|
||||
ORTE_UPDATE_EXIT_STATUS(1);
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user