1
1

Clean up the PLM failed-to-start problem a little: ensure that the event is always defined so that we don't have to check for it when trying to trigger it, thus avoiding potential race conditions.

This commit was SVN r19755.
Этот коммит содержится в:
Ralph Castain 2008-10-16 14:58:32 +00:00
родитель 48c3de1865
Коммит b46d3e766e
2 изменённых файлов: 28 добавлений и 34 удалений

Просмотреть файл

@ -518,11 +518,6 @@ void orte_trigger_event(orte_trigger_event_t *trig)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trig->name));
/* if the trigger isn't setup yet, just return */
if (trig->channel < 0) {
return;
}
/* if we already fired it, don't do it again - this automatically
* records that we did fire it
*/
@ -888,11 +883,6 @@ void orte_trigger_event(orte_trigger_event_t *trig)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
trig->name));
/* if the trigger isn't setup yet, just return */
if (trig->channel < 0) {
return;
}
if (!opal_atomic_trylock(&trig->lock)) { /* returns 1 if already locked */
return;
}

Просмотреть файл

@ -441,6 +441,33 @@ int orterun(int argc, char *argv[])
return rc;
}
/* setup an event we can wait for that will tell
* us to terminate - both normal and abnormal
* termination will call us here. Use the
* same exit fd as the daemon does so that orted_comm
* can cause either of us to exit since we share that code
*/
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, "job_complete", job_completed))) {
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* setup an event that will
* trigger when the orteds are gone and tell the orteds that it is
* okay to finalize and exit, we are done with them.
* We set this up here in order to provide a way for us to
* wakeup and terminate should the daemons themselves fail to launch,
* and before we define signal handlers since they will call the
* exit event trigger!
*/
if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, "orted_exit", terminated))) {
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
goto DONE;
}
/** setup callbacks for abort signals - from this point
* forward, we need to abort in a manner that allows us
* to cleanup
@ -598,19 +625,6 @@ int orterun(int argc, char *argv[])
/* setup for debugging */
orte_debugger_init_before_spawn(jdata);
/* setup an event we can wait for that will tell
* us to terminate - both normal and abnormal
* termination will call us here. Use the
* same exit fd as the daemon does so that orted_comm
* can cause either of us to exit since we share that code
*/
if (ORTE_SUCCESS != (rc = orte_wait_event(&orterun_event, &orte_exit, "job_complete", job_completed))) {
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* Spawn the job */
rc = orte_plm.spawn(jdata);
@ -679,16 +693,6 @@ static void job_completed(int trigpipe, short event, void *arg)
/* if the debuggers were run, clean up */
orte_debugger_finalize();
/* the job is complete - now setup an event that will
* trigger when the orteds are gone and tell the orteds that it is
* okay to finalize and exit, we are done with them.
*/
if (ORTE_SUCCESS != (rc = orte_wait_event(&orteds_exit_event, &orteds_exit, "orted_exit", terminated))) {
orte_show_help("help-orterun.txt", "orterun:event-def-failed", true,
orterun_basename, ORTE_ERROR_NAME(rc));
goto DONE;
}
if (ORTE_SUCCESS != (rc = orte_plm.terminate_orteds())) {
/* since we know that the sends didn't completely go out,
* we know that the prior event will never fire. Add a timeout so