1
1

Ensure that signals are de-trapped before exiting to stop the $#@@#$ event library from "asserting".

This commit was SVN r20630.
Этот коммит содержится в:
Ralph Castain 2009-02-25 03:10:21 +00:00
родитель dcff523244
Коммит 85a9a2e6d8
2 изменённых файла: 102 добавления и 34 удаления

Просмотреть файл

@ -97,6 +97,7 @@ static opal_event_t sigusr2_handler;
#endif /* __WINDOWS__ */
char *log_path = NULL;
static opal_event_t *orted_exit_event;
static bool signals_set=false;
static void shutdown_callback(int fd, short flags, void *arg);
static void shutdown_signal(int fd, short flags, void *arg);
@ -374,7 +375,8 @@ int orte_daemon(int argc, char *argv[])
}
/* otherwise, return with non-zero status */
return ORTE_ERROR_DEFAULT_EXIT_CODE;
ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
}
}
@ -410,7 +412,7 @@ int orte_daemon(int argc, char *argv[])
*/
if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, "orted_shutdown", shutdown_callback))) {
ORTE_ERROR_LOG(ret);
return ret;
goto DONE;
}
/* setup the primary daemon command receive function */
@ -418,7 +420,7 @@ int orte_daemon(int argc, char *argv[])
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(ret);
return ret;
goto DONE;
}
/* Set signal handlers to catch kill signals so we can properly clean up
@ -441,6 +443,8 @@ int orte_daemon(int argc, char *argv[])
opal_signal_add(&sigusr2_handler, NULL);
#endif /* __WINDOWS__ */
signals_set = true;
/* setup stdout/stderr */
if (orte_debug_daemons_file_flag) {
/* if we are debugging to a file, then send stdout/stderr to
@ -451,7 +455,7 @@ int orte_daemon(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
ORTE_PROC_MY_NAME->jobid))) {
ORTE_ERROR_LOG(ret);
return ret;
goto DONE;
}
/* define a log file name in the session directory */
@ -608,13 +612,13 @@ int orte_daemon(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
/* send our architecture */
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &orte_process_info.arch, 1, OPAL_INT32))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
if (orte_timing) {
int64_t secs, usecs;
@ -623,13 +627,13 @@ int orte_daemon(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
usecs = starttime.tv_usec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
/* get and send our setup time */
gettimeofday(&setuptime, NULL);
@ -643,12 +647,12 @@ int orte_daemon(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
/* include the actual timestamp so the HNP can figure out how
* long it took for this message to arrive
@ -657,20 +661,20 @@ int orte_daemon(int argc, char *argv[])
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
usecs = setuptime.tv_usec;
if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
}
if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer,
ORTE_RML_TAG_ORTED_CALLBACK, 0))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(buffer);
return ret;
goto DONE;
}
OBJ_RELEASE(buffer); /* done with this */
}
@ -688,6 +692,16 @@ int orte_daemon(int argc, char *argv[])
opal_event_dispatch();
/* should never get here, but if we do... */
DONE:
if (signals_set) {
/* Release all local signal handlers */
opal_event_del(&term_handler);
opal_event_del(&int_handler);
#ifndef __WINDOWS__
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
#endif /* __WINDOWS__ */
}
/* cleanup any lingering session directories */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
@ -696,7 +710,7 @@ int orte_daemon(int argc, char *argv[])
OBJ_DESTRUCT(&orte_exit);
/* Finalize and clean up ourselves */
ret = orte_finalize();
orte_finalize();
return ret;
}
@ -754,13 +768,15 @@ static void shutdown_callback(int fd, short flags, void *arg)
exit(ORTE_ERROR_DEFAULT_EXIT_CODE);
}
/* Release all local signal handlers */
opal_event_del(&term_handler);
opal_event_del(&int_handler);
if (signals_set) {
/* Release all local signal handlers */
opal_event_del(&term_handler);
opal_event_del(&int_handler);
#ifndef __WINDOWS__
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
#endif /* __WINDOWS__ */
}
/* Finalize and clean up ourselves */
ret = orte_finalize();

Просмотреть файл

@ -123,6 +123,7 @@ static opal_event_t *abort_exit_event=NULL;
static bool forcibly_die = false;
static opal_event_t *timeout_ev=NULL;
static bool profile_is_set = false;
static bool signals_set=false;
/*
* Globals
@ -603,6 +604,8 @@ int orterun(int argc, char *argv[])
}
#endif /* __WINDOWS__ */
signals_set = true;
/* we are an hnp, so update the contact info field for later use */
orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
@ -683,7 +686,8 @@ int orterun(int argc, char *argv[])
ORTE_ERROR_LOG(rc);
orte_show_help("help-orterun.txt", "orterun:precondition", false,
orterun_basename, NULL, NULL, rc);
return rc;
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* setup to listen for commands sent specifically to me, even though I would probably
@ -695,13 +699,15 @@ int orterun(int argc, char *argv[])
ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL);
if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) {
ORTE_ERROR_LOG(rc);
return rc;
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* setup the data server */
if (ORTE_SUCCESS != (rc = orte_data_server_init())) {
ORTE_ERROR_LOG(rc);
return rc;
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
/* if an uri for the ompi-server was provided, set the route */
@ -729,7 +735,7 @@ int orterun(int argc, char *argv[])
orterun_basename, ompi_server,
(long)orterun_globals.server_wait_timeout,
ORTE_ERROR_NAME(rc));
orte_exit_status = ORTE_ERROR_DEFAULT_EXIT_CODE;
ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
goto DONE;
}
}
@ -752,6 +758,21 @@ int orterun(int argc, char *argv[])
* to an error - so just cleanup and leave
*/
DONE:
if (signals_set) {
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
#ifndef __WINDOWS__
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
#endif /* __WINDOWS__ */
}
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
@ -837,6 +858,21 @@ static void job_completed(int trigpipe, short event, void *arg)
* all we can do is cleanly exit ourselves
*/
DONE:
if (signals_set) {
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
#ifndef __WINDOWS__
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
#endif /* __WINDOWS__ */
}
/* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
@ -861,18 +897,20 @@ static void terminated(int trigpipe, short event, void *arg)
free(timeout_ev);
}
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
if (signals_set) {
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
#ifndef __WINDOWS__
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
#endif /* __WINDOWS__ */
}
/* get the daemon job object */
if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
@ -1133,6 +1171,20 @@ static void abort_exit_callback(int fd, short ign, void *arg)
* the point of setting the job up, so there is nothing
* to do but just clean ourselves up and exit
*/
if (signals_set) {
/* Remove the TERM and INT signal handlers */
opal_signal_del(&term_handler);
opal_signal_del(&int_handler);
#ifndef __WINDOWS__
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
#endif /* __WINDOWS__ */
}
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
/* need to release jdata separately as it won't be