diff --git a/orte/orted/orted_main.c b/orte/orted/orted_main.c index 0c2ac1df79..6086c1a351 100644 --- a/orte/orted/orted_main.c +++ b/orte/orted/orted_main.c @@ -97,6 +97,7 @@ static opal_event_t sigusr2_handler; #endif /* __WINDOWS__ */ char *log_path = NULL; static opal_event_t *orted_exit_event; +static bool signals_set=false; static void shutdown_callback(int fd, short flags, void *arg); static void shutdown_signal(int fd, short flags, void *arg); @@ -374,7 +375,8 @@ int orte_daemon(int argc, char *argv[]) } /* otherwise, return with non-zero status */ - return ORTE_ERROR_DEFAULT_EXIT_CODE; + ret = ORTE_ERROR_DEFAULT_EXIT_CODE; + goto DONE; } } } @@ -410,7 +412,7 @@ int orte_daemon(int argc, char *argv[]) */ if (ORTE_SUCCESS != (ret = orte_wait_event(&orted_exit_event, &orte_exit, "orted_shutdown", shutdown_callback))) { ORTE_ERROR_LOG(ret); - return ret; + goto DONE; } /* setup the primary daemon command receive function */ @@ -418,7 +420,7 @@ int orte_daemon(int argc, char *argv[]) ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); - return ret; + goto DONE; } /* Set signal handlers to catch kill signals so we can properly clean up @@ -441,6 +443,8 @@ int orte_daemon(int argc, char *argv[]) opal_signal_add(&sigusr2_handler, NULL); #endif /* __WINDOWS__ */ + signals_set = true; + /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to @@ -451,7 +455,7 @@ int orte_daemon(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ret); - return ret; + goto DONE; } /* define a log file name in the session directory */ @@ -608,13 +612,13 @@ int orte_daemon(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } /* send our architecture */ if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &orte_process_info.arch, 1, OPAL_INT32))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } if (orte_timing) { int64_t secs, usecs; @@ -623,13 +627,13 @@ int orte_daemon(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } usecs = starttime.tv_usec; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } /* get and send our setup time */ gettimeofday(&setuptime, NULL); @@ -643,12 +647,12 @@ int orte_daemon(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } /* include the actual timestamp so the HNP can figure out how * long it took for this message to arrive @@ -657,20 +661,20 @@ int orte_daemon(int argc, char *argv[]) if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &secs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } usecs = setuptime.tv_usec; if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &usecs, 1, OPAL_INT64))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } } if (0 > (ret = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buffer, ORTE_RML_TAG_ORTED_CALLBACK, 0))) { ORTE_ERROR_LOG(ret); OBJ_RELEASE(buffer); - return ret; + goto DONE; } OBJ_RELEASE(buffer); /* done with this */ } @@ -688,6 +692,16 @@ int orte_daemon(int argc, char *argv[]) opal_event_dispatch(); /* should never get here, but if we do... */ +DONE: + if (signals_set) { + /* Release all local signal handlers */ + opal_event_del(&term_handler); + opal_event_del(&int_handler); +#ifndef __WINDOWS__ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); +#endif /* __WINDOWS__ */ + } /* cleanup any lingering session directories */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); @@ -696,7 +710,7 @@ int orte_daemon(int argc, char *argv[]) OBJ_DESTRUCT(&orte_exit); /* Finalize and clean up ourselves */ - ret = orte_finalize(); + orte_finalize(); return ret; } @@ -754,13 +768,15 @@ static void shutdown_callback(int fd, short flags, void *arg) exit(ORTE_ERROR_DEFAULT_EXIT_CODE); } - /* Release all local signal handlers */ - opal_event_del(&term_handler); - opal_event_del(&int_handler); + if (signals_set) { + /* Release all local signal handlers */ + opal_event_del(&term_handler); + opal_event_del(&int_handler); #ifndef __WINDOWS__ - opal_signal_del(&sigusr1_handler); - opal_signal_del(&sigusr2_handler); + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); #endif /* __WINDOWS__ */ + } /* Finalize and clean up ourselves */ ret = orte_finalize(); diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index c6adb7ab02..a93d51548a 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -123,6 +123,7 @@ static opal_event_t *abort_exit_event=NULL; static bool forcibly_die = false; static opal_event_t *timeout_ev=NULL; static bool profile_is_set = false; +static bool signals_set=false; /* * Globals @@ -603,6 +604,8 @@ int orterun(int argc, char *argv[]) } #endif /* __WINDOWS__ */ + signals_set = true; + /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); @@ -683,7 +686,8 @@ int orterun(int argc, char *argv[]) ORTE_ERROR_LOG(rc); orte_show_help("help-orterun.txt", "orterun:precondition", false, orterun_basename, NULL, NULL, rc); - return rc; + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; } /* setup to listen for commands sent specifically to me, even though I would probably @@ -695,13 +699,15 @@ int orterun(int argc, char *argv[]) ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (rc != ORTE_SUCCESS && rc != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(rc); - return rc; + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; } /* setup the data server */ if (ORTE_SUCCESS != (rc = orte_data_server_init())) { ORTE_ERROR_LOG(rc); - return rc; + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); + goto DONE; } /* if an uri for the ompi-server was provided, set the route */ @@ -729,7 +735,7 @@ int orterun(int argc, char *argv[]) orterun_basename, ompi_server, (long)orterun_globals.server_wait_timeout, ORTE_ERROR_NAME(rc)); - orte_exit_status = ORTE_ERROR_DEFAULT_EXIT_CODE; + ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); goto DONE; } } @@ -751,7 +757,22 @@ int orterun(int argc, char *argv[]) /* we only reach this point by jumping there due * to an error - so just cleanup and leave */ -DONE: +DONE: + if (signals_set) { + /* Remove the TERM and INT signal handlers */ + opal_signal_del(&term_handler); + opal_signal_del(&int_handler); +#ifndef __WINDOWS__ + /** Remove the USR signal handlers */ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } +#endif /* __WINDOWS__ */ + } + /* whack any lingering session directory files from our jobs */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); @@ -837,6 +858,21 @@ static void job_completed(int trigpipe, short event, void *arg) * all we can do is cleanly exit ourselves */ DONE: + if (signals_set) { + /* Remove the TERM and INT signal handlers */ + opal_signal_del(&term_handler); + opal_signal_del(&int_handler); +#ifndef __WINDOWS__ + /** Remove the USR signal handlers */ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } +#endif /* __WINDOWS__ */ + } + /* whack any lingering session directory files from our jobs */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); @@ -861,18 +897,20 @@ static void terminated(int trigpipe, short event, void *arg) free(timeout_ev); } - /* Remove the TERM and INT signal handlers */ - opal_signal_del(&term_handler); - opal_signal_del(&int_handler); + if (signals_set) { + /* Remove the TERM and INT signal handlers */ + opal_signal_del(&term_handler); + opal_signal_del(&int_handler); #ifndef __WINDOWS__ - /** Remove the USR signal handlers */ - opal_signal_del(&sigusr1_handler); - opal_signal_del(&sigusr2_handler); - if (orte_forward_job_control) { - opal_signal_del(&sigtstp_handler); - opal_signal_del(&sigcont_handler); - } + /** Remove the USR signal handlers */ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } #endif /* __WINDOWS__ */ + } /* get the daemon job object */ if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) { @@ -1133,6 +1171,20 @@ static void abort_exit_callback(int fd, short ign, void *arg) * the point of setting the job up, so there is nothing * to do but just clean ourselves up and exit */ + if (signals_set) { + /* Remove the TERM and INT signal handlers */ + opal_signal_del(&term_handler); + opal_signal_del(&int_handler); +#ifndef __WINDOWS__ + /** Remove the USR signal handlers */ + opal_signal_del(&sigusr1_handler); + opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } +#endif /* __WINDOWS__ */ + } orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); /* need to release jdata separately as it won't be