orterun: forward SIGUSR1/SIGUSR2 through one generic callback

Drops the pre-formatted abort_msg / sigusr1_msg / sigusr2_msg buffers and
replaces sigusr1_callback() / sigusr2_callback() with a single
signal_forward_callback().  Each opal_event is now registered as its own
callback argument, so the callback recovers the signal number with
OPAL_EVENT_SIGNAL() and hands it to orte_rmgr.signal_job().  The
"signalled" multiple-entry guard of the old per-signal callbacks is
removed, so every delivery is forwarded.  All messages go through
fprintf(stderr, ...) and are newline-terminated.

diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c
index 3d61802d98..5ea0c912e2 100644
--- a/orte/tools/orterun/orterun.c
+++ b/orte/tools/orterun/orterun.c
@@ -86,12 +86,6 @@ static struct opal_event sigusr2_handler;
 static orte_jobid_t jobid = ORTE_JOBID_MAX;
 static orte_pointer_array_t *apps_pa;
 static bool wait_for_job_completion = true;
-static char *abort_msg = NULL;
-static size_t abort_msg_len = 0;
-static char *sigusr1_msg = NULL;
-static size_t sigusr1_msg_len = 0;
-static char *sigusr2_msg = NULL;
-static size_t sigusr2_msg_len = 0;
 static char *orterun_basename = NULL;
 static int max_display_aborted = 1;
 static int num_aborted = 0;
@@ -256,9 +250,8 @@ opal_cmd_line_init_t cmd_line_init[] = {
  * Local functions
  */
 static void exit_callback(int fd, short event, void *arg);
-static void abort_signal_callback(int fd, short flags, void *arg);
-static void sigusr1_callback(int fd, short flags, void *arg);
-static void sigusr2_callback(int fd, short flags, void *arg);
+static void abort_signal_callback(int fd, short event, void *arg);
+static void signal_forward_callback(int fd, short event, void *arg);
 static int create_app(int argc, char* argv[], orte_app_context_t **app,
                       bool *made_app, char ***app_env);
 static int init_globals(void);
@@ -280,17 +273,9 @@ int orterun(int argc, char *argv[])
     mca_base_param_init();
     orte_register_params(false);
 
-    /* Setup the abort message (for use in the signal handler) */
-
+    /* find our basename (the name of the executable) so that we can
+       use it in pretty-print error messages */
     orterun_basename = opal_basename(argv[0]);
-    asprintf(&abort_msg, "%s: killing job...\n", orterun_basename);
-    abort_msg_len = strlen(abort_msg);
-
-    /** Setup the user signal message (for use in the signal handler) */
-    asprintf(&sigusr1_msg, "%s: received SIGUSR1 signal\n", orterun_basename);
-    sigusr1_msg_len = strlen(sigusr1_msg);
-    asprintf(&sigusr2_msg, "%s: received SIGUSR2 signal\n", orterun_basename);
-    sigusr2_msg_len = strlen(sigusr2_msg);
 
     /* Check for some "global" command line params */
 
@@ -410,18 +395,18 @@ int orterun(int argc, char *argv[])
 
     /** setup callbacks for abort signals */
     opal_signal_set(&term_handler, SIGTERM,
-                    abort_signal_callback, NULL);
+                    abort_signal_callback, &term_handler);
     opal_signal_add(&term_handler, NULL);
     opal_signal_set(&int_handler, SIGINT,
-                    abort_signal_callback, NULL);
+                    abort_signal_callback, &int_handler);
     opal_signal_add(&int_handler, NULL);
 
-    /** setup callbacks for user signals */
+    /** setup callbacks for signals we should forward */
     opal_signal_set(&sigusr1_handler, SIGUSR1,
-                    sigusr1_callback, NULL);
+                    signal_forward_callback, &sigusr1_handler);
     opal_signal_add(&sigusr1_handler, NULL);
     opal_signal_set(&sigusr2_handler, SIGUSR2,
-                    sigusr2_callback, NULL);
+                    signal_forward_callback, &sigusr2_handler);
     opal_signal_add(&sigusr2_handler, NULL);
 
     orte_totalview_init_before_spawn();
@@ -476,7 +461,6 @@ int orterun(int argc, char *argv[])
     free(apps);
     OBJ_RELEASE(apps_pa);
     orte_finalize();
-    free(abort_msg);
     free(orterun_basename);
     free(proc_infos);
     return rc;
@@ -719,7 +703,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
     if (0 != signalled++) {
         return;
     }
-    write(2, abort_msg, abort_msg_len);
+    fprintf(stderr, "%s: killing job...\n", orterun_basename);
 
     if (jobid != ORTE_JOBID_MAX) {
         ret = orte_rmgr.terminate_job(jobid);
@@ -738,47 +722,24 @@ static void abort_signal_callback(int fd, short flags, void *arg)
 /**
  * Pass user signals to the remote application processes
  */
-
-static void sigusr1_callback(int fd, short flags, void *arg)
+static void signal_forward_callback(int fd, short event, void *arg)
 {
-    int ret;
-    static int signalled = 0;
+    struct opal_event *sig_event = arg;
+    int signum, ret;
 
     OPAL_TRACE(1);
 
-    if (0 != signalled++) { /** protect against multiple entry */
-        return;
-    }
-
-    write (2, sigusr1_msg, sigusr1_msg_len);
+    /* each handler was registered with its own opal_event as the
+       callback argument, so the event tells us which signal fired */
+    signum = OPAL_EVENT_SIGNAL(sig_event);
+    fprintf(stderr, "%s: Forwarding signal %d to job\n",
+            orterun_basename, signum);
 
     /** send the signal out to the processes */
-
-    if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR1))) {
-        fprintf(stderr, "SIGUSR1 could not be sent to the job\n");
+    if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, signum))) {
+        fprintf(stderr, "Signal %d could not be sent to the job (returned %d)\n",
+                signum, ret);
     }
-
-}
-
-static void sigusr2_callback(int fd, short flags, void *arg)
-{
-    int ret;
-    static int signalled = 0;
-
-    OPAL_TRACE(1);
-
-    if (0 != signalled++) { /** protect against multiple entry */
-        return;
-    }
-
-    write (2, sigusr2_msg, sigusr2_msg_len);
-
-    /** send the signal out to the processes */
-
-    if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR2))) {
-        fprintf(stderr, "SIGUSR2 could not be sent to the job\n");
-    }
-
 }
 
 