diff --git a/orte/mca/ess/base/ess_base_frame.c b/orte/mca/ess/base/ess_base_frame.c index 0eba2c98e9..c74075f289 100644 --- a/orte/mca/ess/base/ess_base_frame.c +++ b/orte/mca/ess/base/ess_base_frame.c @@ -161,13 +161,34 @@ static struct known_signal known_signals[] = { {SIGHUP, "SIGHUP", false}, {SIGINT, "SIGINT", false}, {SIGKILL, "SIGKILL", false}, + {SIGPIPE, "SIGPIPE", false}, +#ifdef SIGQUIT + {SIGQUIT, "SIGQUIT", false}, +#endif +#ifdef SIGTRAP + {SIGTRAP, "SIGTRAP", true}, +#endif +#ifdef SIGTSTP + {SIGTSTP, "SIGTSTP", true}, +#endif +#ifdef SIGABRT + {SIGABRT, "SIGABRT", true}, +#endif +#ifdef SIGCONT + {SIGCONT, "SIGCONT", true}, +#endif #ifdef SIGSYS {SIGSYS, "SIGSYS", true}, #endif #ifdef SIGXCPU {SIGXCPU, "SIGXCPU", true}, #endif +#ifdef SIGXFSZ {SIGXFSZ, "SIGXFSZ", true}, +#endif +#ifdef SIGALRM + {SIGALRM, "SIGALRM", true}, +#endif #ifdef SIGVTALRM {SIGVTALRM, "SIGVTALRM", true}, #endif diff --git a/orte/mca/ess/base/ess_base_std_orted.c b/orte/mca/ess/base/ess_base_std_orted.c index 57b9d2e7a7..d90eb7761d 100644 --- a/orte/mca/ess/base/ess_base_std_orted.c +++ b/orte/mca/ess/base/ess_base_std_orted.c @@ -88,12 +88,11 @@ static bool signals_set=false; static opal_event_t term_handler; static opal_event_t int_handler; static opal_event_t epipe_handler; -static opal_event_t sigusr1_handler; -static opal_event_t sigusr2_handler; static char *log_path = NULL; static void shutdown_signal(int fd, short flags, void *arg); -static void signal_callback(int fd, short flags, void *arg); static void epipe_signal_callback(int fd, short flags, void *arg); +static void signal_forward_callback(int fd, short event, void *arg); +static opal_event_t *forward_signals_events = NULL; static void setup_sighandler(int signal, opal_event_t *ev, opal_event_cbfunc_t cbfunc) @@ -119,6 +118,8 @@ int orte_ess_base_orted_setup(void) unsigned i, j; orte_topology_t *t; opal_list_t transports; + orte_ess_base_signal_t *sig; + int idx; /* my name is set, xfer it to the OPAL layer */ orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME; @@ -128,6 +129,7 @@ int orte_ess_base_orted_setup(void) opal_proc_local_set(&orte_process_info.super); plm_in_use = false; + /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up @@ -135,11 +137,23 @@ int orte_ess_base_orted_setup(void) */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); - /** setup callbacks for signals we should ignore */ - setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); - setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); + /** setup callbacks for signals we should forward */ + if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) { + forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx); + if (NULL == forward_signals_events) { + ret = ORTE_ERR_OUT_OF_RESOURCE; + error = "unable to malloc"; + goto error; + } + idx = 0; + OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) { + setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback); + ++idx; + } + } signals_set = true; + /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { @@ -653,14 +667,24 @@ int orte_ess_base_orted_setup(void) int orte_ess_base_orted_finalize(void) { + orte_ess_base_signal_t *sig; + unsigned int i; + if (signals_set) { - /* Release all local signal handlers */ opal_event_del(&epipe_handler); opal_event_del(&term_handler); opal_event_del(&int_handler); - opal_event_signal_del(&sigusr1_handler); - opal_event_signal_del(&sigusr2_handler); + /** Remove the USR signal handlers */ + i = 0; + OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) { + opal_event_signal_del(forward_signals_events + i); + ++i; + } + free (forward_signals_events); + forward_signals_events = NULL; + signals_set = false; } + /* cleanup */ if (NULL != log_path) { unlink(log_path); @@ -717,7 +741,51 @@ static void epipe_signal_callback(int fd, short flags, void *arg) return; } -static void signal_callback(int fd, short event, void *arg) +/* Pass user signals to the local application processes */ +static void signal_forward_callback(int fd, short event, void *arg) { - /* just ignore these signals */ + opal_event_t *signal = (opal_event_t*)arg; + int32_t signum, rc; + opal_buffer_t *cmd; + orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS; + orte_jobid_t job = ORTE_JOBID_WILDCARD; + + signum = OPAL_EVENT_SIGNAL(signal); + if (!orte_execute_quiet){ + fprintf(stderr, "%s: Forwarding signal %d to job\n", + orte_basename, signum); + } + + cmd = OBJ_NEW(opal_buffer_t); + + /* pack the command */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return; + } + + /* pack the jobid */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return; + } + + /* pack the signal */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &signum, 1, OPAL_INT32))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + return; + } + + /* send it to ourselves */ + if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, + ORTE_PROC_MY_NAME, cmd, + ORTE_RML_TAG_DAEMON, + NULL, NULL))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(cmd); + } + } diff --git a/orte/mca/ess/slurm/ess_slurm_module.c b/orte/mca/ess/slurm/ess_slurm_module.c index 7982fe10aa..59f23099b0 100644 --- a/orte/mca/ess/slurm/ess_slurm_module.c +++ b/orte/mca/ess/slurm/ess_slurm_module.c @@ -59,24 +59,10 @@ orte_ess_base_module_t orte_ess_slurm_module = { NULL /* ft_event */ }; -static void signal_forward_callback(int fd, short event, void *arg); -static opal_event_t *forward_signals_events = NULL; -static bool signals_set=false; - -static void setup_sighandler(int signal, opal_event_t *ev, - opal_event_cbfunc_t cbfunc) -{ - opal_event_signal_set(orte_event_base, ev, signal, cbfunc, ev); - opal_event_set_priority(ev, ORTE_ERROR_PRI); - opal_event_signal_add(ev, NULL); -} - static int rte_init(void) { int ret; char *error = NULL; - orte_ess_base_signal_t *sig; - int idx; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { @@ -91,29 +77,11 @@ static int rte_init(void) * default procedure */ if (ORTE_PROC_IS_DAEMON) { - /** setup callbacks for signals we should forward */ - if (0 < (idx = opal_list_get_size(&orte_ess_base_signals))) { - forward_signals_events = (opal_event_t*)malloc(sizeof(opal_event_t) * idx); - if (NULL == forward_signals_events) { - ret = ORTE_ERR_OUT_OF_RESOURCE; - error = "unable to malloc"; - goto error; - } - idx = 0; - OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) { - setup_sighandler(sig->signal, forward_signals_events + idx, signal_forward_callback); - ++idx; - } - } - signals_set = true; - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } - /* setup the signal handlers */ - return ORTE_SUCCESS; } @@ -145,23 +113,9 @@ error: static int rte_finalize(void) { int ret; - orte_ess_base_signal_t *sig; - unsigned int i; /* if I am a daemon, finalize using the default procedure */ if (ORTE_PROC_IS_DAEMON) { - if (signals_set) { - /** Remove the USR signal handlers */ - i = 0; - OPAL_LIST_FOREACH(sig, &orte_ess_base_signals, orte_ess_base_signal_t) { - opal_event_signal_del(forward_signals_events + i); - ++i; - } - free (forward_signals_events); - forward_signals_events = NULL; - signals_set = false; - } - if (ORTE_SUCCESS != (ret = orte_ess_base_orted_finalize())) { ORTE_ERROR_LOG(ret); return ret; @@ -246,52 +200,3 @@ static int slurm_set_name(void) return ORTE_SUCCESS; } - -/* Pass user signals to the local application processes */ -static void signal_forward_callback(int fd, short event, void *arg) -{ - opal_event_t *signal = (opal_event_t*)arg; - int32_t signum, rc; - opal_buffer_t *cmd; - orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS; - orte_jobid_t job = ORTE_JOBID_WILDCARD; - - signum = OPAL_EVENT_SIGNAL(signal); - if (!orte_execute_quiet){ - fprintf(stderr, "%s: Forwarding signal %d to job\n", - orte_basename, signum); - } - - cmd = OBJ_NEW(opal_buffer_t); - - /* pack the command */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &command, 1, ORTE_DAEMON_CMD))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return; - } - - /* pack the jobid */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &job, 1, ORTE_JOBID))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return; - } - - /* pack the signal */ - if (ORTE_SUCCESS != (rc = opal_dss.pack(cmd, &signum, 1, OPAL_INT32))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - return; - } - - /* send it to ourselves */ - if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit, - ORTE_PROC_MY_NAME, cmd, - ORTE_RML_TAG_DAEMON, - NULL, NULL))) { - ORTE_ERROR_LOG(rc); - OBJ_RELEASE(cmd); - } - -}