1
1

* fix up signal handling code so that one function handles SIGUSR1 and
  SIGUSR2.  This can be extended later if needed to include other
  signals we should forward to the user processes (TSTP and CONT,
  perhaps?)
* Since the signal handlers don't actually run in signal context, we
  can use malloc/fprintf/etc.  So clean up some of the signal handler
  code so that we don't keep message buffers around for the life of
  the process

This commit was SVN r10496.
Этот коммит содержится в:
Brian Barrett 2006-06-26 15:12:52 +00:00
родитель b7715395cb
Коммит 4e8abb943b

Просмотреть файл

@ -86,12 +86,6 @@ static struct opal_event sigusr2_handler;
static orte_jobid_t jobid = ORTE_JOBID_MAX;
static orte_pointer_array_t *apps_pa;
static bool wait_for_job_completion = true;
static char *abort_msg = NULL;
static size_t abort_msg_len = 0;
static char *sigusr1_msg = NULL;
static size_t sigusr1_msg_len = 0;
static char *sigusr2_msg = NULL;
static size_t sigusr2_msg_len = 0;
static char *orterun_basename = NULL;
static int max_display_aborted = 1;
static int num_aborted = 0;
@ -256,9 +250,8 @@ opal_cmd_line_init_t cmd_line_init[] = {
* Local functions
*/
static void exit_callback(int fd, short event, void *arg);
static void abort_signal_callback(int fd, short flags, void *arg);
static void sigusr1_callback(int fd, short flags, void *arg);
static void sigusr2_callback(int fd, short flags, void *arg);
static void abort_signal_callback(int fd, short event, void *arg);
static void signal_forward_callback(int fd, short event, void *arg);
static int create_app(int argc, char* argv[], orte_app_context_t **app,
bool *made_app, char ***app_env);
static int init_globals(void);
@ -280,17 +273,9 @@ int orterun(int argc, char *argv[])
mca_base_param_init();
orte_register_params(false);
/* Setup the abort message (for use in the signal handler) */
/* find our basename (the name of the executable) so that we can
use it in pretty-print error messages */
orterun_basename = opal_basename(argv[0]);
asprintf(&abort_msg, "%s: killing job...\n", orterun_basename);
abort_msg_len = strlen(abort_msg);
/** Setup the user signal message (for use in the signal handler) */
asprintf(&sigusr1_msg, "%s: received SIGUSR1 signal\n", orterun_basename);
sigusr1_msg_len = strlen(sigusr1_msg);
asprintf(&sigusr2_msg, "%s: received SIGUSR2 signal\n", orterun_basename);
sigusr2_msg_len = strlen(sigusr2_msg);
/* Check for some "global" command line params */
@ -410,18 +395,18 @@ int orterun(int argc, char *argv[])
/** setup callbacks for abort signals */
opal_signal_set(&term_handler, SIGTERM,
abort_signal_callback, NULL);
abort_signal_callback, &term_handler);
opal_signal_add(&term_handler, NULL);
opal_signal_set(&int_handler, SIGINT,
abort_signal_callback, NULL);
abort_signal_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
/** setup callbacks for user signals */
/** setup callbacks for signals we should forward */
opal_signal_set(&sigusr1_handler, SIGUSR1,
sigusr1_callback, NULL);
signal_forward_callback, &sigusr1_handler);
opal_signal_add(&sigusr1_handler, NULL);
opal_signal_set(&sigusr2_handler, SIGUSR2,
sigusr2_callback, NULL);
signal_forward_callback, &sigusr2_handler);
opal_signal_add(&sigusr2_handler, NULL);
orte_totalview_init_before_spawn();
@ -476,7 +461,6 @@ int orterun(int argc, char *argv[])
free(apps);
OBJ_RELEASE(apps_pa);
orte_finalize();
free(abort_msg);
free(orterun_basename);
free(proc_infos);
return rc;
@ -719,7 +703,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
if (0 != signalled++) {
return;
}
write(2, abort_msg, abort_msg_len);
fprintf(stderr, "%s: killing job...", orterun_basename);
if (jobid != ORTE_JOBID_MAX) {
ret = orte_rmgr.terminate_job(jobid);
@ -738,47 +722,22 @@ static void abort_signal_callback(int fd, short flags, void *arg)
/**
* Pass user signals to the remote application processes
*/
static void sigusr1_callback(int fd, short flags, void *arg)
static void signal_forward_callback(int fd, short event, void *arg)
{
int ret;
static int signalled = 0;
struct opal_event *signal = arg;
int signum, ret;
OPAL_TRACE(1);
if (0 != signalled++) { /** protect against multiple entry */
return;
}
write (2, sigusr1_msg, sigusr1_msg_len);
signum = OPAL_EVENT_SIGNAL(signal);
fprintf(stderr, "%s: Forwarding signal %d to job",
orterun_basename, signum);
/** send the signal out to the processes */
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR1))) {
fprintf(stderr, "SIGUSR1 could not be sent to the job\n");
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, signum))) {
fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
signum, ret);
}
}
static void sigusr2_callback(int fd, short flags, void *arg)
{
int ret;
static int signalled = 0;
OPAL_TRACE(1);
if (0 != signalled++) { /** protect against multiple entry */
return;
}
write (2, sigusr2_msg, sigusr2_msg_len);
/** send the signal out to the processes */
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR2))) {
fprintf(stderr, "SIGUSR2 could not be sent to the job\n");
}
}