* Fix up signal handling code so that one function handles SIGUSR1 and SIGUSR2. This can be extended later if needed to include other signals we should forward to the user processes (TSTP and CONT, perhaps?).
* Since the signal handlers don't actually run in signal context, we can use malloc/fprintf/etc. So clean up some of the signal handler code so that we don't keep message buffers around for the life of the process.

This commit was SVN r10496.
Этот коммит содержится в:
родитель
b7715395cb
Коммит
4e8abb943b
@ -86,12 +86,6 @@ static struct opal_event sigusr2_handler;
|
|||||||
static orte_jobid_t jobid = ORTE_JOBID_MAX;
|
static orte_jobid_t jobid = ORTE_JOBID_MAX;
|
||||||
static orte_pointer_array_t *apps_pa;
|
static orte_pointer_array_t *apps_pa;
|
||||||
static bool wait_for_job_completion = true;
|
static bool wait_for_job_completion = true;
|
||||||
static char *abort_msg = NULL;
|
|
||||||
static size_t abort_msg_len = 0;
|
|
||||||
static char *sigusr1_msg = NULL;
|
|
||||||
static size_t sigusr1_msg_len = 0;
|
|
||||||
static char *sigusr2_msg = NULL;
|
|
||||||
static size_t sigusr2_msg_len = 0;
|
|
||||||
static char *orterun_basename = NULL;
|
static char *orterun_basename = NULL;
|
||||||
static int max_display_aborted = 1;
|
static int max_display_aborted = 1;
|
||||||
static int num_aborted = 0;
|
static int num_aborted = 0;
|
||||||
@ -256,9 +250,8 @@ opal_cmd_line_init_t cmd_line_init[] = {
|
|||||||
* Local functions
|
* Local functions
|
||||||
*/
|
*/
|
||||||
static void exit_callback(int fd, short event, void *arg);
|
static void exit_callback(int fd, short event, void *arg);
|
||||||
static void abort_signal_callback(int fd, short flags, void *arg);
|
static void abort_signal_callback(int fd, short event, void *arg);
|
||||||
static void sigusr1_callback(int fd, short flags, void *arg);
|
static void signal_forward_callback(int fd, short event, void *arg);
|
||||||
static void sigusr2_callback(int fd, short flags, void *arg);
|
|
||||||
static int create_app(int argc, char* argv[], orte_app_context_t **app,
|
static int create_app(int argc, char* argv[], orte_app_context_t **app,
|
||||||
bool *made_app, char ***app_env);
|
bool *made_app, char ***app_env);
|
||||||
static int init_globals(void);
|
static int init_globals(void);
|
||||||
@ -280,17 +273,9 @@ int orterun(int argc, char *argv[])
|
|||||||
mca_base_param_init();
|
mca_base_param_init();
|
||||||
orte_register_params(false);
|
orte_register_params(false);
|
||||||
|
|
||||||
/* Setup the abort message (for use in the signal handler) */
|
/* find our basename (the name of the executable) so that we can
|
||||||
|
use it in pretty-print error messages */
|
||||||
orterun_basename = opal_basename(argv[0]);
|
orterun_basename = opal_basename(argv[0]);
|
||||||
asprintf(&abort_msg, "%s: killing job...\n", orterun_basename);
|
|
||||||
abort_msg_len = strlen(abort_msg);
|
|
||||||
|
|
||||||
/** Setup the user signal message (for use in the signal handler) */
|
|
||||||
asprintf(&sigusr1_msg, "%s: received SIGUSR1 signal\n", orterun_basename);
|
|
||||||
sigusr1_msg_len = strlen(sigusr1_msg);
|
|
||||||
asprintf(&sigusr2_msg, "%s: received SIGUSR2 signal\n", orterun_basename);
|
|
||||||
sigusr2_msg_len = strlen(sigusr2_msg);
|
|
||||||
|
|
||||||
/* Check for some "global" command line params */
|
/* Check for some "global" command line params */
|
||||||
|
|
||||||
@ -410,18 +395,18 @@ int orterun(int argc, char *argv[])
|
|||||||
|
|
||||||
/** setup callbacks for abort signals */
|
/** setup callbacks for abort signals */
|
||||||
opal_signal_set(&term_handler, SIGTERM,
|
opal_signal_set(&term_handler, SIGTERM,
|
||||||
abort_signal_callback, NULL);
|
abort_signal_callback, &term_handler);
|
||||||
opal_signal_add(&term_handler, NULL);
|
opal_signal_add(&term_handler, NULL);
|
||||||
opal_signal_set(&int_handler, SIGINT,
|
opal_signal_set(&int_handler, SIGINT,
|
||||||
abort_signal_callback, NULL);
|
abort_signal_callback, &int_handler);
|
||||||
opal_signal_add(&int_handler, NULL);
|
opal_signal_add(&int_handler, NULL);
|
||||||
|
|
||||||
/** setup callbacks for user signals */
|
/** setup callbacks for signals we should forward */
|
||||||
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
opal_signal_set(&sigusr1_handler, SIGUSR1,
|
||||||
sigusr1_callback, NULL);
|
signal_forward_callback, &sigusr1_handler);
|
||||||
opal_signal_add(&sigusr1_handler, NULL);
|
opal_signal_add(&sigusr1_handler, NULL);
|
||||||
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
opal_signal_set(&sigusr2_handler, SIGUSR2,
|
||||||
sigusr2_callback, NULL);
|
signal_forward_callback, &sigusr2_handler);
|
||||||
opal_signal_add(&sigusr2_handler, NULL);
|
opal_signal_add(&sigusr2_handler, NULL);
|
||||||
|
|
||||||
orte_totalview_init_before_spawn();
|
orte_totalview_init_before_spawn();
|
||||||
@ -476,7 +461,6 @@ int orterun(int argc, char *argv[])
|
|||||||
free(apps);
|
free(apps);
|
||||||
OBJ_RELEASE(apps_pa);
|
OBJ_RELEASE(apps_pa);
|
||||||
orte_finalize();
|
orte_finalize();
|
||||||
free(abort_msg);
|
|
||||||
free(orterun_basename);
|
free(orterun_basename);
|
||||||
free(proc_infos);
|
free(proc_infos);
|
||||||
return rc;
|
return rc;
|
||||||
@ -719,7 +703,7 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
if (0 != signalled++) {
|
if (0 != signalled++) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
write(2, abort_msg, abort_msg_len);
|
fprintf(stderr, "%s: killing job...", orterun_basename);
|
||||||
|
|
||||||
if (jobid != ORTE_JOBID_MAX) {
|
if (jobid != ORTE_JOBID_MAX) {
|
||||||
ret = orte_rmgr.terminate_job(jobid);
|
ret = orte_rmgr.terminate_job(jobid);
|
||||||
@ -738,47 +722,22 @@ static void abort_signal_callback(int fd, short flags, void *arg)
|
|||||||
/**
|
/**
|
||||||
* Pass user signals to the remote application processes
|
* Pass user signals to the remote application processes
|
||||||
*/
|
*/
|
||||||
|
static void signal_forward_callback(int fd, short event, void *arg)
|
||||||
static void sigusr1_callback(int fd, short flags, void *arg)
|
|
||||||
{
|
{
|
||||||
int ret;
|
struct opal_event *signal = arg;
|
||||||
static int signalled = 0;
|
int signum, ret;
|
||||||
|
|
||||||
OPAL_TRACE(1);
|
OPAL_TRACE(1);
|
||||||
|
|
||||||
if (0 != signalled++) { /** protect against multiple entry */
|
signum = OPAL_EVENT_SIGNAL(signal);
|
||||||
return;
|
fprintf(stderr, "%s: Forwarding signal %d to job",
|
||||||
}
|
orterun_basename, signum);
|
||||||
|
|
||||||
write (2, sigusr1_msg, sigusr1_msg_len);
|
|
||||||
|
|
||||||
/** send the signal out to the processes */
|
/** send the signal out to the processes */
|
||||||
|
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, signum))) {
|
||||||
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR1))) {
|
fprintf(stderr, "Signal %d could not be sent to the job (returned %d)",
|
||||||
fprintf(stderr, "SIGUSR1 could not be sent to the job\n");
|
signum, ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void sigusr2_callback(int fd, short flags, void *arg)
|
|
||||||
{
|
|
||||||
int ret;
|
|
||||||
static int signalled = 0;
|
|
||||||
|
|
||||||
OPAL_TRACE(1);
|
|
||||||
|
|
||||||
if (0 != signalled++) { /** protect against multiple entry */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
write (2, sigusr2_msg, sigusr2_msg_len);
|
|
||||||
|
|
||||||
/** send the signal out to the processes */
|
|
||||||
|
|
||||||
if (ORTE_SUCCESS != (ret = orte_rmgr.signal_job(jobid, SIGUSR2))) {
|
|
||||||
fprintf(stderr, "SIGUSR2 could not be sent to the job\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Загрузка…
Ссылка в новой задаче
Block a user