1
1

Add the ability to forward SIGTSTP (converted to SIGSTOP) and
SIGCONT to the a.outs.  By default, they are not forwarded and
the behavior remains as it has always been.  However, if one
runs with --mca orte_forward_job_control 1, then mpirun will
catch those two signals and forward them to the orteds which
will deliver them to the a.outs.  We have had requests for
this feature.

This commit was SVN r20391.
This commit is contained in:
Rolf vandeVaart 2009-01-30 18:50:10 +00:00
parent 5e6d3ba289
commit 0704b98668
6 changed files with 55 additions and 9 deletions

View File

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -2290,6 +2290,12 @@ int orte_odls_base_default_kill_local_procs(orte_jobid_t job, bool set_state,
goto MOVEON;
}
/* First send a SIGCONT in case the process is in a stopped state.
If it is in a stopped state and we do not first change it to
running, then SIGTERM will not get delivered. Ignore return
value. */
kill_local(child->pid, SIGCONT);
/* Send a sigterm to the process. If we get ESRCH back, that
means the process is already dead, so just move on. */

View File

@ -12,6 +12,7 @@
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -393,10 +394,6 @@ static int process_commands(orte_process_name_t* sender,
/**** SIGNAL_LOCAL_PROCS ****/
case ORTE_DAEMON_SIGNAL_LOCAL_PROCS:
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received signal_local_procs",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
/* unpack the jobid */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) {
@ -410,7 +407,22 @@ static int process_commands(orte_process_name_t* sender,
ORTE_ERROR_LOG(ret);
goto CLEANUP;
}
/* Convert SIGTSTP to SIGSTOP so we can suspend a.out */
if (SIGTSTP == signal) {
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: converted SIGTSTP to SIGSTOP before delivering",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
signal = SIGSTOP;
}
if (orte_debug_daemons_flag) {
opal_output(0, "%s orted_cmd: received signal_local_procs, delivering signal %d",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal);
}
/* signal them */
if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) {
ORTE_ERROR_LOG(ret);

View File

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -109,6 +110,9 @@ opal_list_t orte_local_children;
/* list of job data for local children on a daemon */
opal_list_t orte_local_jobdata;
/* whether or not to forward SIGTSTP and SIGCONT signals */
bool orte_forward_job_control;
#endif /* !ORTE_DISABLE_FULL_RTE */
int orte_debug_output = -1;

View File

@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
@ -482,6 +482,8 @@ ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap;
ORTE_DECLSPEC extern opal_list_t orte_local_children;
/* list of job data for local children on a daemon */
ORTE_DECLSPEC extern opal_list_t orte_local_jobdata;
/* whether or not to forward SIGTSTP and SIGCONT signals */
ORTE_DECLSPEC extern bool orte_forward_job_control;
#endif /* ORTE_DISABLE_FULL_SUPPORT */

View File

@ -10,6 +10,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@ -244,6 +245,13 @@ int orte_register_params(void)
"Whether or not an allocation by a resource manager is required [default: no]",
false, false, (int)false, &value);
orte_allocation_required = OPAL_INT_TO_BOOL(value);
/* whether or not to forward SIGTSTP and SIGCONT signals */
mca_base_param_reg_int_name("orte", "forward_job_control",
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
false, false,
(int) false, &value);
orte_forward_job_control = OPAL_INT_TO_BOOL(value);
#endif /* ORTE_DISABLE_FULL_SUPPORT */

View File

@ -11,7 +11,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@ -101,6 +101,8 @@ static struct opal_event int_handler;
#ifndef __WINDOWS__
static struct opal_event sigusr1_handler;
static struct opal_event sigusr2_handler;
static struct opal_event sigtstp_handler;
static struct opal_event sigcont_handler;
#endif /* __WINDOWS__ */
static orte_job_t *jdata;
static char *orterun_basename = NULL;
@ -569,7 +571,7 @@ int orterun(int argc, char *argv[])
opal_signal_set(&int_handler, SIGINT,
abort_signal_callback, &int_handler);
opal_signal_add(&int_handler, NULL);
#ifndef __WINDOWS__
/** setup callbacks for signals we should forward */
opal_signal_set(&sigusr1_handler, SIGUSR1,
@ -578,6 +580,14 @@ int orterun(int argc, char *argv[])
opal_signal_set(&sigusr2_handler, SIGUSR2,
signal_forward_callback, &sigusr2_handler);
opal_signal_add(&sigusr2_handler, NULL);
if (orte_forward_job_control) {
opal_signal_set(&sigtstp_handler, SIGTSTP,
signal_forward_callback, &sigtstp_handler);
opal_signal_add(&sigtstp_handler, NULL);
opal_signal_set(&sigcont_handler, SIGCONT,
signal_forward_callback, &sigcont_handler);
opal_signal_add(&sigcont_handler, NULL);
}
#endif /* __WINDOWS__ */
/* we are an hnp, so update the contact info field for later use */
@ -845,6 +855,10 @@ static void terminated(int trigpipe, short event, void *arg)
/** Remove the USR signal handlers */
opal_signal_del(&sigusr1_handler);
opal_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_signal_del(&sigtstp_handler);
opal_signal_del(&sigcont_handler);
}
#endif /* __WINDOWS__ */
/* get the daemon job object */