diff --git a/orte/mca/odls/base/odls_base_default_fns.c b/orte/mca/odls/base/odls_base_default_fns.c index 6abc2ac304..31a48b5326 100644 --- a/orte/mca/odls/base/odls_base_default_fns.c +++ b/orte/mca/odls/base/odls_base_default_fns.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -2290,6 +2290,12 @@ int orte_odls_base_default_kill_local_procs(orte_jobid_t job, bool set_state, goto MOVEON; } + + /* First send a SIGCONT in case the process is in stopped state. + If it is in a stopped state and we do not first change it to + running, then SIGTERM will not get delivered. Ignore return + value. */ + kill_local(child->pid, SIGCONT); /* Send a sigterm to the process. If we get ESRCH back, that means the process is already dead, so just move on. */ diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index bd90ac82f7..6fb6baa547 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -12,6 +12,7 @@ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -393,10 +394,6 @@ static int process_commands(orte_process_name_t* sender, /**** SIGNAL_LOCAL_PROCS ****/ case ORTE_DAEMON_SIGNAL_LOCAL_PROCS: - if (orte_debug_daemons_flag) { - opal_output(0, "%s orted_cmd: received signal_local_procs", - ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); - } /* unpack the jobid */ n = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &job, &n, ORTE_JOBID))) { @@ -410,7 +407,22 @@ static int process_commands(orte_process_name_t* sender, ORTE_ERROR_LOG(ret); goto CLEANUP; } - + + /* Convert SIGTSTP to SIGSTOP so we can suspend a.out */ + if (SIGTSTP == signal) { + if (orte_debug_daemons_flag) { + opal_output(0, "%s orted_cmd: converted SIGTSTP to SIGSTOP before delivering", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); + } + signal = SIGSTOP; + } + + if (orte_debug_daemons_flag) { + opal_output(0, "%s orted_cmd: received signal_local_procs, delivering signal %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + signal); + } + /* signal them */ if (ORTE_SUCCESS != (ret = orte_odls.signal_local_procs(NULL, signal))) { ORTE_ERROR_LOG(ret); diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index 472e56afaa..9b738ba4d1 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -109,6 +110,9 @@ opal_list_t orte_local_children; /* list of job data for local children on a daemon */ opal_list_t orte_local_jobdata; +/* whether or not to forward SIGTSTP and SIGCONT signals */ +bool orte_forward_job_control; + #endif /* !ORTE_DISABLE_FULL_RTE */ int orte_debug_output = -1; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index 80ff0e46e3..4c3f55e2a2 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * @@ -482,6 +482,8 @@ ORTE_DECLSPEC extern opal_pointer_array_t orte_jobmap; ORTE_DECLSPEC extern opal_list_t orte_local_children; /* list of job data for local children on a daemon */ ORTE_DECLSPEC extern opal_list_t orte_local_jobdata; +/* whether or not to forward SIGTSTP and SIGCONT signals */ +ORTE_DECLSPEC extern bool orte_forward_job_control; #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 4d5f6210d9..58b579c05e 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -10,6 +10,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -244,6 +245,13 @@ int orte_register_params(void) "Whether or not an allocation by a resource manager is required [default: no]", false, false, (int)false, &value); orte_allocation_required = OPAL_INT_TO_BOOL(value); + + /* whether or not to forward SIGTSTP and SIGCONT signals */ + mca_base_param_reg_int_name("orte", "forward_job_control", + "Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]", + false, false, + (int) false, &value); + orte_forward_job_control = OPAL_INT_TO_BOOL(value); #endif /* ORTE_DISABLE_FULL_SUPPORT */ diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 3e04b2726d..ea17c65d4a 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -11,7 +11,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2006-2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007 Los Alamos National Security, LLC. All rights * reserved. * $COPYRIGHT$ @@ -101,6 +101,8 @@ static struct opal_event int_handler; #ifndef __WINDOWS__ static struct opal_event sigusr1_handler; static struct opal_event sigusr2_handler; +static struct opal_event sigtstp_handler; +static struct opal_event sigcont_handler; #endif /* __WINDOWS__ */ static orte_job_t *jdata; static char *orterun_basename = NULL; @@ -569,7 +571,7 @@ int orterun(int argc, char *argv[]) opal_signal_set(&int_handler, SIGINT, abort_signal_callback, &int_handler); opal_signal_add(&int_handler, NULL); - + #ifndef __WINDOWS__ /** setup callbacks for signals we should foward */ opal_signal_set(&sigusr1_handler, SIGUSR1, @@ -578,6 +580,14 @@ int orterun(int argc, char *argv[]) opal_signal_set(&sigusr2_handler, SIGUSR2, signal_forward_callback, &sigusr2_handler); opal_signal_add(&sigusr2_handler, NULL); + if (orte_forward_job_control) { + opal_signal_set(&sigtstp_handler, SIGTSTP, + signal_forward_callback, &sigtstp_handler); + opal_signal_add(&sigtstp_handler, NULL); + opal_signal_set(&sigcont_handler, SIGCONT, + signal_forward_callback, &sigcont_handler); + opal_signal_add(&sigcont_handler, NULL); + } #endif /* __WINDOWS__ */ /* we are an hnp, so update the contact info field for later use */ @@ -845,6 +855,10 @@ static void terminated(int trigpipe, short event, void *arg) /** Remove the USR signal handlers */ opal_signal_del(&sigusr1_handler); opal_signal_del(&sigusr2_handler); + if (orte_forward_job_control) { + opal_signal_del(&sigtstp_handler); + opal_signal_del(&sigcont_handler); + } #endif /* __WINDOWS__ */ /* get the daemon job object */