1
1
Have orte call setpgrp after forking (but before exec) when
orte_forward_job_control is set. Then have it send signals to the
child's process group.  This allows suspending jobs that fork.

If a SIGTSTP arrives before the processes have been launched, then
record it and suspend them right after launching.

This commit was SVN r22557.
Этот коммит содержится в:
Iain Bason 2010-02-04 15:47:20 +00:00
родитель 23bb52ad05
Коммит 28f03a2d86
3 изменённых файлов: 45 добавлений и 2 удалений

Просмотреть файл

@ -78,6 +78,7 @@
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/iof/base/iof_base_setup.h"
#include "orte/mca/plm/plm.h"
#include "orte/util/name_fns.h"
#include "orte/mca/odls/base/odls_private.h"
@ -182,6 +183,9 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
static int odls_default_kill_local(pid_t pid, int signum)
{
if (orte_forward_job_control) {
pid = -pid;
}
if (0 != kill(pid, signum)) {
if (ESRCH != errno) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
@ -288,6 +292,13 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
if (pid == 0) {
long fd, fdmax = sysconf(_SC_OPEN_MAX);
if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgrp();
}
/* Setup the pipe to be close-on-exec */
close(p[0]);
@ -900,6 +911,7 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data)
{
int rc;
orte_jobid_t job;
orte_job_t *jdata;
/* construct the list of children we are to launch */
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
@ -917,6 +929,23 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data)
goto CLEANUP;
}
/* look up job data object */
if (NULL != (jdata = orte_get_job_data_object(job))) {
if (jdata->state & ORTE_JOB_STATE_SUSPENDED) {
if (ORTE_PROC_IS_HNP) {
/* Have the plm send the signal to all the nodes.
If the signal arrived before the orteds started,
then they won't know to suspend their procs.
The plm also arranges for any local procs to
be signaled.
*/
orte_plm.signal_job(jdata->jobid, SIGTSTP);
} else {
orte_odls_default_signal_local_procs(NULL, SIGTSTP);
}
}
}
CLEANUP:
return rc;
@ -946,7 +975,12 @@ static int send_signal(pid_t pid, int signal)
"%s sending signal %d to pid %ld",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));
if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:

Просмотреть файл

@ -72,12 +72,13 @@ typedef uint16_t orte_job_state_t;
#define ORTE_JOB_STATE_RESTART 0x0002 /* the job is ready for restart after one or more procs failed */
#define ORTE_JOB_STATE_LAUNCHED 0x0004 /* job has been launched by plm */
#define ORTE_JOB_STATE_RUNNING 0x0010 /* all processes have been fork'd */
#define ORTE_JOB_STATE_SUSPENDED 0x0020 /* job has been suspended */
/*
* Define a "boundary" so we can easily and quickly determine
* if a job is still running or not - any value less than
* this one means that we are not terminated
*/
#define ORTE_JOB_STATE_UNTERMINATED 0x0020
#define ORTE_JOB_STATE_UNTERMINATED 0x0040
#define ORTE_JOB_STATE_TERMINATED 0x0080 /* all processes have terminated and the job is no longer running */
#define ORTE_JOB_STATE_ABORTED 0x0100 /* at least one process aborted, causing job to abort */

Просмотреть файл

@ -425,6 +425,9 @@ static int process_commands(orte_process_name_t* sender,
goto CLEANUP;
}
/* look up job data object */
jdata = orte_get_job_data_object(job);
/* get the signal */
n = 1;
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &signal, &n, OPAL_INT32))) {
@ -439,6 +442,11 @@ static int process_commands(orte_process_name_t* sender,
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
}
signal = SIGSTOP;
if (NULL != jdata) {
jdata->state |= ORTE_JOB_STATE_SUSPENDED;
}
} else if (SIGCONT == signal && NULL != jdata) {
jdata->state &= ~ORTE_JOB_STATE_SUSPENDED;
}
if (orte_debug_daemons_flag) {