Suspend/resume enhancements:
Have ORTE call setpgrp() in each child after forking (but before exec) when orte_forward_job_control is set, and then send signals to the child's process group rather than to the child alone. This allows suspending jobs whose processes fork. If a SIGTSTP arrives before the processes have been launched, it is recorded and the processes are suspended right after launch. This commit was SVN r22557.
Этот коммит содержится в:
родитель
23bb52ad05
Коммит
28f03a2d86
@ -78,6 +78,7 @@
|
||||
#include "orte/mca/errmgr/errmgr.h"
|
||||
#include "orte/mca/ess/ess.h"
|
||||
#include "orte/mca/iof/base/iof_base_setup.h"
|
||||
#include "orte/mca/plm/plm.h"
|
||||
#include "orte/util/name_fns.h"
|
||||
|
||||
#include "orte/mca/odls/base/odls_private.h"
|
||||
@ -182,6 +183,9 @@ static bool odls_default_child_died(pid_t pid, unsigned int timeout, int *exit_s
|
||||
|
||||
static int odls_default_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
if (orte_forward_job_control) {
|
||||
pid = -pid;
|
||||
}
|
||||
if (0 != kill(pid, signum)) {
|
||||
if (ESRCH != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output,
|
||||
@ -288,6 +292,13 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
||||
|
||||
if (pid == 0) {
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Set a new process group for this child, so that a
|
||||
SIGSTOP can be sent to it without being sent to the
|
||||
orted. */
|
||||
setpgrp();
|
||||
}
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
close(p[0]);
|
||||
@ -900,6 +911,7 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data)
|
||||
{
|
||||
int rc;
|
||||
orte_jobid_t job;
|
||||
orte_job_t *jdata;
|
||||
|
||||
/* construct the list of children we are to launch */
|
||||
if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
|
||||
@ -917,6 +929,23 @@ int orte_odls_default_launch_local_procs(opal_buffer_t *data)
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* look up job data object */
|
||||
if (NULL != (jdata = orte_get_job_data_object(job))) {
|
||||
if (jdata->state & ORTE_JOB_STATE_SUSPENDED) {
|
||||
if (ORTE_PROC_IS_HNP) {
|
||||
/* Have the plm send the signal to all the nodes.
|
||||
If the signal arrived before the orteds started,
|
||||
then they won't know to suspend their procs.
|
||||
The plm also arranges for any local procs to
|
||||
be signaled.
|
||||
*/
|
||||
orte_plm.signal_job(jdata->jobid, SIGTSTP);
|
||||
} else {
|
||||
orte_odls_default_signal_local_procs(NULL, SIGTSTP);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CLEANUP:
|
||||
|
||||
return rc;
|
||||
@ -946,7 +975,12 @@ static int send_signal(pid_t pid, int signal)
|
||||
"%s sending signal %d to pid %ld",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
signal, (long)pid));
|
||||
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Send the signal to the process group rather than the
|
||||
process. The child is the leader of its process group. */
|
||||
pid = -pid;
|
||||
}
|
||||
if (kill(pid, signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
|
@ -72,12 +72,13 @@ typedef uint16_t orte_job_state_t;
|
||||
#define ORTE_JOB_STATE_RESTART 0x0002 /* the job is ready for restart after one or more procs failed */
|
||||
#define ORTE_JOB_STATE_LAUNCHED 0x0004 /* job has been launched by plm */
|
||||
#define ORTE_JOB_STATE_RUNNING 0x0010 /* all process have been fork'd */
|
||||
#define ORTE_JOB_STATE_SUSPENDED 0x0020 /* job has been suspended */
|
||||
/*
|
||||
* Define a "boundary" so we can easily and quickly determine
|
||||
* if a job is still running or not - any value less than
|
||||
* this one means that we are not terminated
|
||||
*/
|
||||
#define ORTE_JOB_STATE_UNTERMINATED 0x0020
|
||||
#define ORTE_JOB_STATE_UNTERMINATED 0x0040
|
||||
|
||||
#define ORTE_JOB_STATE_TERMINATED 0x0080 /* all processes have terminated and is no longer running */
|
||||
#define ORTE_JOB_STATE_ABORTED 0x0100 /* at least one process aborted, causing job to abort */
|
||||
|
@ -425,6 +425,9 @@ static int process_commands(orte_process_name_t* sender,
|
||||
goto CLEANUP;
|
||||
}
|
||||
|
||||
/* look up job data object */
|
||||
jdata = orte_get_job_data_object(job);
|
||||
|
||||
/* get the signal */
|
||||
n = 1;
|
||||
if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &signal, &n, OPAL_INT32))) {
|
||||
@ -439,6 +442,11 @@ static int process_commands(orte_process_name_t* sender,
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||
}
|
||||
signal = SIGSTOP;
|
||||
if (NULL != jdata) {
|
||||
jdata->state |= ORTE_JOB_STATE_SUSPENDED;
|
||||
}
|
||||
} else if (SIGCONT == signal && NULL != jdata) {
|
||||
jdata->state &= ~ORTE_JOB_STATE_SUSPENDED;
|
||||
}
|
||||
|
||||
if (orte_debug_daemons_flag) {
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user