Merge pull request #1431 from rhc54/topic/orted
Do not push child processes into separate process groups so that any …
Этот коммит содержится в:
Коммит
8ffde8d020
@ -786,10 +786,8 @@ static int rte_finalize(void)
|
||||
/** Remove the USR signal handlers */
|
||||
opal_event_signal_del(&sigusr1_handler);
|
||||
opal_event_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_event_signal_del(&sigtstp_handler);
|
||||
opal_event_signal_del(&sigcont_handler);
|
||||
}
|
||||
opal_event_signal_del(&sigtstp_handler);
|
||||
opal_event_signal_del(&sigcont_handler);
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
|
@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
|
||||
sigset_t sigs;
|
||||
char *param, *msg;
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Set a new process group for this child, so that a
|
||||
SIGSTOP can be sent to it without being sent to the
|
||||
orted. */
|
||||
setpgid(0, 0);
|
||||
}
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
signal, (long)pid));
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Send the signal to the process group rather than the
|
||||
process. The child is the leader of its process group. */
|
||||
pid = -pid;
|
||||
}
|
||||
if (kill(pid, signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
|
@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* that occasionally causes us to incorrectly report a proc
|
||||
* as refusing to die. Unfortunately, errno may not be reset
|
||||
* by waitpid in this case, so we cannot check it.
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* deliver a signal to a specified pid. */
|
||||
static int odls_default_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
pid_t pgrp;
|
||||
|
||||
#if HAVE_SETPGID
|
||||
pgrp = getpgid(pid);
|
||||
if (-1 != pgrp) {
|
||||
/* target the lead process of the process
|
||||
* group so we ensure that the signal is
|
||||
* seen by all members of that group. This
|
||||
* ensures that the signal is seen by any
|
||||
* child processes our child may have
|
||||
* started
|
||||
*/
|
||||
pid = pgrp;
|
||||
}
|
||||
#endif
|
||||
if (0 != kill(pid, signum)) {
|
||||
if (ESRCH != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
char *param, *msg;
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Set a new process group for this child, so that a
|
||||
SIGSTOP can be sent to it without being sent to the
|
||||
orted. */
|
||||
setpgid(0, 0);
|
||||
}
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
close(p[0]);
|
||||
#if HAVE_SETPGID
|
||||
setpgid(0, 0);
|
||||
#endif
|
||||
close(p[0]);
|
||||
do_child(context, child, environ_copy, jobdat, p[1], opts);
|
||||
/* Does not return */
|
||||
}
|
||||
@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
signal, (long)pid));
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Send the signal to the process group rather than the
|
||||
process. The child is the leader of its process group. */
|
||||
pid = -pid;
|
||||
}
|
||||
if (kill(pid, signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
|
@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
|
||||
/* generate new xterm windows to display output from specified ranks */
|
||||
char *orte_xterm = NULL;
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
bool orte_forward_job_control = false;
|
||||
|
||||
/* report launch progress */
|
||||
bool orte_report_launch_progress = false;
|
||||
|
||||
|
@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
|
||||
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
ORTE_DECLSPEC extern bool orte_forward_job_control;
|
||||
|
||||
/* IOF controls */
|
||||
ORTE_DECLSPEC extern bool orte_tag_output;
|
||||
ORTE_DECLSPEC extern bool orte_timestamp_output;
|
||||
|
@ -543,14 +543,6 @@ int orte_register_params(void)
|
||||
orte_map_stddiag_to_stderr = true;
|
||||
}
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
orte_forward_job_control = false;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
|
||||
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_forward_job_control);
|
||||
|
||||
/* whether or not to report launch progress */
|
||||
orte_report_launch_progress = false;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",
|
||||
|
@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
|
||||
all processes in the job.
|
||||
.
|
||||
.PP
|
||||
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
|
||||
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
|
||||
A SIGTSTP signal to ompi-submit will then cause a SIGSTOP signal to be sent
|
||||
A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
|
||||
to all of the programs started by ompi-submit and likewise a SIGCONT signal
|
||||
to ompi-submit will cause a SIGCONT signal to be sent.
|
||||
.
|
||||
|
@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||
all processes in the job.
|
||||
.
|
||||
.PP
|
||||
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
|
||||
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
|
||||
A SIGTSTP signal to mpirun will then cause a SIGSTOP signal to be sent
|
||||
A SIGTSTP signal to mpirun will cause a SIGSTOP signal to be sent
|
||||
to all of the programs started by mpirun and likewise a SIGCONT signal
|
||||
to mpirun will cause a SIGCONT signal to be sent.
|
||||
.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user