1
1

Merge pull request #1431 from rhc54/topic/orted

Do not push child processes into separate process groups so that any …
Этот коммит содержится в:
rhc54 2016-03-06 20:08:17 -08:00
родитель 36a6a3b691 d72c1c72ff
Коммит 8ffde8d020
8 изменённых файлов: 15 добавлений и 75 удалений

Просмотреть файл

@ -786,10 +786,8 @@ static int rte_finalize(void)
/** Remove the USR signal handlers */
opal_event_signal_del(&sigusr1_handler);
opal_event_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
}
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
signals_set = false;
}

Просмотреть файл

@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
sigset_t sigs;
char *param, *msg;
if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));
if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:

Просмотреть файл

@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
* that occasionally causes us to incorrectly report a proc
* as refusing to die. Unfortunately, errno may not be reset
* by waitpid in this case, so we cannot check it.
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*/
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
/* Do nothing, process still alive */
/* Do nothing, process still alive */
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good
enough for government work" */
@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
return false;
}
/* deliver a signal to a specified pid. */
static int odls_default_kill_local(pid_t pid, int signum)
{
pid_t pgrp;
#if HAVE_SETPGID
pgrp = getpgid(pid);
if (-1 != pgrp) {
/* target the lead process of the process
* group so we ensure that the signal is
* seen by all members of that group. This
* ensures that the signal is seen by any
* child processes our child may have
* started
*/
pid = pgrp;
}
#endif
if (0 != kill(pid, signum)) {
if (ESRCH != errno) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
long fd, fdmax = sysconf(_SC_OPEN_MAX);
char *param, *msg;
if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}
if (pid == 0) {
close(p[0]);
#if HAVE_SETPGID
setpgid(0, 0);
#endif
close(p[0]);
do_child(context, child, environ_copy, jobdat, p[1], opts);
/* Does not return */
}
@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));
if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:

Просмотреть файл

@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
/* generate new xterm windows to display output from specified ranks */
char *orte_xterm = NULL;
/* whether or not to forward SIGTSTP and SIGCONT signals */
bool orte_forward_job_control = false;
/* report launch progress */
bool orte_report_launch_progress = false;

Просмотреть файл

@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
/* whether or not to forward SIGTSTP and SIGCONT signals */
ORTE_DECLSPEC extern bool orte_forward_job_control;
/* IOF controls */
ORTE_DECLSPEC extern bool orte_tag_output;
ORTE_DECLSPEC extern bool orte_timestamp_output;

Просмотреть файл

@ -543,14 +543,6 @@ int orte_register_params(void)
orte_map_stddiag_to_stderr = true;
}
/* whether or not to forward SIGTSTP and SIGCONT signals */
orte_forward_job_control = false;
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_forward_job_control);
/* whether or not to report launch progress */
orte_report_launch_progress = false;
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",

Просмотреть файл

@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTP signal to ompi-submit will then cause a SIGSTOP signal to be sent
A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
to all of the programs started by ompi-submit and likewise a SIGCONT signal
to ompi-submit will cause a SIGCONT to be sent.
.

Просмотреть файл

@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTP signal to mpirun will then cause a SIGSTOP signal to be sent
A SIGTSTP signal to mpirun will cause a SIGSTOP signal to be sent
to all of the programs started by mpirun and likewise a SIGCONT signal
to mpirun will cause a SIGCONT to be sent.
.