1
1

Do not push child processes into separate process groups so that any host RM can still "see" them, and ensure that any signal sent to the orted's themselves will be provided to all child processes. Forward all signals from mpirun to the child processes, removing the old MCA parameter required to turn that behavior "on".

Этот коммит содержится в:
Ralph Castain 2016-03-06 17:55:09 -08:00
родитель 36a6a3b691
Коммит d72c1c72ff
8 изменённых файлов: 15 добавлений и 75 удалений

Просмотреть файл

@ -786,10 +786,8 @@ static int rte_finalize(void)
/** Remove the USR signal handlers */
opal_event_signal_del(&sigusr1_handler);
opal_event_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
}
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
signals_set = false;
}

Просмотреть файл

@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
sigset_t sigs;
char *param, *msg;
if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));
if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:

Просмотреть файл

@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
* that occasionally causes us to incorrectly report a proc
* as refusing to die. Unfortunately, errno may not be reset
* by waitpid in this case, so we cannot check it.
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*/
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
/* Do nothing, process still alive */
/* Do nothing, process still alive */
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good
enough for government work" */
@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
return false;
}
/* deliver a signal to a specified pid. */
static int odls_default_kill_local(pid_t pid, int signum)
{
pid_t pgrp;
#if HAVE_SETPGID
pgrp = getpgid(pid);
if (-1 != pgrp) {
/* target the lead process of the process
* group so we ensure that the signal is
* seen by all members of that group. This
* ensures that the signal is seen by any
* child processes our child may have
* started
*/
pid = pgrp;
}
#endif
if (0 != kill(pid, signum)) {
if (ESRCH != errno) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
long fd, fdmax = sysconf(_SC_OPEN_MAX);
char *param, *msg;
if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}
/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);
@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}
if (pid == 0) {
close(p[0]);
#if HAVE_SETPGID
setpgid(0, 0);
#endif
close(p[0]);
do_child(context, child, environ_copy, jobdat, p[1], opts);
/* Does not return */
}
@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));
if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:

Просмотреть файл

@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
/* generate new xterm windows to display output from specified ranks */
char *orte_xterm = NULL;
/* whether or not to forward SIGTSTP and SIGCONT signals */
bool orte_forward_job_control = false;
/* report launch progress */
bool orte_report_launch_progress = false;

Просмотреть файл

@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
/* whether or not to forward SIGTSTP and SIGCONT signals */
ORTE_DECLSPEC extern bool orte_forward_job_control;
/* IOF controls */
ORTE_DECLSPEC extern bool orte_tag_output;
ORTE_DECLSPEC extern bool orte_timestamp_output;

Просмотреть файл

@ -543,14 +543,6 @@ int orte_register_params(void)
orte_map_stddiag_to_stderr = true;
}
/* whether or not to forward SIGTSTP and SIGCONT signals */
orte_forward_job_control = false;
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_forward_job_control);
/* whether or not to report launch progress */
orte_report_launch_progress = false;
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",

Просмотреть файл

@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTOP signal to ompi-submit will then cause a SIGSTOP signal to be sent
A SIGTSTOP signal to ompi-submit will cause a SIGSTOP signal to be sent
to all of the programs started by ompi-submit and likewise a SIGCONT signal
to ompi-submit will cause a SIGCONT sent.
.

Просмотреть файл

@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent
A SIGTSTOP signal to mpirun will cause a SIGSTOP signal to be sent
to all of the programs started by mpirun and likewise a SIGCONT signal
to mpirun will cause a SIGCONT sent.
.