Do not push child processes into separate process groups so that any host RM can still "see" them, and ensure that any signal sent to the orted's themselves will be provided to all child processes. Forward all signals from mpirun to the child processes, removing the old MCA parameter required to turn that behavior "on".
Этот коммит содержится в:
родитель
36a6a3b691
Коммит
d72c1c72ff
@ -786,10 +786,8 @@ static int rte_finalize(void)
|
||||
/** Remove the USR signal handlers */
|
||||
opal_event_signal_del(&sigusr1_handler);
|
||||
opal_event_signal_del(&sigusr2_handler);
|
||||
if (orte_forward_job_control) {
|
||||
opal_event_signal_del(&sigtstp_handler);
|
||||
opal_event_signal_del(&sigcont_handler);
|
||||
}
|
||||
opal_event_signal_del(&sigtstp_handler);
|
||||
opal_event_signal_del(&sigcont_handler);
|
||||
signals_set = false;
|
||||
}
|
||||
|
||||
|
@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
|
||||
sigset_t sigs;
|
||||
char *param, *msg;
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Set a new process group for this child, so that a
|
||||
SIGSTOP can be sent to it without being sent to the
|
||||
orted. */
|
||||
setpgid(0, 0);
|
||||
}
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
signal, (long)pid));
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Send the signal to the process group rather than the
|
||||
process. The child is the leader of its process group. */
|
||||
pid = -pid;
|
||||
}
|
||||
if (kill(pid, signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
|
@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
* that occasionally causes us to incorrectly report a proc
|
||||
* as refusing to die. Unfortunately, errno may not be reset
|
||||
* by waitpid in this case, so we cannot check it.
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*
|
||||
* (note the previous fix to this, to return 'process dead'
|
||||
* here, fixes the race condition at the cost of reporting
|
||||
* all live processes have immediately died! Better to
|
||||
* occasionally report a dead process as still living -
|
||||
* which will occasionally trip the timeout for cases that
|
||||
* are right on the edge.)
|
||||
*/
|
||||
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
|
||||
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
|
||||
/* Do nothing, process still alive */
|
||||
/* Do nothing, process still alive */
|
||||
} else if (-1 == ret && ECHILD == errno) {
|
||||
/* The pid no longer exists, so we'll call this "good
|
||||
enough for government work" */
|
||||
@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* deliver a signal to a specified pid. */
|
||||
static int odls_default_kill_local(pid_t pid, int signum)
|
||||
{
|
||||
pid_t pgrp;
|
||||
|
||||
#if HAVE_SETPGID
|
||||
pgrp = getpgid(pid);
|
||||
if (-1 != pgrp) {
|
||||
/* target the lead process of the process
|
||||
* group so we ensure that the signal is
|
||||
* seen by all members of that group. This
|
||||
* ensures that the signal is seen by any
|
||||
* child processes our child may have
|
||||
* started
|
||||
*/
|
||||
pid = pgrp;
|
||||
}
|
||||
#endif
|
||||
if (0 != kill(pid, signum)) {
|
||||
if (ESRCH != errno) {
|
||||
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
|
||||
@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
|
||||
long fd, fdmax = sysconf(_SC_OPEN_MAX);
|
||||
char *param, *msg;
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Set a new process group for this child, so that a
|
||||
SIGSTOP can be sent to it without being sent to the
|
||||
orted. */
|
||||
setpgid(0, 0);
|
||||
}
|
||||
|
||||
/* Setup the pipe to be close-on-exec */
|
||||
opal_fd_set_cloexec(write_fd);
|
||||
|
||||
@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
|
||||
}
|
||||
|
||||
if (pid == 0) {
|
||||
close(p[0]);
|
||||
#if HAVE_SETPGID
|
||||
setpgid(0, 0);
|
||||
#endif
|
||||
close(p[0]);
|
||||
do_child(context, child, environ_copy, jobdat, p[1], opts);
|
||||
/* Does not return */
|
||||
}
|
||||
@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
|
||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||
signal, (long)pid));
|
||||
|
||||
if (orte_forward_job_control) {
|
||||
/* Send the signal to the process group rather than the
|
||||
process. The child is the leader of its process group. */
|
||||
pid = -pid;
|
||||
}
|
||||
if (kill(pid, signal) != 0) {
|
||||
switch(errno) {
|
||||
case EINVAL:
|
||||
|
@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
|
||||
/* generate new xterm windows to display output from specified ranks */
|
||||
char *orte_xterm = NULL;
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
bool orte_forward_job_control = false;
|
||||
|
||||
/* report launch progress */
|
||||
bool orte_report_launch_progress = false;
|
||||
|
||||
|
@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
|
||||
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
|
||||
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
ORTE_DECLSPEC extern bool orte_forward_job_control;
|
||||
|
||||
/* IOF controls */
|
||||
ORTE_DECLSPEC extern bool orte_tag_output;
|
||||
ORTE_DECLSPEC extern bool orte_timestamp_output;
|
||||
|
@ -543,14 +543,6 @@ int orte_register_params(void)
|
||||
orte_map_stddiag_to_stderr = true;
|
||||
}
|
||||
|
||||
/* whether or not to forward SIGTSTP and SIGCONT signals */
|
||||
orte_forward_job_control = false;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
|
||||
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
|
||||
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
|
||||
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
|
||||
&orte_forward_job_control);
|
||||
|
||||
/* whether or not to report launch progress */
|
||||
orte_report_launch_progress = false;
|
||||
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",
|
||||
|
@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
|
||||
all processes in the job.
|
||||
.
|
||||
.PP
|
||||
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
|
||||
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
|
||||
A SIGTSTOP signal to ompi-submit will then cause a SIGSTOP signal to be sent
|
||||
A SIGTSTOP signal to ompi-submit will cause a SIGSTOP signal to be sent
|
||||
to all of the programs started by ompi-submit and likewise a SIGCONT signal
|
||||
to ompi-submit will cause a SIGCONT sent.
|
||||
.
|
||||
|
@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
|
||||
all processes in the job.
|
||||
.
|
||||
.PP
|
||||
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
|
||||
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
|
||||
A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent
|
||||
A SIGTSTOP signal to mpirun will cause a SIGSTOP signal to be sent
|
||||
to all of the programs started by mpirun and likewise a SIGCONT signal
|
||||
to mpirun will cause a SIGCONT sent.
|
||||
.
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user