From 843fcca03cd83793e5dfa0bc4cb0f97f2f132fa6 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Fri, 10 Feb 2017 14:24:02 -0600 Subject: [PATCH] plm/rsh: Fix signal handling for rsh launcher * Similar to the other launchers (i.e., slurm, alps) we need to put the children in a separate process group to prevent SIGINT (from a CTRL-C) from being delivered to the whole process group and prematurely killing the rsh/ssh connections to the remote daemons. Signed-off-by: Joshua Hursey --- orte/mca/plm/rsh/plm_rsh_module.c | 38 ++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/orte/mca/plm/rsh/plm_rsh_module.c b/orte/mca/plm/rsh/plm_rsh_module.c index 8c83fe1639..df63cd3860 100644 --- a/orte/mca/plm/rsh/plm_rsh_module.c +++ b/orte/mca/plm/rsh/plm_rsh_module.c @@ -13,7 +13,7 @@ * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2008-2009 Sun Microsystems, Inc. All rights reserved. - * Copyright (c) 2011 IBM Corporation. All rights reserved. + * Copyright (c) 2011-2017 IBM Corporation. All rights reserved. * Copyright (c) 2014-2017 Intel, Inc. All rights reserved. * Copyright (c) 2015-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -957,9 +957,45 @@ static void process_launch_list(int fd, short args, void *cbdata) /* child */ if (pid == 0) { + /* + * When the user presses CTRL-C, SIGINT is sent to the whole process + * group which terminates the rsh/ssh command. This can cause the + * remote daemon to crash with a SIGPIPE when it tries to print out + * status information. This has two consequences: + * 1) The remote node is not cleaned up as it should. The local + * processes will notice that the orted failed and cleanup their + * part of the session directory, but the job level part will + * remain littered. + * 2) Any debugging information we expected to see from the orted + * during shutdown is lost. 
+ * + * The solution here is to put the child processes in a separate + * process group from the HNP. So when the user presses CTRL-C + * then only the HNP receives the signal, and not the rsh/ssh + * child processes. + */ +#if HAVE_SETPGID + if( 0 != setpgid(0, 0) ) { + opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n", + strerror(errno), errno); + exit(-1); + } +#endif + /* do the ssh launch - this will exit if it fails */ ssh_child(caddy->argc, caddy->argv); } else { /* father */ + // Put the child in a separate process group + // - see comment in child section. +#if HAVE_SETPGID + if( 0 != setpgid(pid, pid) ) { + opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n", + (long)pid, (long)pid, strerror(errno), errno); + // Ignore this error since the child is off and running. + // We still need to track it. + } +#endif + /* indicate this daemon has been launched */ caddy->daemon->state = ORTE_PROC_STATE_RUNNING; /* record the pid of the ssh fork */