From 160120b4c570f27373cd873e83e941a7e194801b Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Fri, 8 Sep 2006 15:17:17 +0000 Subject: [PATCH] Fix a cut-n-paste error that causes the 'num_concurrent' to be set to 1 or 0 instead of the user defined number or default (128). This caused the PLS to deadlock when using '--debug-daemons' with more than 2 processes. :( svn blame says that it was broken in r11347 It is *not* a problem on v1.1 or v1.2 branches. Bug spotted by Tim Mattox and myself. This commit was SVN r11575. The following SVN revision numbers were found above: r11347 --> open-mpi/ompi@f52c10d18e701764980a1fc5548a500cb2ebe5c7 --- orte/mca/pls/rsh/pls_rsh_component.c | 3 ++- orte/mca/pls/rsh/pls_rsh_module.c | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/orte/mca/pls/rsh/pls_rsh_component.c b/orte/mca/pls/rsh/pls_rsh_component.c index dfe0d17b65..d86a7e7a80 100644 --- a/orte/mca/pls/rsh/pls_rsh_component.c +++ b/orte/mca/pls/rsh/pls_rsh_component.c @@ -128,7 +128,8 @@ int orte_pls_rsh_component_open(void) true, tmp); tmp = 1; } - mca_pls_rsh_component.num_concurrent = (tmp != 0 ? true : false); + mca_pls_rsh_component.num_concurrent = tmp; + if (mca_pls_rsh_component.debug == 0) { mca_base_param_reg_int_name("orte", "debug", "Whether or not to enable debugging output for all ORTE components (0 or 1)", diff --git a/orte/mca/pls/rsh/pls_rsh_module.c b/orte/mca/pls/rsh/pls_rsh_module.c index a4f353a559..99a4437f90 100644 --- a/orte/mca/pls/rsh/pls_rsh_module.c +++ b/orte/mca/pls/rsh/pls_rsh_module.c @@ -961,6 +961,17 @@ int orte_pls_rsh_launch(orte_jobid_t jobid) rsh_daemon_info_t *daemon_info; OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock); + /* JJH Bug: + * If we are in '--debug-daemons' we keep the ssh connection + * alive for the span of the run. If we use this option + * AND we launch on more than "num_concurrent" machines + * then we will deadlock. 
No connections are terminated + * until the job is complete, no job is started + * since all the orteds are waiting for all the others + * to come online, and the others are not launched because + * we are waiting on those that have started to terminate + * their ssh tunnels. :( + */ if (mca_pls_rsh_component.num_children++ >= mca_pls_rsh_component.num_concurrent) { opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);