Fix a cut-and-paste error that caused 'num_concurrent' to be
set to 1 or 0 instead of the user-defined number or the default (128). This caused the PLS to deadlock when using '--debug-daemons' with more than 2 processes. :( svn blame says that it was broken in r11347. It is *not* a problem on the v1.1 or v1.2 branches. Bug spotted by Tim Mattox and myself. This commit was SVN r11575. The following SVN revision numbers were found above: r11347 --> open-mpi/ompi@f52c10d18e
Этот коммит содержится в:
родитель
fb4d7ab268
Коммит
160120b4c5
@ -128,7 +128,8 @@ int orte_pls_rsh_component_open(void)
|
||||
true, tmp);
|
||||
tmp = 1;
|
||||
}
|
||||
mca_pls_rsh_component.num_concurrent = (tmp != 0 ? true : false);
|
||||
mca_pls_rsh_component.num_concurrent = tmp;
|
||||
|
||||
if (mca_pls_rsh_component.debug == 0) {
|
||||
mca_base_param_reg_int_name("orte", "debug",
|
||||
"Whether or not to enable debugging output for all ORTE components (0 or 1)",
|
||||
|
@ -961,6 +961,17 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
rsh_daemon_info_t *daemon_info;
|
||||
|
||||
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
||||
/* JJH Bug:
|
||||
* If we are in '--debug-daemons' we keep the ssh connection
|
||||
* alive for the span of the run. If we use this option
|
||||
* AND we launch on more than "num_concurrent" machines
|
||||
* then we will deadlock. No connections are terminated
|
||||
* until the job is complete, no job is started
|
||||
* since all the orteds are waiting for all the others
|
||||
* to come online, and the others are not launched because
|
||||
* we are waiting on those that have started to terminate
|
||||
* their ssh tunnels. :(
|
||||
*/
|
||||
if (mca_pls_rsh_component.num_children++ >=
|
||||
mca_pls_rsh_component.num_concurrent) {
|
||||
opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user