Fix a cut-n-paste error that causes 'num_concurrent' to be
set to 1 or 0 instead of the user-defined number or the default (128). This caused the PLS to deadlock when using '--debug-daemons' with more than 2 processes. :( svn blame says that it was broken in r11347. It is *not* a problem on the v1.1 or v1.2 branches. Bug spotted by Tim Mattox and myself. This commit was SVN r11575. The following SVN revision numbers were found above: r11347 --> open-mpi/ompi@f52c10d18e
Этот коммит содержится в:
родитель
fb4d7ab268
Коммит
160120b4c5
@ -128,7 +128,8 @@ int orte_pls_rsh_component_open(void)
|
|||||||
true, tmp);
|
true, tmp);
|
||||||
tmp = 1;
|
tmp = 1;
|
||||||
}
|
}
|
||||||
mca_pls_rsh_component.num_concurrent = (tmp != 0 ? true : false);
|
mca_pls_rsh_component.num_concurrent = tmp;
|
||||||
|
|
||||||
if (mca_pls_rsh_component.debug == 0) {
|
if (mca_pls_rsh_component.debug == 0) {
|
||||||
mca_base_param_reg_int_name("orte", "debug",
|
mca_base_param_reg_int_name("orte", "debug",
|
||||||
"Whether or not to enable debugging output for all ORTE components (0 or 1)",
|
"Whether or not to enable debugging output for all ORTE components (0 or 1)",
|
||||||
|
@ -961,6 +961,17 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
rsh_daemon_info_t *daemon_info;
|
rsh_daemon_info_t *daemon_info;
|
||||||
|
|
||||||
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
||||||
|
/* JJH Bug:
|
||||||
|
* If we are in '--debug-daemons' we keep the ssh connection
|
||||||
|
* alive for the span of the run. If we use this option
|
||||||
|
* AND we launch on more than "num_concurrent" machines
|
||||||
|
* then we will deadlock. No connections are terminated
|
||||||
|
* until the job is complete, no job is started
|
||||||
|
* since all the orteds are waiting for all the others
|
||||||
|
* to come online, and the others are not launched because
|
||||||
|
* we are waiting on those that have started to terminate
|
||||||
|
* their ssh tunnels. :(
|
||||||
|
*/
|
||||||
if (mca_pls_rsh_component.num_children++ >=
|
if (mca_pls_rsh_component.num_children++ >=
|
||||||
mca_pls_rsh_component.num_concurrent) {
|
mca_pls_rsh_component.num_concurrent) {
|
||||||
opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);
|
opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock);
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user