Move the comment to the right place.
This commit was SVN r14237.
Этот коммит содержится в:
родитель
5c355d0bea
Коммит
33bf6c6e54
@ -477,8 +477,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
|
|
||||||
if (mca_pls_rsh_component.debug_daemons &&
|
if (mca_pls_rsh_component.debug_daemons &&
|
||||||
mca_pls_rsh_component.num_concurrent < num_nodes) {
|
mca_pls_rsh_component.num_concurrent < num_nodes) {
|
||||||
/* we can't run in this situation, so pretty print the error
|
/**
|
||||||
* and exit
|
* If we are in '--debug-daemons' we keep the ssh connection
|
||||||
|
* alive for the span of the run. If we use this option
|
||||||
|
* AND we launch on more than "num_concurrent" machines
|
||||||
|
* then we will deadlock. No connections are terminated
|
||||||
|
* until the job is complete, no job is started
|
||||||
|
* since all the orteds are waiting for all the others
|
||||||
|
* to come online, and the others are not launched because
|
||||||
|
* we are waiting on those that have started to terminate
|
||||||
|
* their ssh tunnels. :(
|
||||||
|
* As we cannot run in this situation, pretty print the error
|
||||||
|
* and return an error code.
|
||||||
*/
|
*/
|
||||||
opal_show_help("help-pls-rsh.txt", "deadlock-params",
|
opal_show_help("help-pls-rsh.txt", "deadlock-params",
|
||||||
true, mca_pls_rsh_component.num_concurrent, num_nodes);
|
true, mca_pls_rsh_component.num_concurrent, num_nodes);
|
||||||
@ -1084,16 +1094,9 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
|
|
||||||
} else { /* father */
|
} else { /* father */
|
||||||
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock);
|
||||||
/* JJH Bug:
|
/* This situation can lead to a deadlock if '--debug-daemons' is set.
|
||||||
* If we are in '--debug-daemons' we keep the ssh connection
|
* However, the deadlock condition is tested at the beginning of this
|
||||||
* alive for the span of the run. If we use this option
|
* function, so we're quite confident it should not happen here.
|
||||||
* AND we launch on more than "num_concurrent" machines
|
|
||||||
* then we will deadlock. No connections are terminated
|
|
||||||
* until the job is complete, no job is started
|
|
||||||
* since all the orteds are waiting for all the others
|
|
||||||
* to come online, and the others are not launched because
|
|
||||||
* we are waiting on those that have started to terminate
|
|
||||||
* their ssh tunnels. :(
|
|
||||||
*/
|
*/
|
||||||
if (mca_pls_rsh_component.num_children++ >=
|
if (mca_pls_rsh_component.num_children++ >=
|
||||||
mca_pls_rsh_component.num_concurrent) {
|
mca_pls_rsh_component.num_concurrent) {
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user