Fix ticket 353 - when debug-daemons is combined with a num_concurrent value in the pls rsh launcher that would cause deadlock, print a nice message explaining this and exit
This commit was SVN r12279.
Этот коммит содержится в:
родитель
cb622db7c9
Коммит
8636ac6a4d
@ -50,3 +50,12 @@ remote agents (typically rsh or ssh) to invoke concurrently while
|
|||||||
launching parallel jobs.
|
launching parallel jobs.
|
||||||
|
|
||||||
This value has automatically been reset to 1; processing will continue.
|
This value has automatically been reset to 1; processing will continue.
|
||||||
|
|
||||||
|
[deadlock-params]
|
||||||
|
The rsh launcher has been told to launch at most %d daemons concurrently
|
||||||
|
and the debug-daemons option is enabled. However, the total number of
|
||||||
|
daemons to launch (%d) is greater than this value. This is a scenario that
|
||||||
|
will cause the system to deadlock.
|
||||||
|
|
||||||
|
To avoid deadlock, either increase the number of concurrent daemons, or
|
||||||
|
remove the debug-daemons flag.
|
||||||
|
@ -62,6 +62,7 @@ struct orte_pls_rsh_component_t {
|
|||||||
orte_pls_base_component_t super;
|
orte_pls_base_component_t super;
|
||||||
bool debug;
|
bool debug;
|
||||||
bool debug_malloc;
|
bool debug_malloc;
|
||||||
|
bool debug_daemons;
|
||||||
bool reap;
|
bool reap;
|
||||||
bool assume_same_shell;
|
bool assume_same_shell;
|
||||||
int delay;
|
int delay;
|
||||||
|
@ -141,6 +141,11 @@ int orte_pls_rsh_component_open(void)
|
|||||||
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
|
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mca_base_param_reg_int("orte", "debug_daemons",
|
||||||
|
"Whether or not to enable debugging daemons (0 or 1)",
|
||||||
|
false, false, false, &tmp);
|
||||||
|
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
|
||||||
|
|
||||||
mca_base_param_reg_string(c, "orted",
|
mca_base_param_reg_string(c, "orted",
|
||||||
"The command name that the rsh pls component will invoke for the ORTE daemon",
|
"The command name that the rsh pls component will invoke for the ORTE daemon",
|
||||||
false, false, "orted",
|
false, false, "orted",
|
||||||
|
@ -420,6 +420,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
|||||||
|
|
||||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
|
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
|
||||||
|
|
||||||
|
if (mca_pls_rsh_component.debug_daemons &&
|
||||||
|
mca_pls_rsh_component.num_concurrent < num_nodes) {
|
||||||
|
/* we can't run in this situation, so pretty print the error
|
||||||
|
* and exit
|
||||||
|
*/
|
||||||
|
opal_show_help("help-pls-rsh.txt", "deadlock-params",
|
||||||
|
true, mca_pls_rsh_component.num_concurrent, num_nodes);
|
||||||
|
OBJ_RELEASE(map);
|
||||||
|
OBJ_DESTRUCT(&active_daemons);
|
||||||
|
return ORTE_ERR_FATAL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* After a discussion between Ralph & Jeff, we concluded that we
|
* After a discussion between Ralph & Jeff, we concluded that we
|
||||||
* really are handling the prefix dir option incorrectly. It currently
|
* really are handling the prefix dir option incorrectly. It currently
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user