Fix ticket 353 - when debug-daemons is combined with a num_concurrent value smaller than the number of daemons to launch, the pls rsh launcher would deadlock; print a clear error message and exit instead
This commit was SVN r12279.
Этот коммит содержится в:
родитель
cb622db7c9
Коммит
8636ac6a4d
@ -50,3 +50,12 @@ remote agents (typically rsh or ssh) to invoke concurrently while
|
||||
launching parallel jobs.
|
||||
|
||||
This value has automatically been reset to 1; processing will continue.
|
||||
|
||||
[deadlock-params]
|
||||
The rsh launcher has been told to launch at most %d daemons concurrently
|
||||
and the debug-daemons option is set. However, the total number of
|
||||
daemons to launch (%d) is greater than this value. This is a scenario that
|
||||
will cause the system to deadlock.
|
||||
|
||||
To avoid deadlock, either increase the number of concurrent daemons, or
|
||||
remove the debug-daemons flag.
|
||||
|
@ -62,6 +62,7 @@ struct orte_pls_rsh_component_t {
|
||||
orte_pls_base_component_t super;
|
||||
bool debug;
|
||||
bool debug_malloc;
|
||||
bool debug_daemons;
|
||||
bool reap;
|
||||
bool assume_same_shell;
|
||||
int delay;
|
||||
|
@ -141,6 +141,11 @@ int orte_pls_rsh_component_open(void)
|
||||
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
|
||||
}
|
||||
|
||||
mca_base_param_reg_int("orte", "debug_daemons",
|
||||
"Whether or not to enable debugging daemons (0 or 1)",
|
||||
false, false, false, &tmp);
|
||||
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
|
||||
|
||||
mca_base_param_reg_string(c, "orted",
|
||||
"The command name that the rsh pls component will invoke for the ORTE daemon",
|
||||
false, false, "orted",
|
||||
|
@ -420,6 +420,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
|
||||
|
||||
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
|
||||
|
||||
if (mca_pls_rsh_component.debug_daemons &&
|
||||
mca_pls_rsh_component.num_concurrent < num_nodes) {
|
||||
/* we can't run in this situation, so pretty print the error
|
||||
* and exit
|
||||
*/
|
||||
opal_show_help("help-pls-rsh.txt", "deadlock-params",
|
||||
true, mca_pls_rsh_component.num_concurrent, num_nodes);
|
||||
OBJ_RELEASE(map);
|
||||
OBJ_DESTRUCT(&active_daemons);
|
||||
return ORTE_ERR_FATAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* After a discussion between Ralph & Jeff, we concluded that we
|
||||
* really are handling the prefix dir option incorrectly. It currently
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user