1
1

Fix ticket 353 - in the pls rsh launcher, print out a nice message that the combination of debug-daemons and a num_concurrent smaller than the number of daemons to launch will cause deadlock, and then exit

This commit was SVN r12279.
Этот коммит содержится в:
Ralph Castain 2006-10-24 15:59:02 +00:00
родитель cb622db7c9
Коммит 8636ac6a4d
4 изменённых файлов: 27 добавлений и 0 удалений

Просмотреть файл

@ -50,3 +50,12 @@ remote agents (typically rsh or ssh) to invoke concurrently while
launching parallel jobs.
This value has automatically been reset to 1; processing will continue.
[deadlock-params]
The rsh launcher has been asked to launch at most %d daemons concurrently
while the debug-daemons option is enabled. However, the total number of
daemons to launch (%d) is greater than this value. This is a scenario that
will cause the system to deadlock.
To avoid deadlock, either increase the number of concurrent daemons, or
remove the debug-daemons flag.

Просмотреть файл

@ -62,6 +62,7 @@ struct orte_pls_rsh_component_t {
orte_pls_base_component_t super;
bool debug;
bool debug_malloc;
bool debug_daemons;
bool reap;
bool assume_same_shell;
int delay;

Просмотреть файл

@ -141,6 +141,11 @@ int orte_pls_rsh_component_open(void)
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
}
mca_base_param_reg_int("orte", "debug_daemons",
"Whether or not to enable debugging daemons (0 or 1)",
false, false, false, &tmp);
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_string(c, "orted",
"The command name that the rsh pls component will invoke for the ORTE daemon",
false, false, "orted",

Просмотреть файл

@ -420,6 +420,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (mca_pls_rsh_component.debug_daemons &&
mca_pls_rsh_component.num_concurrent < num_nodes) {
/* we can't run in this situation, so pretty print the error
* and exit
*/
opal_show_help("help-pls-rsh.txt", "deadlock-params",
true, mca_pls_rsh_component.num_concurrent, num_nodes);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_ERR_FATAL;
}
/*
* After a discussion between Ralph & Jeff, we concluded that we
* really are handling the prefix dir option incorrectly. It currently