1
1

Fix ticket 353 - print a clear message explaining that the combination of debug-daemons and num_concurrent in the pls rsh launcher will cause deadlock, and then exit

This commit was SVN r12279.
Этот коммит содержится в:
Ralph Castain 2006-10-24 15:59:02 +00:00
родитель cb622db7c9
Коммит 8636ac6a4d
4 изменённых файлов: 27 добавлений и 0 удалений

Просмотреть файл

@ -50,3 +50,12 @@ remote agents (typically rsh or ssh) to invoke concurrently while
launching parallel jobs. launching parallel jobs.
This value has automatically been reset to 1; processing will continue. This value has automatically been reset to 1; processing will continue.
[deadlock-params]
The rsh launcher has been limited to launching %d daemons concurrently
while the debug-daemons option is enabled. However, the total number of
daemons to launch (%d) is greater than this value. This scenario will
cause the system to deadlock.
To avoid deadlock, either increase the number of concurrent daemons, or
remove the debug-daemons flag.

Просмотреть файл

@ -62,6 +62,7 @@ struct orte_pls_rsh_component_t {
orte_pls_base_component_t super; orte_pls_base_component_t super;
bool debug; bool debug;
bool debug_malloc; bool debug_malloc;
bool debug_daemons;
bool reap; bool reap;
bool assume_same_shell; bool assume_same_shell;
int delay; int delay;

Просмотреть файл

@ -141,6 +141,11 @@ int orte_pls_rsh_component_open(void)
mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp); mca_pls_rsh_component.debug = OPAL_INT_TO_BOOL(tmp);
} }
mca_base_param_reg_int("orte", "debug_daemons",
"Whether or not to enable debugging daemons (0 or 1)",
false, false, false, &tmp);
mca_pls_rsh_component.debug_daemons = OPAL_INT_TO_BOOL(tmp);
mca_base_param_reg_string(c, "orted", mca_base_param_reg_string(c, "orted",
"The command name that the rsh pls component will invoke for the ORTE daemon", "The command name that the rsh pls component will invoke for the ORTE daemon",
false, false, "orted", false, false, "orted",

Просмотреть файл

@ -420,6 +420,18 @@ int orte_pls_rsh_launch(orte_jobid_t jobid)
num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes); num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes);
if (mca_pls_rsh_component.debug_daemons &&
mca_pls_rsh_component.num_concurrent < num_nodes) {
/* we can't run in this situation, so pretty print the error
* and exit
*/
opal_show_help("help-pls-rsh.txt", "deadlock-params",
true, mca_pls_rsh_component.num_concurrent, num_nodes);
OBJ_RELEASE(map);
OBJ_DESTRUCT(&active_daemons);
return ORTE_ERR_FATAL;
}
/* /*
* After a discussion between Ralph & Jeff, we concluded that we * After a discussion between Ralph & Jeff, we concluded that we
* really are handling the prefix dir option incorrectly. It currently * really are handling the prefix dir option incorrectly. It currently