Only set the state of the processes the daemon was responsible for to
ABORTED if the ssh that started the daemon exited abnormally. Otherwise, bad things happen if all the processes on that node exit before the processes on other nodes. This patch is bigger than it should be because I had to indent a bunch of code when I moved the if statement. This commit was SVN r5107.
Этот коммит содержится в:
родитель
13a4aee1a5
Коммит
5753c6a47f
@ -87,6 +87,15 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
ompi_list_item_t* item;
|
||||
int rc;
|
||||
|
||||
/* if ssh exited abnormally, set the child processes to aborted
|
||||
and print something useful to the user. The usual reasons for
|
||||
ssh to exit abnormally are all a pretty good indication that
|
||||
the child processes aren't going to start up properly.
|
||||
|
||||
This should somehow be pushed up to the calling level, but we
|
||||
don't really have a way to do that just yet.
|
||||
*/
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
||||
/* get the mapping for our node so we can cancel the right things */
|
||||
OBJ_CONSTRUCT(&map, ompi_list_t);
|
||||
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
|
||||
@ -98,7 +107,8 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
/* set state of all processes associated with the daemon as terminated */
|
||||
/* set state of all processes associated with the daemon as
|
||||
terminated */
|
||||
for(item = ompi_list_get_first(&map);
|
||||
item != ompi_list_get_end(&map);
|
||||
item = ompi_list_get_next(item)) {
|
||||
@ -116,11 +126,7 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
OBJ_DESTRUCT(&map);
|
||||
|
||||
cleanup:
|
||||
/* BWB - XXX - FIXME - this should be made prettier in some way. We
|
||||
have something of a problem here, since it's a callback, so we
|
||||
don't have a good way to propagate back up to the user :/ */
|
||||
/* tell the user something went wrong */
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
||||
ompi_output(0, "A daemon on node %s failed to start as expected."
|
||||
"There may be more information available above from the"
|
||||
"remote shell.", info->node->node_name);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user