Only set the state of the processes the daemon was responsible for to
ABORTED if the ssh that started the daemon exited abnormally. Otherwise, bad things happen if all the processes on that node exit before the processes on other nodes. This patch is bigger than it should be because I had to indent a bunch of code when I moved the if statement. This commit was SVN r5107.
Этот коммит содержится в:
родитель
13a4aee1a5
Коммит
5753c6a47f
@ -87,40 +87,46 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata)
|
||||
ompi_list_item_t* item;
|
||||
int rc;
|
||||
|
||||
/* get the mapping for our node so we can cancel the right things */
|
||||
OBJ_CONSTRUCT(&map, ompi_list_t);
|
||||
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
|
||||
info->jobid,
|
||||
info->node->node_name,
|
||||
&map);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
/* if ssh exited abnormally, set the child processes to aborted
|
||||
and print something useful to the user. The usual reasons for
|
||||
ssh to exit abnormally all are a pretty good indication that
|
||||
the child processes aren't going to start up properly.
|
||||
|
||||
/* set state of all processes associated with the daemon as terminated */
|
||||
for(item = ompi_list_get_first(&map);
|
||||
item != ompi_list_get_end(&map);
|
||||
item = ompi_list_get_next(item)) {
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
|
||||
size_t i;
|
||||
|
||||
for (i = 0 ; i < map->num_procs ; ++i) {
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
This should somehow be pushed up to the calling level, but we
|
||||
don't really have a way to do that just yet.
|
||||
*/
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
||||
/* get the mapping for our node so we can cancel the right things */
|
||||
OBJ_CONSTRUCT(&map, ompi_list_t);
|
||||
rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid,
|
||||
info->jobid,
|
||||
info->node->node_name,
|
||||
&map);
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&map);
|
||||
|
||||
/* set state of all processes associated with the daemon as
|
||||
terminated */
|
||||
for(item = ompi_list_get_first(&map);
|
||||
item != ompi_list_get_end(&map);
|
||||
item = ompi_list_get_next(item)) {
|
||||
orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item;
|
||||
size_t i;
|
||||
|
||||
for (i = 0 ; i < map->num_procs ; ++i) {
|
||||
rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name),
|
||||
ORTE_PROC_STATE_ABORTED, status);
|
||||
}
|
||||
if(ORTE_SUCCESS != rc) {
|
||||
ORTE_ERROR_LOG(rc);
|
||||
}
|
||||
}
|
||||
OBJ_DESTRUCT(&map);
|
||||
|
||||
cleanup:
|
||||
/* BWB - XXX - FIXME - this should be made prettier in some way. We
|
||||
have something of a problem here, since it's a callback, so we
|
||||
don't have a good way to propogate back up to the user :/ */
|
||||
/* tell the user something went wrong */
|
||||
if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) {
|
||||
/* tell the user something went wrong */
|
||||
ompi_output(0, "A daemon on node %s failed to start as expected."
|
||||
"There may be more information available above from the"
|
||||
"remote shell.", info->node->node_name);
|
||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user