From 5753c6a47f2e087576a02058c102d60dad33c42c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Thu, 31 Mar 2005 04:23:55 +0000 Subject: [PATCH] Only set the state of the processes the daemon was responsible for to ABORTED if the ssh that started the daemon exited abnormally. Otherwise, bad things happen if all the processes on that node exit before the processes on other nodes. This patch is bigger than it should be because I had to indent a bunch of code when I moved the if statement. This commit was SVN r5107. --- src/mca/pls/rsh/pls_rsh_module.c | 62 +++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/src/mca/pls/rsh/pls_rsh_module.c b/src/mca/pls/rsh/pls_rsh_module.c index d9abe40e4e..d372a2bf91 100644 --- a/src/mca/pls/rsh/pls_rsh_module.c +++ b/src/mca/pls/rsh/pls_rsh_module.c @@ -87,40 +87,46 @@ static void orte_pls_rsh_wait_daemon(pid_t pid, int status, void* cbdata) ompi_list_item_t* item; int rc; - /* get the mapping for our node so we can cancel the right things */ - OBJ_CONSTRUCT(&map, ompi_list_t); - rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid, - info->jobid, - info->node->node_name, - &map); - if(ORTE_SUCCESS != rc) { - ORTE_ERROR_LOG(rc); - goto cleanup; - } + /* if ssh exited abnormally, set the child processes to aborted + and print something useful to the user. The usual reasons for + ssh to exit abnormally all are a pretty good indication that + the child processes aren't going to start up properly. - /* set state of all processes associated with the daemon as terminated */ - for(item = ompi_list_get_first(&map); - item != ompi_list_get_end(&map); - item = ompi_list_get_next(item)) { - orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item; - size_t i; - - for (i = 0 ; i < map->num_procs ; ++i) { - rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name), - ORTE_PROC_STATE_ABORTED, status); - } + This should somehow be pushed up to the calling level, but we + don't really have a way to do that just yet. + */ + if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { + /* get the mapping for our node so we can cancel the right things */ + OBJ_CONSTRUCT(&map, ompi_list_t); + rc = orte_rmaps_base_get_node_map(orte_process_info.my_name->cellid, + info->jobid, + info->node->node_name, + &map); if(ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); + goto cleanup; } - } - OBJ_DESTRUCT(&map); + + /* set state of all processes associated with the daemon as + terminated */ + for(item = ompi_list_get_first(&map); + item != ompi_list_get_end(&map); + item = ompi_list_get_next(item)) { + orte_rmaps_base_map_t* map = (orte_rmaps_base_map_t*) item; + size_t i; + + for (i = 0 ; i < map->num_procs ; ++i) { + rc = orte_soh.set_proc_soh(&(map->procs[i]->proc_name), + ORTE_PROC_STATE_ABORTED, status); + } + if(ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + } + } + OBJ_DESTRUCT(&map); cleanup: - /* BWB - XXX - FIXME - this should be made prettier in some way. We - have something of a problem here, since it's a callback, so we - don't have a good way to propogate back up to the user :/ */ - /* tell the user something went wrong */ - if (! WIFEXITED(status) || ! WEXITSTATUS(status) == 0) { + /* tell the user something went wrong */ ompi_output(0, "A daemon on node %s failed to start as expected." "There may be more information available above from the" "remote shell.", info->node->node_name);