1
1

Although this didn't solve the earlier termination problem, this code will be required once connection terminations are properly detected. If a daemon (or the HNP) is trying to terminate, we need to check for termination conditions whenever a route is lost: once all child connections are gone, we are free to finalize.

This commit was SVN r25248.
Этот коммит содержится в:
Ralph Castain 2011-10-10 21:41:49 +00:00
родитель 1aa1c2e9b4
Коммит b42ccc89b8
5 изменённых файлов: 19 добавлений и 0 удалений

Просмотреть файл

@ -32,6 +32,7 @@
#include "orte/util/nidmap.h" #include "orte/util/nidmap.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h" #include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/runtime.h" #include "orte/runtime/runtime.h"
#include "orte/runtime/data_type_support/orte_dt_support.h" #include "orte/runtime/data_type_support/orte_dt_support.h"
@ -830,11 +831,22 @@ static int route_lost(const orte_process_name_t *route)
item = opal_list_get_next(item)) { item = opal_list_get_next(item)) {
child = (orte_routed_tree_t*)item; child = (orte_routed_tree_t*)item;
if (child->vpid == route->vpid) { if (child->vpid == route->vpid) {
OPAL_OUTPUT_VERBOSE((4, orte_routed_base_output,
"%s routed_binomial: removing route to child daemon %s",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
ORTE_NAME_PRINT(route)));
opal_list_remove_item(&my_children, item); opal_list_remove_item(&my_children, item);
OBJ_RELEASE(item); OBJ_RELEASE(item);
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
} }
/* if we are the HNP or daemon, AND we are terminating,
* then we want to finalize if all our child daemons
* have left
*/
if (orte_terminating && 0 == opal_list_get_size(&my_children)) {
orte_quit();
}
} }
/* we don't care about this one, so return success */ /* we don't care about this one, so return success */

Просмотреть файл

@ -743,6 +743,8 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
opal_output(0, "%s orted_cmd: received exit cmd", opal_output(0, "%s orted_cmd: received exit cmd",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
} }
/* flag that we are terminating */
orte_terminating = true;
/* kill the local procs */ /* kill the local procs */
orte_odls.kill_local_procs(NULL); orte_odls.kill_local_procs(NULL);
/* trigger our appropriate exit procedure /* trigger our appropriate exit procedure

Просмотреть файл

@ -52,6 +52,7 @@
*/ */
bool orte_initialized = false; bool orte_initialized = false;
bool orte_finalizing = false; bool orte_finalizing = false;
bool orte_terminating = false;
bool orte_debug_flag = false; bool orte_debug_flag = false;
int orte_debug_verbosity; int orte_debug_verbosity;
char *orte_prohibited_session_dirs = NULL; char *orte_prohibited_session_dirs = NULL;

Просмотреть файл

@ -143,6 +143,9 @@ void orte_quit(void)
return; return;
} }
/* flag that we are finalizing */
orte_finalizing = true;
/* whack any lingering session directory files from our jobs */ /* whack any lingering session directory files from our jobs */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

Просмотреть файл

@ -43,6 +43,7 @@ ORTE_DECLSPEC extern const char orte_version_string[];
*/ */
ORTE_DECLSPEC extern bool orte_initialized; ORTE_DECLSPEC extern bool orte_initialized;
ORTE_DECLSPEC extern bool orte_finalizing; ORTE_DECLSPEC extern bool orte_finalizing;
ORTE_DECLSPEC extern bool orte_terminating;
ORTE_DECLSPEC extern int orte_debug_output; ORTE_DECLSPEC extern int orte_debug_output;
ORTE_DECLSPEC extern bool orte_debug_flag; ORTE_DECLSPEC extern bool orte_debug_flag;