Provide a generic fix for the termination issue instead of r25248. The
termination condition is to be checked at the daemon/HNP level, not down in the routing code. This commit was SVN r25313. The following SVN revision numbers were found above: r25248 --> open-mpi/ompi@b42ccc89b8
Этот коммит содержится в:
родитель
c453614f8b
Коммит
749b63c09d
@ -149,12 +149,13 @@ static int update_state(orte_jobid_t job,
|
|||||||
}
|
}
|
||||||
|
|
||||||
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
OPAL_OUTPUT_VERBOSE((10, orte_errmgr_base.output,
|
||||||
"errmgr:default_orted:update_state() %s) "
|
"errmgr:default_orted:update_state() %s) "
|
||||||
"------- %s state updated for process %s",
|
"------- %s state updated for process %s to %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
((NULL == proc) ? "App. Process" :
|
((NULL == proc) ? "App. Process" :
|
||||||
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
|
(proc->jobid == ORTE_PROC_MY_HNP->jobid ? "Daemon" : "App. Process")),
|
||||||
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
|
(NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc),
|
||||||
|
orte_proc_state_to_str(state)));
|
||||||
|
|
||||||
/* if this is a heartbeat failure, let the HNP handle it */
|
/* if this is a heartbeat failure, let the HNP handle it */
|
||||||
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
|
if (ORTE_JOB_STATE_HEARTBEAT_FAILED == jobstate ||
|
||||||
@ -276,6 +277,20 @@ static int update_state(orte_jobid_t job,
|
|||||||
*/
|
*/
|
||||||
orte_quit();
|
orte_quit();
|
||||||
}
|
}
|
||||||
|
/* was it a daemon that failed? */
|
||||||
|
if (proc->jobid == ORTE_PROC_MY_NAME->jobid) {
|
||||||
|
/* if all my routes are gone, then terminate ourselves */
|
||||||
|
if (0 == orte_routed.num_routes() &&
|
||||||
|
0 == opal_list_get_size(&orte_local_children)) {
|
||||||
|
orte_quit();
|
||||||
|
} else {
|
||||||
|
OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base.output,
|
||||||
|
"%s errmgr:orted not exiting, num_routes() == %d, num children == %d",
|
||||||
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
|
(int)orte_routed.num_routes(),
|
||||||
|
(int)opal_list_get_size(&orte_local_children)));
|
||||||
|
}
|
||||||
|
}
|
||||||
/* if not, then indicate we can continue */
|
/* if not, then indicate we can continue */
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -32,7 +32,6 @@
|
|||||||
#include "orte/util/nidmap.h"
|
#include "orte/util/nidmap.h"
|
||||||
#include "orte/runtime/orte_globals.h"
|
#include "orte/runtime/orte_globals.h"
|
||||||
#include "orte/runtime/orte_wait.h"
|
#include "orte/runtime/orte_wait.h"
|
||||||
#include "orte/runtime/orte_quit.h"
|
|
||||||
#include "orte/runtime/runtime.h"
|
#include "orte/runtime/runtime.h"
|
||||||
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
#include "orte/runtime/data_type_support/orte_dt_support.h"
|
||||||
|
|
||||||
@ -841,22 +840,15 @@ static int route_lost(const orte_process_name_t *route)
|
|||||||
item = opal_list_get_next(item)) {
|
item = opal_list_get_next(item)) {
|
||||||
child = (orte_routed_tree_t*)item;
|
child = (orte_routed_tree_t*)item;
|
||||||
if (child->vpid == route->vpid) {
|
if (child->vpid == route->vpid) {
|
||||||
OPAL_OUTPUT_VERBOSE((4, orte_routed_base_output,
|
OPAL_OUTPUT_VERBOSE((4, orte_routed_base_output,
|
||||||
"%s routed_binomial: removing route to child daemon %s",
|
"%s routed_binomial: removing route to child daemon %s",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
|
||||||
ORTE_NAME_PRINT(route)));
|
ORTE_NAME_PRINT(route)));
|
||||||
opal_list_remove_item(&my_children, item);
|
opal_list_remove_item(&my_children, item);
|
||||||
OBJ_RELEASE(item);
|
OBJ_RELEASE(item);
|
||||||
return ORTE_SUCCESS;
|
return ORTE_SUCCESS;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* if we are the HNP or daemon, AND we are terminating,
|
|
||||||
* then we want to finalize if all our child daemons
|
|
||||||
* have left
|
|
||||||
*/
|
|
||||||
if (orte_terminating && 0 == opal_list_get_size(&my_children)) {
|
|
||||||
orte_quit();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* we don't care about this one, so return success */
|
/* we don't care about this one, so return success */
|
||||||
|
@ -743,8 +743,6 @@ int orte_daemon_process_commands(orte_process_name_t* sender,
|
|||||||
opal_output(0, "%s orted_cmd: received exit cmd",
|
opal_output(0, "%s orted_cmd: received exit cmd",
|
||||||
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
|
||||||
}
|
}
|
||||||
/* flag that we are terminating */
|
|
||||||
orte_terminating = true;
|
|
||||||
/* kill the local procs */
|
/* kill the local procs */
|
||||||
orte_odls.kill_local_procs(NULL);
|
orte_odls.kill_local_procs(NULL);
|
||||||
/* trigger our appropriate exit procedure
|
/* trigger our appropriate exit procedure
|
||||||
|
@ -51,7 +51,6 @@
|
|||||||
*/
|
*/
|
||||||
bool orte_initialized = false;
|
bool orte_initialized = false;
|
||||||
bool orte_finalizing = false;
|
bool orte_finalizing = false;
|
||||||
bool orte_terminating = false;
|
|
||||||
bool orte_debug_flag = false;
|
bool orte_debug_flag = false;
|
||||||
int orte_debug_verbosity;
|
int orte_debug_verbosity;
|
||||||
char *orte_prohibited_session_dirs = NULL;
|
char *orte_prohibited_session_dirs = NULL;
|
||||||
|
@ -143,9 +143,6 @@ void orte_quit(void)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* flag that we are finalizing */
|
|
||||||
orte_finalizing = true;
|
|
||||||
|
|
||||||
/* whack any lingering session directory files from our jobs */
|
/* whack any lingering session directory files from our jobs */
|
||||||
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
|
||||||
|
|
||||||
|
@ -43,7 +43,6 @@ ORTE_DECLSPEC extern const char orte_version_string[];
|
|||||||
*/
|
*/
|
||||||
ORTE_DECLSPEC extern bool orte_initialized;
|
ORTE_DECLSPEC extern bool orte_initialized;
|
||||||
ORTE_DECLSPEC extern bool orte_finalizing;
|
ORTE_DECLSPEC extern bool orte_finalizing;
|
||||||
ORTE_DECLSPEC extern bool orte_terminating;
|
|
||||||
ORTE_DECLSPEC extern int orte_debug_output;
|
ORTE_DECLSPEC extern int orte_debug_output;
|
||||||
ORTE_DECLSPEC extern bool orte_debug_flag;
|
ORTE_DECLSPEC extern bool orte_debug_flag;
|
||||||
|
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user