Add an API to the errmgr so that apps can register for a callback that warns them of an impending migration. This gives apps a chance to terminate cleanly before being migrated for external reasons (e.g., impending failures). The timeout provided tells the daemon how long it should wait before proceeding to kill/migrate the process; if the process fails to exit before that time, the daemon will kill it.
This commit was SVN r24412.
Этот коммит содержится в:
родитель
a0f6e153c7
Коммит
b98a2917ff
@ -56,7 +56,8 @@ orte_errmgr_base_module_t orte_errmgr_app_module = {
|
||||
update_state,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
NULL,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
};
|
||||
|
||||
/************************
|
||||
|
@ -250,6 +250,11 @@ int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
*/
|
||||
return ORTE_SUCCESS;
|
||||
}
|
||||
/**
 * Base (no-op) implementation of the migration-warning registration hook.
 *
 * This stub performs no work: it neither reads nor writes through @p tv,
 * so any pointer value (including NULL) is accepted.
 *
 * @param[in] tv  Delay value supplied by the caller; ignored here.
 */
void orte_errmgr_base_register_migration_warning(struct timeval *tv)
{
    /* stub function - ignore */
    (void)tv;   /* explicitly discard so -Wunused-parameter stays quiet */
}
|
||||
|
||||
/********************
|
||||
* Utility functions
|
||||
|
@ -63,7 +63,8 @@ orte_errmgr_base_module_t orte_errmgr = {
|
||||
orte_errmgr_base_update_state,
|
||||
NULL, /* predicted_fault */
|
||||
NULL, /* suggest_map_targets */
|
||||
NULL /* ft_event */
|
||||
NULL, /* ft_event */
|
||||
orte_errmgr_base_register_migration_warning
|
||||
};
|
||||
|
||||
/**
|
||||
|
@ -28,6 +28,10 @@
|
||||
#include "orte/constants.h"
|
||||
#include "orte/types.h"
|
||||
|
||||
#ifdef HAVE_UNISTD_H
|
||||
#include <unistd.h>
|
||||
#endif /* HAVE_UNISTD_H */
|
||||
|
||||
#include "opal/dss/dss_types.h"
|
||||
#include "orte/mca/plm/plm_types.h"
|
||||
#include "orte/runtime/orte_globals.h"
|
||||
@ -71,5 +75,7 @@ ORTE_DECLSPEC int orte_errmgr_base_update_state(orte_jobid_t job,
|
||||
pid_t pid,
|
||||
orte_exit_code_t exit_code);
|
||||
|
||||
ORTE_DECLSPEC void orte_errmgr_base_register_migration_warning(struct timeval *tv);
|
||||
|
||||
END_C_DECLS
|
||||
#endif
|
||||
|
@ -231,6 +231,18 @@ typedef int (*orte_errmgr_base_module_suggest_map_targets_fn_t)(orte_proc_t *pro
|
||||
*/
|
||||
typedef int (*orte_errmgr_base_module_ft_event_fn_t)(int state);
|
||||
|
||||
/**
|
||||
* Register a callback to alert caller when ORTE is preparing to
|
||||
* migrate the process to another location. This provides an
|
||||
* opportunity for the process to checkpoint any required state,
|
||||
* and to shut down cleanly.
|
||||
*
|
||||
* @param[in] tv Time to delay before assuming process is stuck
|
||||
* and cannot exit on its own - and thus, go
|
||||
* ahead and migrate it
|
||||
*/
|
||||
typedef void (*orte_errmgr_base_module_register_migration_warning_fn_t)(struct timeval *tv);
|
||||
|
||||
/*
|
||||
* Module Structure
|
||||
*/
|
||||
@ -252,6 +264,9 @@ struct orte_errmgr_base_module_2_3_0_t {
|
||||
|
||||
/** Handle any FT Notifications */
|
||||
orte_errmgr_base_module_ft_event_fn_t ft_event;
|
||||
|
||||
/* Register to be warned of impending migration */
|
||||
orte_errmgr_base_module_register_migration_warning_fn_t register_migration_warning;
|
||||
};
|
||||
typedef struct orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_2_3_0_t;
|
||||
typedef orte_errmgr_base_module_2_3_0_t orte_errmgr_base_module_t;
|
||||
|
@ -72,7 +72,8 @@ static orte_errmgr_base_module_t global_module = {
|
||||
/* Suggest proc to node mapping */
|
||||
orte_errmgr_hnp_global_suggest_map_targets,
|
||||
/* FT Event hook */
|
||||
orte_errmgr_hnp_global_ft_event
|
||||
orte_errmgr_hnp_global_ft_event,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
};
|
||||
|
||||
|
||||
|
@ -92,7 +92,8 @@ orte_errmgr_base_module_t orte_errmgr_orted_module = {
|
||||
update_state,
|
||||
predicted_fault,
|
||||
suggest_map_targets,
|
||||
ft_event
|
||||
ft_event,
|
||||
orte_errmgr_base_register_migration_warning
|
||||
};
|
||||
|
||||
/************************
|
||||
|
Загрузка…
Ссылка в новой задаче
Block a user