1
1

Per discussion at the Dec ORTE design meeting, add a "set_lifeline" API to the orte_routed framework. This allows the caller to define a "lifeline" process so that, if the connection to that lifeline is subsequently lost, the process will be terminated. This helps tools that connect to an mpirun to know when that mpirun completes and terminates.

This commit was SVN r20158.
Этот коммит содержится в:
Ralph Castain 2008-12-20 23:23:11 +00:00
родитель 4d5fbc5955
Коммит 9f6c1b9d07
4 изменённых файлов: 55 добавлений и 0 удалений

Просмотреть файл

@ -47,6 +47,7 @@ static int update_routing_tree(void);
static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children);
static bool proc_is_below(orte_vpid_t root, orte_vpid_t target);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
#if OPAL_ENABLE_FT == 1
static int binomial_ft_event(int state);
@ -61,6 +62,7 @@ orte_routed_module_t orte_routed_binomial_module = {
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_tree,
get_routing_tree,
proc_is_below,
@ -78,6 +80,7 @@ static orte_process_name_t wildcard_route;
static opal_condition_t cond;
static opal_mutex_t lock;
/* currently designated lifeline; NULL until one has been set */
static orte_process_name_t *lifeline=NULL;
/* private storage for the name handed to set_lifeline(), which copies
 * the caller's data here and then points lifeline at this copy
 */
static orte_process_name_t local_lifeline;
static orte_process_name_t my_parent;
static int num_children;
static opal_list_t my_children;
@ -759,6 +762,18 @@ static bool route_is_defined(const orte_process_name_t *target)
return true;
}
/*
 * Designate proc as the lifeline.
 *
 * The jobid and vpid of the given name are copied into file-scope
 * storage and the lifeline pointer is aimed at that copy, so the
 * caller's object does not need to outlive this call.
 *
 * Always returns ORTE_SUCCESS.
 */
static int set_lifeline(orte_process_name_t *proc)
{
    orte_process_name_t *slot = &local_lifeline;

    /* keep a private copy - the caller's data may not be preserved */
    slot->vpid = proc->vpid;
    slot->jobid = proc->jobid;

    /* only now make the copy the active lifeline */
    lifeline = slot;

    return ORTE_SUCCESS;
}
static int binomial_tree(int rank, int parent, int me, int num_procs,
int *nchildren, opal_list_t *childrn, opal_bitmap_t *relatives)
{

Просмотреть файл

@ -45,6 +45,7 @@ static int update_routing_tree(void);
static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children);
static bool proc_is_below(orte_vpid_t root, orte_vpid_t target);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
#if OPAL_ENABLE_FT == 1
static int linear_ft_event(int state);
@ -59,6 +60,7 @@ orte_routed_module_t orte_routed_linear_module = {
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_tree,
get_routing_tree,
proc_is_below,
@ -76,6 +78,7 @@ static orte_process_name_t wildcard_route;
static opal_condition_t cond;
static opal_mutex_t lock;
/* currently designated lifeline; NULL until one has been set */
static orte_process_name_t *lifeline=NULL;
/* private storage for the name handed to set_lifeline(), which copies
 * the caller's data here and then points lifeline at this copy
 */
static orte_process_name_t local_lifeline;
static bool ack_recvd;
@ -753,6 +756,18 @@ static bool route_is_defined(const orte_process_name_t *target)
}
/*
 * Designate proc as the lifeline.
 *
 * The jobid and vpid of the given name are copied into file-scope
 * storage and the lifeline pointer is aimed at that copy, so the
 * caller's object does not need to outlive this call.
 *
 * Always returns ORTE_SUCCESS.
 */
static int set_lifeline(orte_process_name_t *proc)
{
    orte_process_name_t *slot = &local_lifeline;

    /* keep a private copy - the caller's data may not be preserved */
    slot->vpid = proc->vpid;
    slot->jobid = proc->jobid;

    /* only now make the copy the active lifeline */
    lifeline = slot;

    return ORTE_SUCCESS;
}
static int update_routing_tree(void)
{
/* if I am anything other than a daemon or the HNP, this

Просмотреть файл

@ -48,6 +48,7 @@ static int update_routing_tree(void);
static orte_vpid_t get_routing_tree(orte_jobid_t job, opal_list_t *children);
static bool proc_is_below(orte_vpid_t root, orte_vpid_t target);
static int get_wireup_info(opal_buffer_t *buf);
static int set_lifeline(orte_process_name_t *proc);
#if OPAL_ENABLE_FT == 1
static int radix_ft_event(int state);
@ -62,6 +63,7 @@ orte_routed_module_t orte_routed_radix_module = {
init_routes,
route_lost,
route_is_defined,
set_lifeline,
update_routing_tree,
get_routing_tree,
proc_is_below,
@ -79,6 +81,7 @@ static orte_process_name_t wildcard_route;
static opal_condition_t cond;
static opal_mutex_t lock;
/* currently designated lifeline; NULL until one has been set */
static orte_process_name_t *lifeline=NULL;
/* private storage for the name handed to set_lifeline(), which copies
 * the caller's data here and then points lifeline at this copy
 */
static orte_process_name_t local_lifeline;
static orte_process_name_t my_parent;
static int num_children;
static opal_list_t my_children;
@ -785,6 +788,18 @@ static bool route_is_defined(const orte_process_name_t *target)
return true;
}
/*
 * Designate proc as the lifeline.
 *
 * The jobid and vpid of the given name are copied into file-scope
 * storage and the lifeline pointer is aimed at that copy, so the
 * caller's object does not need to outlive this call.
 *
 * Always returns ORTE_SUCCESS.
 */
static int set_lifeline(orte_process_name_t *proc)
{
    orte_process_name_t *slot = &local_lifeline;

    /* keep a private copy - the caller's data may not be preserved */
    slot->vpid = proc->vpid;
    slot->jobid = proc->jobid;

    /* only now make the copy the active lifeline */
    lifeline = slot;

    return ORTE_SUCCESS;
}
static void radix_tree(int rank, int *num_children,
opal_list_t *children, opal_bitmap_t *relatives)
{

Просмотреть файл

@ -223,6 +223,15 @@ typedef orte_vpid_t (*orte_routed_module_get_routing_tree_fn_t)(orte_jobid_t job
*/
typedef bool (*orte_routed_module_proc_is_below_fn_t)(orte_vpid_t root, orte_vpid_t target);
/**
 * Set lifeline process
 *
 * Defines the lifeline to be the specified process. Should contact with
 * that process be lost, the errmgr will be called, possibly resulting
 * in termination of the process and job.
 *
 * @param proc  Name of the process to be treated as the lifeline.
 *
 * @return ORTE_SUCCESS when the lifeline has been recorded.
 *         NOTE(review): no error return is defined here - confirm
 *         whether components are allowed to fail this call.
 */
typedef int (*orte_routed_module_set_lifeline_fn_t)(orte_process_name_t *proc);
/**
* Handle fault tolerance updates
*
@ -254,6 +263,7 @@ struct orte_routed_module_t {
orte_routed_module_init_routes_fn_t init_routes;
orte_routed_module_route_lost_fn_t route_lost;
orte_routed_module_route_is_defined_fn_t route_is_defined;
orte_routed_module_set_lifeline_fn_t set_lifeline;
/* fns for daemons */
orte_routed_module_update_routing_tree_fn_t update_routing_tree;
orte_routed_module_get_routing_tree_fn_t get_routing_tree;