1
1

154 строки
4.6 KiB
C
Исходник Обычный вид История

/*
* Copyright (c) 2004-2005 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
* All rights reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/** @file:
*
* The Open MPI State-of-Health Monitoring Subsystem
*
*/
#ifndef ORTE_SOH_H
#define ORTE_SOH_H
/*
* includes
*/
#include "orte_config.h"
#include "include/orte_constants.h"
#include "include/orte_types.h"
#include "mca/mca.h"
#include "mca/ns/ns_types.h"
#include "mca/soh/soh_types.h"
/*
* Component functions - all MUST be provided!
*/
/**
 * Query the state-of-health of a process.
 *
 * @param state   Location for the process state
 *                (presumably filled in by the callee -- TODO confirm).
 * @param status  Location for the integer status that accompanies the state
 *                (presumably filled in by the callee -- TODO confirm).
 * @param proc    Name of the process being queried.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_get_proc_soh_fn_t)(
    orte_proc_state_t *state,
    int *status,
    orte_process_name_t *proc);
/**
 * Set the state-of-health of a process.
 *
 * @param proc    Name of the process whose state-of-health is being set.
 * @param state   Process state to record.
 * @param status  Integer status to record alongside the state.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_set_proc_soh_fn_t)(
    orte_process_name_t *proc,
    orte_proc_state_t state,
    int status);
/**
 * Query the state-of-health (SOH) of a node.
 *
 * @param state     Location for the node state
 *                  (presumably filled in by the callee -- TODO confirm).
 * @param cell      Cell id used, together with nodename, to identify the node.
 * @param nodename  Name of the node being queried.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_get_node_soh_fn_t)(
    orte_node_state_t *state,
    orte_cellid_t cell,
    char *nodename);
/**
 * Set the state-of-health (SOH) of a node.
 *
 * @param cell      Cell id used, together with nodename, to identify the node.
 * @param nodename  Name of the node whose state-of-health is being set.
 * @param state     Node state to record.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_set_node_soh_fn_t)(
    orte_cellid_t cell,
    char *nodename,
    orte_node_state_t state);
/*
 * Commit note (SVN r6684, 2005-07-29 14:11:19 +00:00):
 *
 * Add a job_info segment to the system that holds a container for each job.
 * Within each container is a keyval indicating the job state (e.g., all procs
 * at stage1, finalized, etc.). This provides a rough state-of-health for the
 * job.
 *
 * This required a little fiddling with a number of areas. The biggest problem
 * was that it uncovered a potential for an infinite loop to be created in the
 * registry. If a callback function modified the registry, the registry
 * checked the triggers to see if anything had fired. If the original callback
 * was due to a trigger firing, that condition hadn't changed - so the trigger
 * fired again, which caused the callback to be called, which modified the
 * registry, which checked the triggers, and so on. Triggers are now checked
 * and then "flagged" as being "in process" so that the registry will NOT
 * recheck that trigger until all callbacks have been processed.
 *
 * Tried doing this with subscriptions as well, but that caused a problem:
 * when we release processes from a stagegate, they (at the moment)
 * immediately place data on the registry that should cause a subscription to
 * fire. Unfortunately, the system will just hang if that subscription doesn't
 * get processed. So the subscription system has been left alone - any
 * callback function that modifies the registry in a fashion that will fire a
 * subscription will indeed fire that subscription. We'll have to see if this
 * causes problems - it shouldn't, but a careless user could lock things up if
 * the callback generates a callback to itself.
 *
 * Also fixed the code that placed a process' RML contact info on the registry
 * to eliminate the leading '/' from the string.
 */
/**
 * Query the state-of-health of a job.
 *
 * @param state  Location for the job state
 *               (presumably filled in by the callee -- TODO confirm).
 * @param jobid  Id of the job being queried.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_get_job_soh_fn_t)(
    orte_job_state_t *state,
    orte_jobid_t jobid);
/**
 * Set the state-of-health of a job.
 *
 * @param jobid  Id of the job whose state-of-health is being set.
 * @param state  Job state to record.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_set_job_soh_fn_t)(
    orte_jobid_t jobid,
    orte_job_state_t state);
/**
 * Initiate monitoring of a job.
 *
 * Notifies the soh that it should begin monitoring the specified jobid. The
 * resource manager calls this once a job has been launched. The call gives
 * soh components (e.g., the BProc component, which monitors daemons via the
 * BProc-provided centralized alerting system) the opportunity to make the
 * connections they need in order to monitor the job.
 *
 * @param job  Id of the job to start monitoring.
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_begin_monitoring_fn_t)(
    orte_jobid_t job);
/**
 * Shut the module down nicely.
 *
 * Takes no arguments.
 *
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_module_finalize_fn_t)(
    void);
/* below are the prototypes needed by the MCA */
/*
* Ver 1.0.0
*/
struct orte_soh_base_module_1_0_0_t {
orte_soh_base_module_get_proc_soh_fn_t get_proc_soh;
orte_soh_base_module_set_proc_soh_fn_t set_proc_soh;
orte_soh_base_module_get_node_soh_fn_t get_node_soh;
orte_soh_base_module_set_node_soh_fn_t set_node_soh;
Add a job_info segment to the system that holds a container for each job. Within each container is a keyval indicating the job state (i.e., all procs at stage1, finalized, etc.). This provides a rough state-of-health for the job. This required a little fiddling with a number of areas. Biggest problem was that it uncovered a potential for an infinite loop to be created in the registry. If a callback function modified the registry, the registry checked the triggers to see if anything had fired. Well, if the original callback was due to a trigger firing, that condition hadn't changed - so the trigger fired again....which caused the callback to be called, which modified the registry, which checked the triggers, etc. etc. Triggers are now checked and then "flagged" as being "in process" so that the registry will NOT recheck that trigger until all callbacks have been processed. Tried doing this with subscriptions as well, but that caused a problem - when we release processes from a stagegate, they (at the moment) immediately place data on the registry that should cause a subscription to fire. Unfortunately, the system will just hang if that subscription doesn't get processed. So, I have left the subscription system alone - any callback function that modifies the registry in a fashion that will fire a subscription will indeed fire that subscription. We'll have to see if this causes problems - it shouldn't, but a careless user could lock things up if the callback generates a callback to itself. Also fixed the code that placed a process' RML contact info on the registry to eliminate the leading '/' from the string. This commit was SVN r6684.
2005-07-29 14:11:19 +00:00
orte_soh_base_module_get_job_soh_fn_t get_job_soh;
orte_soh_base_module_set_job_soh_fn_t set_job_soh;
orte_soh_base_module_begin_monitoring_fn_t begin_monitoring_job;
orte_soh_base_module_finalize_fn_t finalize;
};
typedef struct orte_soh_base_module_1_0_0_t orte_soh_base_module_1_0_0_t;
typedef orte_soh_base_module_1_0_0_t orte_soh_base_module_t;
/*
* SOH Component
*/
/**
 * Component initialization function.
 *
 * @param priority  Pointer to an int; presumably receives the component's
 *                  selection priority -- TODO confirm.
 * @return Pointer to an orte_soh_base_module_t (presumably the module this
 *         component offers, or NULL if it declines -- TODO confirm).
 */
typedef orte_soh_base_module_t *(*orte_soh_base_component_init_fn_t)(
    int *priority);
/**
 * Component finalization function; takes no arguments.
 *
 * @return int; presumably an ORTE status code -- TODO confirm.
 */
typedef int (*orte_soh_base_component_finalize_fn_t)(
    void);
/*
 * The standard soh component data structure (v1.0.0).
 *
 * Member order is part of the binary layout; do not reorder.
 */
typedef struct orte_soh_base_component_1_0_0_t {
    /* MCA base component descriptor */
    mca_base_component_t soh_version;
    /* MCA v1.0.0 base component data */
    mca_base_component_data_1_0_0_t soh_data;
    /* component initialization entry point */
    orte_soh_base_component_init_fn_t soh_init;
    /* component finalization entry point */
    orte_soh_base_component_finalize_fn_t soh_finalize;
} orte_soh_base_component_1_0_0_t;

/* Unversioned alias; currently refers to the v1.0.0 layout. */
typedef orte_soh_base_component_1_0_0_t orte_soh_base_component_t;
/*
 * Macro for use in components that are of type soh v1.0.0.
 *
 * Expands to a comma-separated initializer fragment:
 *   - MCA_BASE_VERSION_1_0_0   (soh v1.0 is chained to MCA v1.0)
 *   - "soh", 1, 0, 0           (soh v1.0)
 */
#define ORTE_SOH_BASE_VERSION_1_0_0 \
    MCA_BASE_VERSION_1_0_0,         \
    "soh", 1, 0, 0
/*
 * Global soh module instance: holds the selected module's function pointers.
 * Declared here only; the definition lives elsewhere.
 */
OMPI_DECLSPEC extern orte_soh_base_module_t orte_soh;
#endif /* ORTE_SOH_H */