
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (the rate is an MCA param), then the errmgr is notified. 2. memory usage - checks the amount of memory used by a process. The limit and sampling rate can be set. This support must be enabled by configuring with --enable-sensors. ompi_info and orte-info have been updated to include the new framework. Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and the various ERRMGR framework globals have been organized into a single struct, as we do in the other ORTE frameworks. Also fixes an oversight in the ERRMGR framework where a pointer array was constructed, but not initialized. Implementation continues. This commit was SVN r23043.
84 строки
1.9 KiB
C
84 строки
1.9 KiB
C
/*
|
|
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "opal/util/output.h"
|
|
|
|
#include "orte/mca/errmgr/errmgr.h"
|
|
#include "orte/mca/errmgr/base/base.h"
|
|
#include "errmgr_hnp.h"
|
|
|
|
/*
|
|
* Public string for version number
|
|
*/
|
|
const char *orte_errmgr_hnp_component_version_string =
|
|
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
|
|
|
|
/*
|
|
* Local functionality
|
|
*/
|
|
static int errmgr_hnp_open(void);
|
|
static int errmgr_hnp_close(void);
|
|
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
|
|
|
|
/*
|
|
* Instantiate the public struct with all of our public information
|
|
* and pointer to our public functions in it
|
|
*/
|
|
orte_errmgr_base_component_t mca_errmgr_hnp_component =
|
|
{
|
|
/* Handle the general mca_component_t struct containing
|
|
* meta information about the component ithnp
|
|
*/
|
|
{
|
|
ORTE_ERRMGR_BASE_VERSION_3_0_0,
|
|
/* Component name and version */
|
|
"hnp",
|
|
ORTE_MAJOR_VERSION,
|
|
ORTE_MINOR_VERSION,
|
|
ORTE_RELEASE_VERSION,
|
|
|
|
/* Component open and close functions */
|
|
errmgr_hnp_open,
|
|
errmgr_hnp_close,
|
|
errmgr_hnp_component_query
|
|
},
|
|
{
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
}
|
|
};
|
|
|
|
static int errmgr_hnp_open(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int errmgr_hnp_close(void)
|
|
{
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
/*
 * Component query hook: decide whether this component offers its module
 * to the calling process.
 *
 * module   - out: set to the hnp errmgr module when ORTE_PROC_IS_HNP is
 *            true, otherwise set to NULL.
 * priority - out: set to 10 when the module is offered, otherwise -1.
 *
 * Returns ORTE_SUCCESS when the module is offered and ORTE_ERROR when it
 * is not.
 */
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
    /* Not the HNP: decline by handing back no module. */
    if (!(ORTE_PROC_IS_HNP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* The priority is deliberately low so that other modules rank
     * higher and will run before this one.
     */
    *priority = 10;
    *module = (mca_base_module_t *)&orte_errmgr_hnp_module;
    return ORTE_SUCCESS;
}
|