1
1
openmpi/orte/mca/errmgr/hnp/errmgr_hnp_component.c
Ralph Castain b9893aacc5 Add a sensor framework to ORTE that monitors applications and notifies the errmgr when they exceed specified boundaries. Two modules are included here:
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (rate is an mca param), then the errmgr is notified.

2. memory usage - checks amount of memory used by a process. Limit and sampling rate can be set.

This support must be enabled by configuring --enable-sensors.

ompi_info and orte-info have been updated to include the new framework.

Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and the various ERRMGR framework globals have been organized into a single struct, as we do in the other ORTE frameworks. Also fixes an oversight in the ERRMGR framework where a pointer array was constructed, but not initialized.

Implementation continues.

This commit was SVN r23043.
2010-04-26 22:15:57 +00:00

84 строки
1.9 KiB
C

/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "errmgr_hnp.h"
/*
 * Public string for version number: a fixed descriptive label with
 * ORTE_VERSION appended through adjacent string-literal concatenation.
 */
const char *orte_errmgr_hnp_component_version_string =
"ORTE ERRMGR hnp MCA component version " ORTE_VERSION;
/*
 * Local functionality: forward declarations of the file-local component
 * hooks (open, close, query). They are declared here so the component
 * struct can reference them before their definitions later in this file.
 */
static int errmgr_hnp_open(void);
static int errmgr_hnp_close(void);
static int errmgr_hnp_component_query(mca_base_module_t **module, int *priority);
/*
 * Instantiate the public struct with all of our public information
 * and pointers to our public functions in it
 */
orte_errmgr_base_component_t mca_errmgr_hnp_component =
{
/* Handle the general mca_component_t struct containing
 * meta information about the component itself
 */
{
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnp",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open, close, and query functions */
errmgr_hnp_open,
errmgr_hnp_close,
errmgr_hnp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
}
};
/**
 * Component open hook.
 *
 * This component has nothing to set up when it is opened, so the
 * hook simply reports success.
 *
 * @return ORTE_SUCCESS, unconditionally.
 */
static int
errmgr_hnp_open(void)
{
    return ORTE_SUCCESS;
}
/**
 * Component close hook.
 *
 * This component holds nothing that needs releasing when it is closed,
 * so the hook simply reports success.
 *
 * @return ORTE_SUCCESS, unconditionally.
 */
static int
errmgr_hnp_close(void)
{
    return ORTE_SUCCESS;
}
/**
 * Component query hook: decide whether this component's module may be
 * used by the current process.
 *
 * @param module    [out] Receives the address of orte_errmgr_hnp_module
 *                  when the process is an HNP, NULL otherwise.
 * @param priority  [out] Receives 10 when the process is an HNP,
 *                  -1 otherwise.
 *
 * @return ORTE_SUCCESS when ORTE_PROC_IS_HNP is true, ORTE_ERROR otherwise.
 */
static int
errmgr_hnp_component_query(mca_base_module_t **module, int *priority)
{
    /* Anything that is not the HNP declines: no module, negative priority. */
    if (!(ORTE_PROC_IS_HNP)) {
        *priority = -1;
        *module = NULL;
        return ORTE_ERROR;
    }

    /* Deliberately low priority, so that other modules rank higher
     * and will run before this one.
     */
    *priority = 10;
    *module = (mca_base_module_t *)&orte_errmgr_hnp_module;
    return ORTE_SUCCESS;
}