1
1
openmpi/orte/mca/errmgr/base/base.h
Ralph Castain b9893aacc5 Add a sensor framework to ORTE that monitors applications and notifies the errmgr when they exceed specified boundaries. Two modules are included here:
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (rate is an mca param), then the errmgr is notified.

2. memory usage - checks amount of memory used by a process. Limit and sampling rate can be set.

This support must be enabled by configuring --enable-sensors.

ompi_info and orte-info have been updated to include the new framework.

Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and organizes the various ERRMGR framework globals into a single struct as we do in the other ORTE frameworks. Fix an oversight in the ERRMGR framework where a pointer array was constructed, but not initialized.

Implementation continues.

This commit was SVN r23043.
2010-04-26 22:15:57 +00:00

66 строки
1.9 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
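/** @file
* Base declarations for the ORTE error manager (errmgr) MCA framework:
* framework open/select/close entry points, composite stack state flags,
* and the list of available components.
*/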
#ifndef ORTE_MCA_ERRMGR_BASE_H
#define ORTE_MCA_ERRMGR_BASE_H
/*
* includes
*/
#include "orte_config.h"
#include "orte/constants.h"
#include "opal/class/opal_list.h"
#include "opal/mca/mca.h"
#include "orte/mca/errmgr/errmgr.h"
BEGIN_C_DECLS
/*
* MCA Framework functions
*
* Only the prototypes are visible in this header: each function takes no
* arguments and returns an int.
* NOTE(review): the return value is presumably an ORTE status code from
* orte/constants.h -- confirm against the implementations.
*/
/* Presumably opens the errmgr framework -- TODO confirm in the implementation. */
ORTE_DECLSPEC int orte_errmgr_base_open(void);
/* Presumably selects the errmgr component(s) to use -- TODO confirm. */
ORTE_DECLSPEC int orte_errmgr_base_select(void);
/* Presumably closes the errmgr framework -- TODO confirm what is released. */
ORTE_DECLSPEC int orte_errmgr_base_close(void);
/**
* Composite Stack states
*
* Every non-zero state below is a distinct single-bit value (0x01, 0x02,
* 0x04, 0x08, 0x10), so several states can be held in one integer by OR-ing
* them together; ORTE_ERRMGR_STACK_STATE_NONE (0x00) has no bits set.
* NOTE(review): the code that sets and tests these values is not part of
* this header -- confirm how the states are combined before relying on it.
*/
#define ORTE_ERRMGR_STACK_STATE_NONE 0x00 /* No actions have been performed */
#define ORTE_ERRMGR_STACK_STATE_UPDATED 0x01 /* Updated the runtime */
#define ORTE_ERRMGR_STACK_STATE_CONTINUE 0x02 /* Continue running without this process */
#define ORTE_ERRMGR_STACK_STATE_RECOVERED 0x04 /* Process has been recovered */
#define ORTE_ERRMGR_STACK_STATE_JOB_ABORT 0x08 /* Abort this job, cannot recover */
#define ORTE_ERRMGR_STACK_STATE_COMPLETE 0x10 /* Done processing this command */
/**
* Output and component variables
*
* orte_errmgr_base_components_available is only declared here (extern); the
* definition is not in this header.
* NOTE(review): judging by the name, this list holds the errmgr components
* that are available for selection -- confirm who populates and releases it.
*/
ORTE_DECLSPEC extern opal_list_t orte_errmgr_base_components_available;
/*
* Additional external API functions are declared in orte/mca/errmgr/errmgr.h
*/
END_C_DECLS
#endif