1
1
openmpi/orte/mca/errmgr/base/errmgr_base_select.c
Ralph Castain b9893aacc5 Add a sensor framework to ORTE that monitors applications and notifies the errmgr when they exceed specified boundaries. Two modules are included here:
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (rate is an mca param), then the errmgr is notified.

2. memory usage - checks amount of memory used by a process. Limit and sampling rate can be set.

This support must be enabled by configuring --enable-sensors.

ompi_info and orte-info have been updated to include the new framework.

Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and the various ERRMGR framework globals are now organized into a single struct, as we do in the other ORTE frameworks. This commit also fixes an oversight in the ERRMGR framework where a pointer array was constructed but not initialized.

Implementation continues.

This commit was SVN r23043.
2010-04-26 22:15:57 +00:00

178 lines
6.2 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "orte/constants.h"
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
/*
 * Bookkeeping record for one candidate produced while querying components:
 * the component that was queried, the module it handed back, and the
 * priority it reported. Used only while ordering the candidates.
 */
typedef struct orte_errmgr_base_select_module_t {
    mca_base_component_t *component;  /* component that was queried */
    mca_base_module_t    *module;     /* module returned by its query function */
    int                   priority;   /* priority returned by its query function */
} orte_errmgr_base_select_module_t;
int orte_errmgr_base_select(void)
{
int exit_status = OPAL_SUCCESS;
mca_base_component_list_item_t *cli = NULL;
mca_base_component_t *component = NULL;
mca_base_module_t *module = NULL;
opal_list_item_t *item = NULL;
int priority = 0, i, j, low_i;
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
opal_pointer_array_t tmp_array;
orte_errmgr_base_module_t *i_module = NULL;
bool none_found;
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
opal_output_verbose(10, orte_errmgr_base.output,
"errmgr:base:select: Auto-selecting components");
/*
* Traverse the list of available components.
* For each call their 'query' functions to determine relative priority.
*/
none_found = true;
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
item != opal_list_get_end(&orte_errmgr_base_components_available);
item = opal_list_get_next(item) ) {
cli = (mca_base_component_list_item_t *) item;
component = (mca_base_component_t *) cli->cli_component;
/*
* If there is a query function then use it.
*/
if (NULL == component->mca_query_component) {
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
component->mca_component_name );
continue;
}
/*
* Query this component for the module and priority
*/
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Querying component [%s]",
component->mca_component_name);
component->mca_query_component(&module, &priority);
/*
* If no module was returned or negative priority, then skip component
*/
if (NULL == module || priority < 0) {
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
component->mca_component_name );
continue;
}
/*
* Append them to the temporary list, we will sort later
*/
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Query of component [%s] set priority to %d",
component->mca_component_name, priority);
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
tmp_module->component = component;
tmp_module->module = module;
tmp_module->priority = priority;
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
none_found = false;
}
if (none_found) {
/* must have at least one module */
return ORTE_ERR_MODULE_NOT_FOUND;
}
/*
* Sort the list by decending priority
*/
priority = 0;
for(j = 0; j < tmp_array.size; ++j) {
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
if( NULL == tmp_module_sw ) {
continue;
}
low_i = -1;
priority = tmp_module_sw->priority;
for(i = 0; i < tmp_array.size; ++i) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
if( NULL == tmp_module ) {
continue;
}
if( tmp_module->priority > priority ) {
low_i = i;
priority = tmp_module->priority;
}
}
if( low_i >= 0 ) {
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
j--; /* Try this entry again, if it is not the lowest */
} else {
tmp_module = tmp_module_sw;
opal_pointer_array_set_item(&tmp_array, j, NULL);
}
opal_output_verbose(5, orte_errmgr_base.output,
"errmgr:base:select Add module with priority [%s] %d",
tmp_module->component->mca_component_name, tmp_module->priority);
opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
free(tmp_module);
}
OBJ_DESTRUCT(&tmp_array);
/*
* Initialize each of the Errmgr Modules
*/
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
if( NULL == i_module ) {
continue;
}
if( NULL != i_module->init ) {
i_module->init();
}
}
return exit_status;
}