b9893aacc5
1. file activity - can monitor file size, access and modification times. If these fail to change over a specified number of sampling iterations (rate is an mca param), then the errmgr is notified. 2. memory usage - checks amount of memory used by a process. Limit and sampling rate can be set. This support must be enabled by configuring --enable-sensors. ompi_info and orte-info have been updated to include the new framework. Also includes some initial steps toward restoring the recovery capability. Most notably, the ODLS API has been extended to include a "restart_proc" entry for restarting a local process, and organizes the various ERRMGR framework globals into a single struct as we do in the other ORTE frameworks. Fix an oversight in the ERRMGR framework where a pointer array was constructed, but not initialized. Implementation continues. This commit was SVN r23043.
178 строки
6.2 KiB
C
178 строки
6.2 KiB
C
/*
|
|
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
|
|
* University Research and Technology
|
|
* Corporation. All rights reserved.
|
|
* Copyright (c) 2004-2005 The University of Tennessee and The University
|
|
* of Tennessee Research Foundation. All rights
|
|
* reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
|
|
#include "orte_config.h"
|
|
#include "orte/constants.h"
|
|
|
|
#ifdef HAVE_STRING_H
|
|
#include <string.h>
|
|
#endif
|
|
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
#include "opal/util/output.h"
|
|
|
|
#include "orte/mca/errmgr/base/base.h"
|
|
#include "orte/mca/errmgr/base/errmgr_private.h"
|
|
|
|
|
|
struct orte_errmgr_base_select_module_t {
|
|
mca_base_component_t *component;
|
|
mca_base_module_t *module;
|
|
int priority;
|
|
};
|
|
typedef struct orte_errmgr_base_select_module_t orte_errmgr_base_select_module_t;
|
|
|
|
int orte_errmgr_base_select(void)
|
|
{
|
|
int exit_status = OPAL_SUCCESS;
|
|
mca_base_component_list_item_t *cli = NULL;
|
|
mca_base_component_t *component = NULL;
|
|
mca_base_module_t *module = NULL;
|
|
opal_list_item_t *item = NULL;
|
|
int priority = 0, i, j, low_i;
|
|
orte_errmgr_base_select_module_t *tmp_module = NULL, *tmp_module_sw = NULL;
|
|
opal_pointer_array_t tmp_array;
|
|
orte_errmgr_base_module_t *i_module = NULL;
|
|
bool none_found;
|
|
|
|
OBJ_CONSTRUCT(&tmp_array, opal_pointer_array_t);
|
|
opal_pointer_array_init(&tmp_array, 3, INT_MAX, 1);
|
|
|
|
opal_output_verbose(10, orte_errmgr_base.output,
|
|
"errmgr:base:select: Auto-selecting components");
|
|
|
|
/*
|
|
* Traverse the list of available components.
|
|
* For each call their 'query' functions to determine relative priority.
|
|
*/
|
|
none_found = true;
|
|
for (item = opal_list_get_first(&orte_errmgr_base_components_available);
|
|
item != opal_list_get_end(&orte_errmgr_base_components_available);
|
|
item = opal_list_get_next(item) ) {
|
|
cli = (mca_base_component_list_item_t *) item;
|
|
component = (mca_base_component_t *) cli->cli_component;
|
|
|
|
/*
|
|
* If there is a query function then use it.
|
|
*/
|
|
if (NULL == component->mca_query_component) {
|
|
opal_output_verbose(5, orte_errmgr_base.output,
|
|
"errmgr:base:select Skipping component [%s]. It does not implement a query function",
|
|
component->mca_component_name );
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Query this component for the module and priority
|
|
*/
|
|
opal_output_verbose(5, orte_errmgr_base.output,
|
|
"errmgr:base:select Querying component [%s]",
|
|
component->mca_component_name);
|
|
|
|
component->mca_query_component(&module, &priority);
|
|
|
|
/*
|
|
* If no module was returned or negative priority, then skip component
|
|
*/
|
|
if (NULL == module || priority < 0) {
|
|
opal_output_verbose(5, orte_errmgr_base.output,
|
|
"errmgr:base:select Skipping component [%s]. Query failed to return a module",
|
|
component->mca_component_name );
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* Append them to the temporary list, we will sort later
|
|
*/
|
|
opal_output_verbose(5, orte_errmgr_base.output,
|
|
"errmgr:base:select Query of component [%s] set priority to %d",
|
|
component->mca_component_name, priority);
|
|
tmp_module = (orte_errmgr_base_select_module_t *)malloc(sizeof(orte_errmgr_base_select_module_t));
|
|
tmp_module->component = component;
|
|
tmp_module->module = module;
|
|
tmp_module->priority = priority;
|
|
|
|
opal_pointer_array_add(&tmp_array, (void*)tmp_module);
|
|
none_found = false;
|
|
}
|
|
|
|
if (none_found) {
|
|
/* must have at least one module */
|
|
return ORTE_ERR_MODULE_NOT_FOUND;
|
|
}
|
|
|
|
/*
|
|
* Sort the list by decending priority
|
|
*/
|
|
priority = 0;
|
|
for(j = 0; j < tmp_array.size; ++j) {
|
|
tmp_module_sw = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, j);
|
|
if( NULL == tmp_module_sw ) {
|
|
continue;
|
|
}
|
|
|
|
low_i = -1;
|
|
priority = tmp_module_sw->priority;
|
|
|
|
for(i = 0; i < tmp_array.size; ++i) {
|
|
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, i);
|
|
if( NULL == tmp_module ) {
|
|
continue;
|
|
}
|
|
if( tmp_module->priority > priority ) {
|
|
low_i = i;
|
|
priority = tmp_module->priority;
|
|
}
|
|
}
|
|
|
|
if( low_i >= 0 ) {
|
|
tmp_module = (orte_errmgr_base_select_module_t*)opal_pointer_array_get_item(&tmp_array, low_i);
|
|
opal_pointer_array_set_item(&tmp_array, low_i, NULL);
|
|
j--; /* Try this entry again, if it is not the lowest */
|
|
} else {
|
|
tmp_module = tmp_module_sw;
|
|
opal_pointer_array_set_item(&tmp_array, j, NULL);
|
|
}
|
|
opal_output_verbose(5, orte_errmgr_base.output,
|
|
"errmgr:base:select Add module with priority [%s] %d",
|
|
tmp_module->component->mca_component_name, tmp_module->priority);
|
|
opal_pointer_array_add(&orte_errmgr_base.modules, (void*)(tmp_module->module));
|
|
free(tmp_module);
|
|
}
|
|
OBJ_DESTRUCT(&tmp_array);
|
|
|
|
/*
|
|
* Initialize each of the Errmgr Modules
|
|
*/
|
|
for(i = 0; i < orte_errmgr_base.modules.size; ++i) {
|
|
i_module = (orte_errmgr_base_module_t*)opal_pointer_array_get_item(&orte_errmgr_base.modules, i);
|
|
if( NULL == i_module ) {
|
|
continue;
|
|
}
|
|
if( NULL != i_module->init ) {
|
|
i_module->init();
|
|
}
|
|
}
|
|
|
|
return exit_status;
|
|
}
|