1
1
openmpi/orte/mca/errmgr/hnp/errmgr_hnp_component.c
Josh Hursey fabd5cc153 Simplification of the ErrMgr framework by removing the 'stack'/composite functionality.
The composite functionality was becoming difficult to maintain, so we removed it for now which simplifies the framework design considerably.

Since the 'crmig' and 'autor' components were -very- similar to the 'hnp' component, this commit also merges them together. By moving the 'crmig' and 'autor' to a separate file under the 'hnp' component we are able to isolate the C/R logic to a large extent, thus being only minimally hooked into the previous 'hnp' component.

So other than some name changes, the functionality is all still in place. I will update the C/R documentation later this morning.

This commit was SVN r23628.
2010-08-19 13:09:20 +00:00

202 lines
7.8 KiB
C

/*
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
#include "orte_config.h"
#include "opal/util/output.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/errmgr/base/base.h"
#include "orte/mca/errmgr/base/errmgr_private.h"
#include "errmgr_hnp.h"
/* Publicly visible, human-readable version banner for the hnp ERRMGR component. */
const char *orte_errmgr_hnp_component_version_string = "ORTE ERRMGR Hnp MCA component version " ORTE_VERSION;
/*
 * Local functionality
 *
 * Forward declarations of the component open/close hooks. They are
 * file-local (static) and are referenced by the component descriptor
 * defined below, before their definitions appear.
 */
static int orte_errmgr_hnp_open(void);
static int orte_errmgr_hnp_close(void);
/*
 * Public component descriptor for the "hnp" ERRMGR component.
 *
 * The initializer is positional, so the order of the entries below must
 * follow the layout of the component structures. Only the leading base
 * part is initialized explicitly; any remaining members of
 * orte_errmgr_hnp_component_t are zero-initialized by the C rules for
 * aggregate initializers (the C/R related ones are assigned in
 * orte_errmgr_hnp_open()).
 */
orte_errmgr_hnp_component_t mca_errmgr_hnp_component = {
/* First do the base component stuff */
{
/* Handle the general mca_component_t struct containing
 * meta information about the component hnp
 */
{
/* Framework/version identification supplied by the ERRMGR base macro */
ORTE_ERRMGR_BASE_VERSION_3_0_0,
/* Component name and version */
"hnp",
ORTE_MAJOR_VERSION,
ORTE_MINOR_VERSION,
ORTE_RELEASE_VERSION,
/* Component open and close functions, followed by the query function.
 * NOTE(review): orte_errmgr_hnp_component_query is not defined in this
 * file; presumably declared in errmgr_hnp.h -- confirm.
 */
orte_errmgr_hnp_open,
orte_errmgr_hnp_close,
orte_errmgr_hnp_component_query
},
{
/* The component is checkpoint ready */
MCA_BASE_METADATA_PARAM_CHECKPOINT
},
/* Verbosity level (default; also used as the default of the "verbose"
 * parameter registered in orte_errmgr_hnp_open()) */
0,
/* opal_output handler (-1 until orte_errmgr_hnp_open() assigns one) */
-1,
/* Default priority (also used as the default of the "priority"
 * parameter registered in orte_errmgr_hnp_open()) */
50
}
};
#if OPAL_ENABLE_FT_CR
/*
 * Register one integer MCA parameter of the hnp component and return
 * its value.
 *
 * name          - parameter name (without the framework/component prefix)
 * help          - help string shown for the parameter
 * default_value - value used when the parameter is not set
 *
 * The result is seeded with default_value so the caller always gets a
 * well-defined value, even if the registration call does not write its
 * output argument.
 * NOTE(review): assumes mca_base_param_reg_int() can leave the output
 * argument untouched when it fails -- confirm against the MCA base.
 */
static int orte_errmgr_hnp_reg_int(const char *name, const char *help,
                                   int default_value)
{
    int value = default_value;

    mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
                           name, help,
                           false, false,
                           default_value, &value);
    return value;
}
#endif /* OPAL_ENABLE_FT_CR */

/*
 * Component open function.
 *
 * Registers the "priority" and "verbose" MCA parameters (defaults are
 * the values currently stored in the component descriptor), selects the
 * output stream used for this component's diagnostics and, when built
 * with OPAL_ENABLE_FT_CR, registers the process-migration (crmig) and
 * automatic-recovery (autor) options and resets the in-progress flags.
 *
 * Returns ORTE_SUCCESS unconditionally.
 */
static int orte_errmgr_hnp_open(void)
{
#if OPAL_ENABLE_FT_CR
    int val;
#endif

    /*
     * The storage for these two parameters is the descriptor field that
     * also provides the default, so the field keeps a sane value no
     * matter what the registration call does.
     */
    mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
                           "priority",
                           "Priority of the ERRMGR hnp component",
                           false, false,
                           mca_errmgr_hnp_component.super.priority,
                           &mca_errmgr_hnp_component.super.priority);

    mca_base_param_reg_int(&mca_errmgr_hnp_component.super.base_version,
                           "verbose",
                           "Verbose level for the ERRMGR hnp component",
                           false, false,
                           mca_errmgr_hnp_component.super.verbose,
                           &mca_errmgr_hnp_component.super.verbose);

    /* If there is a custom verbose level for this component then use it,
     * otherwise take our parent's level and output channel.
     */
    if (0 != mca_errmgr_hnp_component.super.verbose) {
        mca_errmgr_hnp_component.super.output_handle = opal_output_open(NULL);
        opal_output_set_verbosity(mca_errmgr_hnp_component.super.output_handle,
                                  mca_errmgr_hnp_component.super.verbose);
    } else {
        mca_errmgr_hnp_component.super.output_handle = orte_errmgr_base.output;
    }

#if OPAL_ENABLE_FT_CR
    /****************************
     * CRMig (C/R Process Migration) MCA Options
     ****************************/
    val = orte_errmgr_hnp_reg_int("crmig_timing",
                                  "Enable Process Migration timer",
                                  0);
    mca_errmgr_hnp_component.crmig_timing_enabled = OPAL_INT_TO_BOOL(val);

    val = orte_errmgr_hnp_reg_int("crmig_enable",
                                  "Enable Process Migration (Default: 0/off)",
                                  0);
    mca_errmgr_hnp_component.crmig_enabled = OPAL_INT_TO_BOOL(val);

    /****************************
     * AutoR (Automatic Recovery) MCA Options
     ****************************/
    val = orte_errmgr_hnp_reg_int("autor_timing",
                                  "Enable Automatic Recovery timer",
                                  0);
    mca_errmgr_hnp_component.autor_timing_enabled = OPAL_INT_TO_BOOL(val);

    val = orte_errmgr_hnp_reg_int("autor_enable",
                                  "Enable Automatic Recovery (Default: 0/off)",
                                  0);
    mca_errmgr_hnp_component.autor_enabled = OPAL_INT_TO_BOOL(val);

    val = orte_errmgr_hnp_reg_int("recovery_delay",
                                  "Number of seconds to wait before starting to recover the job after a failure"
                                  " [Default: 1 sec]",
                                  1);
    mca_errmgr_hnp_component.autor_recovery_delay = val;

    val = orte_errmgr_hnp_reg_int("skip_oldnode",
                                  "Skip the old node from failed proc, even if it is still available"
                                  " [Default: Enabled]",
                                  1);
    mca_errmgr_hnp_component.autor_skip_oldnode = OPAL_INT_TO_BOOL(val);

    /* Nothing is in flight when the component is first opened. */
    mca_errmgr_hnp_component.crmig_in_progress = false;
    mca_errmgr_hnp_component.autor_in_progress = false;
    mca_errmgr_hnp_component.term_in_progress = false;
#endif /* OPAL_ENABLE_FT_CR */

    /*
     * Debug Output
     */
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open()");
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: priority = %d",
                        mca_errmgr_hnp_component.super.priority);
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: verbosity = %d",
                        mca_errmgr_hnp_component.super.verbose);
#if OPAL_ENABLE_FT_CR
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: --- CR Migration Options ---");
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: Process Migration = %s",
                        (mca_errmgr_hnp_component.crmig_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: timing = %s",
                        (mca_errmgr_hnp_component.crmig_timing_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: --- Auto. Recovery Options ---");
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: Auto. Recover = %s",
                        (mca_errmgr_hnp_component.autor_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: timing = %s",
                        (mca_errmgr_hnp_component.autor_timing_enabled ? "Enabled" : "Disabled"));
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: recover_delay = %d",
                        mca_errmgr_hnp_component.autor_recovery_delay);
    /* Report every registered option, including skip_oldnode. */
    opal_output_verbose(20, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: open: skip_oldnode = %s",
                        (mca_errmgr_hnp_component.autor_skip_oldnode ? "Enabled" : "Disabled"));
#endif /* OPAL_ENABLE_FT_CR */

    return ORTE_SUCCESS;
}
/*
 * Component close function.
 *
 * Emits a debug trace and releases the private output stream that
 * orte_errmgr_hnp_open() creates when a custom verbosity level was
 * requested, so that the stream is not leaked. The output stream
 * borrowed from the ERRMGR base is never closed here.
 *
 * Returns ORTE_SUCCESS unconditionally.
 */
static int orte_errmgr_hnp_close(void)
{
    opal_output_verbose(10, mca_errmgr_hnp_component.super.output_handle,
                        "errmgr:hnp: close()");

    /*
     * Same condition as in open(): a non-zero verbose level means the
     * handle came from opal_output_open() and is owned by this component.
     * The handle check guards against a failed open and repeated closes.
     */
    if (0 != mca_errmgr_hnp_component.super.verbose &&
        0 <= mca_errmgr_hnp_component.super.output_handle) {
        opal_output_close(mca_errmgr_hnp_component.super.output_handle);
        mca_errmgr_hnp_component.super.output_handle = -1;
    }

    return ORTE_SUCCESS;
}