8b8bee04d6
This patch contains the following items:
 * Fix the flag passed to open() for the read side of the named pipe between the local and app coordinator. There is a race condition when using O_RDWR on a named pipe (not sure how that bug got in there in the first place).
 * Adjust control in the C/R thread timing.
 * Clarify return code in BLCR component.
 * Allow the user to adjust the max wait time for the named pipes in the FileM local coordinator by using the MCA parameter "snapc_full_max_wait_time" (Default: 20 seconds).
 * If the application terminates while there are active FileM operations, force mpirun to wait on these operations to complete.
 * Allow the user to set the local copy command (Default: cp) via MCA parameter "filem_rsh_cp".
 * Implement the ability to throttle the number of outgoing connections in FileM. At larger scales this type of explicit throttling helps prevent overwhelming the HNP machine. Default: 10, set via MCA parameter "filem_rsh_max_outgoing".

This commit was SVN r21167.

The following SVN revision numbers were found above: r21131 --> open-mpi/ompi@0deb009225
169 строки
6.1 KiB
C
169 строки
6.1 KiB
C
/*
|
|
* Copyright (c) 2004-2009 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
#include "opal/util/output.h"
|
|
|
|
#include "orte/mca/snapc/snapc.h"
|
|
#include "orte/mca/snapc/base/base.h"
|
|
#include "snapc_full.h"
|
|
|
|
/*
|
|
* Public string for version number
|
|
*/
|
|
const char *orte_snapc_full_component_version_string =
|
|
"ORTE SNAPC full MCA component version " OMPI_VERSION;
|
|
|
|
/*
 * File-local entry points.  Both are referenced from the component
 * descriptor below as the component's open and close functions.
 */
static int snapc_full_open(void);
static int snapc_full_close(void);
|
|
|
|
/*
 * Component-wide settings.  The values here are the compiled-in defaults;
 * snapc_full_open() overwrites each one from its MCA parameter.
 */

/* "skip_filem" parameter: debugging aid, pretend to move files. */
bool orte_snapc_full_skip_filem     = false;

/* "skip_app" parameter: debugging aid, shortcut app level coordination. */
bool orte_snapc_full_skip_app       = false;

/* "enable_timing" parameter: enable timing information. */
bool orte_snapc_full_timing_enabled = false;

/* "max_wait_time" parameter: seconds before orted gives up on a checkpoint. */
int  orte_snapc_full_max_wait_time  = 20;
|
|
|
|
/*
|
|
* Instantiate the public struct with all of our public information
|
|
* and pointer to our public functions in it
|
|
*/
|
|
orte_snapc_full_component_t mca_snapc_full_component = {
|
|
/* First do the base component stuff */
|
|
{
|
|
/* Handle the general mca_component_t struct containing
|
|
* meta information about the component itfull
|
|
*/
|
|
{
|
|
ORTE_SNAPC_BASE_VERSION_2_0_0,
|
|
/* Component name and version */
|
|
"full",
|
|
OMPI_MAJOR_VERSION,
|
|
OMPI_MINOR_VERSION,
|
|
OMPI_RELEASE_VERSION,
|
|
|
|
/* Component open and close functions */
|
|
snapc_full_open,
|
|
snapc_full_close,
|
|
orte_snapc_full_component_query
|
|
},
|
|
{
|
|
/* The component is checkpoint ready */
|
|
MCA_BASE_METADATA_PARAM_CHECKPOINT
|
|
},
|
|
|
|
/* Verbosity level */
|
|
0,
|
|
/* opal_output handler */
|
|
-1,
|
|
/* Default priority */
|
|
20
|
|
}
|
|
};
|
|
|
|
static int snapc_full_open(void)
|
|
{
|
|
int value;
|
|
|
|
/*
|
|
* This should be the last componet to ever get used since
|
|
* it doesn't do anything.
|
|
*/
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"priority",
|
|
"Priority of the SNAPC full component",
|
|
false, false,
|
|
mca_snapc_full_component.super.priority,
|
|
&mca_snapc_full_component.super.priority);
|
|
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"verbose",
|
|
"Verbose level for the SNAPC full component",
|
|
false, false,
|
|
mca_snapc_full_component.super.verbose,
|
|
&mca_snapc_full_component.super.verbose);
|
|
/* If there is a custom verbose level for this component than use it
|
|
* otherwise take our parents level and output channel
|
|
*/
|
|
if ( 0 != mca_snapc_full_component.super.verbose) {
|
|
mca_snapc_full_component.super.output_handle = opal_output_open(NULL);
|
|
opal_output_set_verbosity(mca_snapc_full_component.super.output_handle,
|
|
mca_snapc_full_component.super.verbose);
|
|
} else {
|
|
mca_snapc_full_component.super.output_handle = orte_snapc_base_output;
|
|
}
|
|
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"skip_filem",
|
|
"Not for general use! For debugging only! Pretend to move files. [Default = disabled]",
|
|
false, false,
|
|
0,
|
|
&value);
|
|
orte_snapc_full_skip_filem = OPAL_INT_TO_BOOL(value);
|
|
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"skip_app",
|
|
"Not for general use! For debugging only! Shortcut app level coord. [Default = disabled]",
|
|
false, false,
|
|
0,
|
|
&value);
|
|
orte_snapc_full_skip_app = OPAL_INT_TO_BOOL(value);
|
|
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"enable_timing",
|
|
"Enable timing information. [Default = disabled]",
|
|
false, false,
|
|
0,
|
|
&value);
|
|
orte_snapc_full_timing_enabled = OPAL_INT_TO_BOOL(value);
|
|
|
|
mca_base_param_reg_int(&mca_snapc_full_component.super.base_version,
|
|
"max_wait_time",
|
|
"Wait time before orted gives up on checkpoint (seconds)",
|
|
false, false,
|
|
20,
|
|
&orte_snapc_full_max_wait_time);
|
|
|
|
/*
|
|
* Debug Output
|
|
*/
|
|
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: open()");
|
|
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: open: priority = %d",
|
|
mca_snapc_full_component.super.priority);
|
|
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: open: verbosity = %d",
|
|
mca_snapc_full_component.super.verbose);
|
|
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: open: max_wait_time = %d",
|
|
orte_snapc_full_max_wait_time);
|
|
opal_output_verbose(20, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: open: skip_filem = %s",
|
|
(orte_snapc_full_skip_filem == true ? "True" : "False"));
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|
|
|
|
static int snapc_full_close(void)
|
|
{
|
|
opal_output_verbose(10, mca_snapc_full_component.super.output_handle,
|
|
"snapc:full: close()");
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|