a287c9cb65
This commit also cleans up the checkpoint and terminate case making it more precise than before. Previously the application could make a small amount of progress between checkpoint completion and application termination. Now the application will make no progress at all in this time span. Additional minor change: - Start using OPAL_INT_TO_BOOL instead of if/else logic This commit was SVN r16952.
186 строки
6.6 KiB
C
186 строки
6.6 KiB
C
/*
|
|
* Copyright (c) 2004-2007 The Trustees of Indiana University.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
|
|
* All rights reserved.
|
|
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
|
|
* University of Stuttgart. All rights reserved.
|
|
* Copyright (c) 2004-2005 The Regents of the University of California.
|
|
* All rights reserved.
|
|
* Copyright (c) 2007 Evergrid, Inc. All rights reserved.
|
|
*
|
|
* $COPYRIGHT$
|
|
*
|
|
* Additional copyrights may follow
|
|
*
|
|
* $HEADER$
|
|
*/
|
|
|
|
#include "orte_config.h"
|
|
|
|
#include "orte/orte_constants.h"
|
|
#include "opal/mca/mca.h"
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/util/output.h"
|
|
#include "opal/mca/base/mca_base_param.h"
|
|
|
|
#include "orte/mca/snapc/snapc.h"
|
|
#include "orte/mca/snapc/base/base.h"
|
|
|
|
#include "orte/mca/snapc/base/static-components.h"
|
|
|
|
/*
|
|
* Globals
|
|
*/
|
|
int orte_snapc_base_output = -1;
|
|
bool orte_snapc_base_is_tool = false;
|
|
orte_snapc_base_module_t orte_snapc = {
|
|
NULL, /* snapc_init */
|
|
NULL, /* snapc_finalize */
|
|
NULL, /* setup_job */
|
|
NULL /* release_job */
|
|
};
|
|
|
|
opal_list_t orte_snapc_base_components_available;
|
|
orte_snapc_base_component_t orte_snapc_base_selected_component;
|
|
|
|
char * orte_snapc_base_global_snapshot_dir = NULL;
|
|
char * orte_snapc_base_global_snapshot_loc = NULL;
|
|
char * orte_snapc_base_global_snapshot_ref = NULL;
|
|
bool orte_snapc_base_store_in_place = true;
|
|
bool orte_snapc_base_store_only_one_seq = false;
|
|
bool orte_snapc_base_establish_gloabl_snapshot_dir = false;
|
|
|
|
/**
|
|
* Function for finding and opening either all MCA components,
|
|
* or the one that was specifically requested via a MCA parameter.
|
|
*/
|
|
int orte_snapc_base_open(void)
|
|
{
|
|
int value;
|
|
char * str_value = NULL;
|
|
char * home = NULL;
|
|
|
|
/* Debugging/Verbose output */
|
|
mca_base_param_reg_int_name("snapc",
|
|
"base_verbose",
|
|
"Verbosity level of the SNAPC framework",
|
|
false, false,
|
|
0, &value);
|
|
if(0 != value) {
|
|
orte_snapc_base_output = opal_output_open(NULL);
|
|
} else {
|
|
orte_snapc_base_output = -1;
|
|
}
|
|
opal_output_set_verbosity(orte_snapc_base_output, value);
|
|
|
|
/* We may need this later */
|
|
#if !defined(__WINDOWS__)
|
|
home = getenv("HOME");
|
|
#else
|
|
home = getenv("USERPROFILE");
|
|
#endif /* !defined(__WINDOWS__) */
|
|
|
|
/* Global Snapshot directory */
|
|
mca_base_param_reg_string_name("snapc",
|
|
"base_global_snapshot_dir",
|
|
"The base directory to use when storing global snapshots",
|
|
false, false,
|
|
home,
|
|
&orte_snapc_base_global_snapshot_dir);
|
|
/*
|
|
* Store the checkpoint files in their final location.
|
|
* This assumes that the storage place is on a shared file
|
|
* system that all nodes can access uniformly.
|
|
* Default = enabled
|
|
*/
|
|
mca_base_param_reg_int_name("snapc",
|
|
"base_store_in_place",
|
|
"If global_snapshot_dir is on a shared file system all nodes can access, "
|
|
"then the checkpoint files can be stored in place instead of incurring a "
|
|
"remote copy. [Default = enabled]",
|
|
false, false,
|
|
1,
|
|
&value);
|
|
orte_snapc_base_store_in_place = OPAL_INT_TO_BOOL(value);
|
|
|
|
/*
|
|
* Reuse sequence numbers
|
|
* This will create a directory and always use seq 0 for all checkpoints
|
|
* This *should* also enforce a 2-phase commit protocol
|
|
*/
|
|
mca_base_param_reg_int_name("snapc_base",
|
|
"only_one_seq",
|
|
"Only store the most recent checkpoint sequence. [Default = disabled]",
|
|
false, false,
|
|
0,
|
|
&value);
|
|
orte_snapc_base_store_only_one_seq = OPAL_INT_TO_BOOL(value);
|
|
|
|
/*
|
|
* Pre-establish the global snapshot directory upon job registration
|
|
*/
|
|
mca_base_param_reg_int_name("snapc_base",
|
|
"establish_global_snapshot_dir",
|
|
"Establish the global snapshot directory on job startup. [Default = disabled]",
|
|
false, false,
|
|
0,
|
|
&value);
|
|
orte_snapc_base_establish_gloabl_snapshot_dir = OPAL_INT_TO_BOOL(value);
|
|
|
|
/*
|
|
* User defined global snapshot directory name for this job
|
|
*/
|
|
mca_base_param_reg_string_name("snapc_base",
|
|
"global_snapshot_ref",
|
|
"The global snapshot reference to be used for this job. "
|
|
" [Default = ompi_global_snapshot_MPIRUNPID.ckpt]",
|
|
false, false,
|
|
NULL,
|
|
&orte_snapc_base_global_snapshot_ref);
|
|
|
|
|
|
/* Init the sequence (interval) number */
|
|
orte_snapc_base_snapshot_seq_number = 0;
|
|
|
|
if( NULL == orte_snapc_base_global_snapshot_loc ) {
|
|
char *t1 = NULL;
|
|
char *t2 = NULL;
|
|
t1 = strdup( orte_snapc_base_unique_global_snapshot_name( getpid() ) );
|
|
t2 = orte_snapc_base_get_global_snapshot_directory( t1 );
|
|
orte_snapc_base_global_snapshot_loc = strdup(t2);
|
|
free(t1);
|
|
free(t2);
|
|
}
|
|
|
|
|
|
/*
|
|
* Which SnapC component to open
|
|
* - NULL or "" = auto-select
|
|
* - "none" = Empty component
|
|
* - ow. select that specific component
|
|
* Note: Set the default to NULL here so ompi_info will work correctly,
|
|
* The 'real' default is set in base_select.c
|
|
*/
|
|
mca_base_param_reg_string_name("snapc", NULL,
|
|
"Which SNAPC component to use (empty = auto-select)",
|
|
false, false,
|
|
NULL, &str_value);
|
|
if( NULL != str_value ) {
|
|
free(str_value);
|
|
}
|
|
|
|
/* Open up all available components */
|
|
if (OPAL_SUCCESS !=
|
|
mca_base_components_open("snapc",
|
|
orte_snapc_base_output,
|
|
mca_snapc_base_static_components,
|
|
&orte_snapc_base_components_available,
|
|
true)) {
|
|
return ORTE_ERROR;
|
|
}
|
|
|
|
return ORTE_SUCCESS;
|
|
}
|