2007-03-16 23:11:45 +00:00
|
|
|
/*
 * Copyright (c) 2004-2007 The Trustees of Indiana University.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 The Trustees of the University of Tennessee.
 *                         All rights reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart. All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007 Evergrid, Inc. All rights reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
#include "opal_config.h"
|
2007-03-16 23:11:45 +00:00
|
|
|
|
|
|
|
#if HAVE_SYS_TYPES_H
|
|
|
|
#include <sys/types.h>
|
|
|
|
#endif
|
|
|
|
#if HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "opal/mca/mca.h"
|
|
|
|
#include "opal/mca/base/base.h"
|
|
|
|
#include "opal/include/opal/constants.h"
|
|
|
|
#include "opal/util/os_dirpath.h"
|
|
|
|
#include "opal/util/output.h"
|
|
|
|
|
|
|
|
#include "opal/mca/crs/crs.h"
|
|
|
|
#include "opal/mca/crs/base/base.h"
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Constructor for opal_crs_base_snapshot_t.
 *
 * Gives the snapshot a reference name derived from the current PID, a local
 * location built from that name, and a remote location that starts out as a
 * copy of the local one.  component_name is left NULL and cold_start false.
 *
 * Any string that cannot be produced is left NULL; the dependent fields are
 * then left NULL as well instead of being computed from a NULL input.
 */
static void opal_crs_base_construct(opal_crs_base_snapshot_t *snapshot)
{
    snapshot->component_name  = NULL;
    snapshot->reference_name  = opal_crs_base_unique_snapshot_name(getpid());
    snapshot->local_location  = NULL;
    snapshot->remote_location = NULL;
    snapshot->cold_start      = false;

    if (NULL != snapshot->reference_name) {
        snapshot->local_location =
            opal_crs_base_get_snapshot_directory(snapshot->reference_name);
    }

    /* strdup(NULL) is undefined behaviour, so only copy a real string. */
    if (NULL != snapshot->local_location) {
        snapshot->remote_location = strdup(snapshot->local_location);
    }
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Destructor for opal_crs_base_snapshot_t.
 *
 * Releases the four strings owned by the snapshot and resets each pointer
 * to NULL so that a stale value can never be used or freed twice.
 */
static void opal_crs_base_destruct( opal_crs_base_snapshot_t *snapshot)
{
    /* free(NULL) is a no-op, so no per-field NULL test is needed. */
    free(snapshot->reference_name);
    snapshot->reference_name = NULL;

    free(snapshot->local_location);
    snapshot->local_location = NULL;

    free(snapshot->remote_location);
    snapshot->remote_location = NULL;

    free(snapshot->component_name);
    snapshot->component_name = NULL;
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Class instance for opal_crs_base_snapshot_t: parent class is
 * opal_list_item_t, with the constructor and destructor defined above.
 */
OBJ_CLASS_INSTANCE(opal_crs_base_snapshot_t, opal_list_item_t,
                   opal_crs_base_construct, opal_crs_base_destruct);
|
|
|
|
|
2007-03-16 23:11:45 +00:00
|
|
|
int opal_crs_base_none_open(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
int opal_crs_base_none_close(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
int opal_crs_base_none_module_init(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
int opal_crs_base_none_module_finalize(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * "none" checkpoint hook.
 *
 * No checkpoint is taken.  The state is reported as OPAL_CRS_CONTINUE and
 * the snapshot is filled with placeholder values: "none" for the component
 * and reference names, empty strings for both locations, cold_start false.
 *
 * pid      - not used
 * snapshot - snapshot to fill in; strings it currently owns are released
 *            before being replaced
 * state    - receives OPAL_CRS_CONTINUE
 *
 * Always returns OPAL_SUCCESS.
 */
int opal_crs_base_none_checkpoint(pid_t pid, opal_crs_base_snapshot_t *snapshot, opal_crs_state_type_t *state)
{
    *state = OPAL_CRS_CONTINUE;

    /*
     * The snapshot constructor allocates reference_name, local_location and
     * remote_location.  Release whatever the snapshot holds before the
     * pointers are overwritten, otherwise those strings are leaked.
     */
    free(snapshot->component_name);
    free(snapshot->reference_name);
    free(snapshot->local_location);
    free(snapshot->remote_location);

    snapshot->component_name  = strdup("none");
    snapshot->reference_name  = strdup("none");
    snapshot->local_location  = strdup("");
    snapshot->remote_location = strdup("");
    snapshot->cold_start      = false;

    return OPAL_SUCCESS;
}
|
|
|
|
|
|
|
|
/*
 * "none" restart hook.
 *
 * Nothing is restarted: the PID of the calling process is stored in
 * *child_pid and OPAL_SUCCESS is returned.  snapshot and spawn_child are
 * not used.
 */
int opal_crs_base_none_restart(opal_crs_base_snapshot_t *snapshot, bool spawn_child, pid_t *child_pid)
{
    const pid_t self = getpid();

    *child_pid = self;

    return OPAL_SUCCESS;
}
|
|
|
|
|
|
|
|
int opal_crs_base_none_disable_checkpoint(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
|
|
|
int opal_crs_base_none_enable_checkpoint(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2007-10-17 13:47:36 +00:00
|
|
|
int opal_crs_base_none_prelaunch(int32_t rank,
|
|
|
|
char *base_snapshot_dir,
|
|
|
|
char **app,
|
|
|
|
char **cwd,
|
|
|
|
char ***argv,
|
|
|
|
char ***env)
|
|
|
|
{
|
|
|
|
opal_setenv(mca_base_param_env_var("opal_cr_is_tool"),
|
|
|
|
"0", true, env);
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2008-02-19 22:15:52 +00:00
|
|
|
int opal_crs_base_none_reg_thread(void)
|
|
|
|
{
|
|
|
|
return OPAL_SUCCESS;
|
|
|
|
}
|
|
|
|
|
2007-03-16 23:11:45 +00:00
|
|
|
/*
 * Utility functions
 */
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Build the snapshot reference name for a process.
 *
 * pid - process ID to embed in the name
 *
 * Returns a newly allocated string of the form "opal_snapshot_<pid>.ckpt"
 * that the caller must free(), or NULL if the string could not be
 * allocated.
 */
char * opal_crs_base_unique_snapshot_name(pid_t pid)
{
    char *loc_str = NULL;

    /*
     * When asprintf() fails the contents of loc_str are unspecified, so the
     * return value has to be checked instead of trusting the pointer.
     * pid_t is cast because its width is not guaranteed to match "%d".
     */
    if (asprintf(&loc_str, "opal_snapshot_%d.ckpt", (int) pid) < 0) {
        return NULL;
    }

    return loc_str;
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
FILE * opal_crs_base_open_read_metadata(char * location, char **component, int *prev_pid)
|
|
|
|
{
|
2007-03-16 23:11:45 +00:00
|
|
|
char * dir_name = NULL;
|
|
|
|
char * content = NULL;
|
|
|
|
char * tmp_str = NULL;
|
|
|
|
int len = 0;
|
|
|
|
FILE * meta_data = NULL;
|
|
|
|
|
|
|
|
*component = NULL;
|
|
|
|
*prev_pid = -1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the snapshot directory, read the metadata file
|
|
|
|
*/
|
|
|
|
asprintf(&dir_name, "%s/%s", location, opal_crs_base_metadata_filename);
|
|
|
|
if (NULL == (meta_data = fopen(dir_name, "r")) ) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Component Name
|
|
|
|
*/
|
|
|
|
len = 32; /* Max size for a CRS component name */
|
|
|
|
content = (char *) malloc(sizeof(char) * len);
|
|
|
|
if (NULL == fgets(content, len, meta_data) ) {
|
|
|
|
free(content);
|
|
|
|
content = NULL;
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/* Strip of newline */
|
|
|
|
len = strlen(content);
|
|
|
|
content[len - 1] = '\0';
|
|
|
|
|
|
|
|
*component = strdup(content);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the PID
|
|
|
|
*/
|
|
|
|
len = 128;
|
|
|
|
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
|
|
if (NULL == fgets(tmp_str, len, meta_data) ) {
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
/* Strip of newline */
|
|
|
|
len = strlen(tmp_str);
|
|
|
|
if(tmp_str[len - 1] == '\n')
|
|
|
|
tmp_str[len - 1] = '\0';
|
|
|
|
*prev_pid = atoi(tmp_str);
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
return meta_data;
|
|
|
|
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
char * opal_crs_base_extract_expected_component(char *snapshot_loc, int *prev_pid)
|
|
|
|
{
|
2007-03-16 23:11:45 +00:00
|
|
|
FILE * meta_data = NULL;
|
|
|
|
char * component_name = NULL;
|
|
|
|
|
|
|
|
*prev_pid = -1;
|
|
|
|
|
|
|
|
if( NULL == (meta_data = opal_crs_base_open_read_metadata(snapshot_loc, &component_name, prev_pid)) ) {
|
|
|
|
opal_output(opal_crs_base_output,
|
|
|
|
"opal:crs:base: extract_expected_component: Error: Unable to open the file (%s)\n",
|
|
|
|
snapshot_loc);
|
|
|
|
goto cleanup;
|
|
|
|
}
|
|
|
|
|
|
|
|
cleanup:
|
|
|
|
if(NULL != meta_data)
|
|
|
|
fclose(meta_data);
|
|
|
|
|
|
|
|
return component_name;
|
|
|
|
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
|
|
|
{
|
2007-03-16 23:11:45 +00:00
|
|
|
char * dir_name = NULL;
|
|
|
|
|
|
|
|
asprintf(&dir_name, "%s/%s", opal_crs_base_snapshot_dir, uniq_snapshot_name);
|
|
|
|
|
|
|
|
return dir_name;
|
|
|
|
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Create the local directory of a snapshot and initialise its metadata
 * file.
 *
 * snapshot - snapshot whose local_location is to be created
 *
 * Returns OPAL_SUCCESS when both steps succeed, the error code from
 * opal_os_dirpath_create() when the directory cannot be created, or
 * OPAL_ERROR when the metadata file cannot be opened for writing.
 */
int opal_crs_base_init_snapshot_directory(opal_crs_base_snapshot_t *snapshot)
{
    mode_t my_mode = S_IRWXU;
    int ret, exit_status = OPAL_SUCCESS;
    FILE * meta_data = NULL;

    /*
     * Make the snapshot directory from the uniq_snapshot_name
     */
    if (OPAL_SUCCESS != (ret = opal_os_dirpath_create(snapshot->local_location, my_mode)) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Initialize the metadata file at the top of that directory.
     */
    if (NULL == (meta_data = opal_crs_base_open_metadata(snapshot, 'w') ) ) {
        opal_output(opal_crs_base_output,
                    "opal:crs:base: init_snapshot_directory: Error: Unable to open the file (%s/%s)\n",
                    snapshot->local_location, opal_crs_base_metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

 cleanup:
    if (NULL != meta_data) {
        fclose(meta_data);
    }

    /* Propagate the recorded status; failures must reach the caller. */
    return exit_status;
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Open the metadata file that lives in a snapshot's local directory.
 *
 * snapshot - snapshot whose local_location holds the metadata file
 * mode     - 'w' to create/truncate the file and write the two header
 *            lines (component name, then the current PID), or 'a' to
 *            append to an existing file.  Any other value opens nothing
 *            and is reported through the same error path as a failed
 *            fopen().
 *
 * Returns the open stream (the caller must fclose() it), or NULL on
 * failure after logging an error.
 */
FILE *opal_crs_base_open_metadata(opal_crs_base_snapshot_t *snapshot, char mode )
{
    char * meta_data_fname = NULL;
    FILE * meta_data = NULL;

    /*
     * Construct path
     */
    if (asprintf(&meta_data_fname, "%s/%s", snapshot->local_location, opal_crs_base_metadata_filename) < 0) {
        /* The pointer is unspecified after a failed asprintf(): do not use it */
        opal_output(opal_crs_base_output,
                    "opal:crs:base: open_metadata (%c): Error: Unable to build the metadata file name\n",
                    mode);
        return NULL;
    }

    /*
     * Open the metadata file
     */
    if ('w' == mode) {
        meta_data = fopen(meta_data_fname, "w");
    }
    else if ('a' == mode) {
        meta_data = fopen(meta_data_fname, "a");
    }

    if (NULL == meta_data) {
        opal_output(opal_crs_base_output,
                    "opal:crs:base: open_metadata (%c): Error: Unable to open the file (%s)\n",
                    mode, meta_data_fname);
        goto cleanup;
    }

    if ('w' == mode) {
        /*
         * The first line is the component name, the second the PID of the
         * writing process; everything else is defined by the component
         */
        fprintf(meta_data, "%s\n", snapshot->component_name);
        fprintf(meta_data, "%d\n", getpid());
    }

 cleanup:
    free(meta_data_fname);

    return meta_data;
}
|
|
|
|
|
2007-04-01 16:08:27 +00:00
|
|
|
/*
 * Translate a CRS state into a human-readable label.
 *
 * state - state value to describe
 *
 * Returns a newly allocated copy of the label ("Unknown" for any value
 * without a dedicated case); the caller must free() it.
 */
char * opal_crs_base_state_str(opal_crs_state_type_t state)
{
    const char *label = "Unknown";

    switch (state) {
    case OPAL_CRS_CHECKPOINT:
        label = "Checkpoint";
        break;
    case OPAL_CRS_RESTART:
        label = "Restart";
        break;
    case OPAL_CRS_CONTINUE:
        label = "Continue";
        break;
    case OPAL_CRS_TERM:
        label = "Terminate";
        break;
    case OPAL_CRS_RUNNING:
        label = "Running";
        break;
    case OPAL_CRS_ERROR:
        label = "Error";
        break;
    default:
        break;
    }

    return strdup(label);
}
|