Some checkpoint/restart cleanup.
* Remove the opal_only option. This was suffering from bit rot, and no one uses it. It can be added back fairly easily if wanted. * Clean up metadata interactions at the local level. * Touch up some of the INC functionality (fix typos and a minor ordering issue). This commit was SVN r18416.
Этот коммит содержится в:
родитель
8739edc580
Коммит
da2f1c58e2
@ -35,6 +35,7 @@
|
|||||||
#include "ompi/mca/pml/pml.h"
|
#include "ompi/mca/pml/pml.h"
|
||||||
#include "ompi/mca/pml/base/base.h"
|
#include "ompi/mca/pml/base/base.h"
|
||||||
#include "orte/mca/rml/rml.h"
|
#include "orte/mca/rml/rml.h"
|
||||||
|
#include "orte/mca/grpcomm/grpcomm.h"
|
||||||
#include "ompi/proc/proc.h"
|
#include "ompi/proc/proc.h"
|
||||||
|
|
||||||
#include "bml_r2.h"
|
#include "bml_r2.h"
|
||||||
@ -93,18 +94,6 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
* preparation for being shut down.
|
* preparation for being shut down.
|
||||||
*/
|
*/
|
||||||
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
|
for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
|
||||||
/*
|
|
||||||
* Notify BTL
|
|
||||||
*/
|
|
||||||
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
|
|
||||||
opal_output_verbose(10, ompi_cr_output,
|
|
||||||
"bml:r2: ft_event: Notify the %s BTL.\n",
|
|
||||||
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
|
|
||||||
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Notify Mpool
|
* Notify Mpool
|
||||||
*/
|
*/
|
||||||
@ -117,6 +106,18 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Notify BTL
|
||||||
|
*/
|
||||||
|
if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
|
||||||
|
opal_output_verbose(10, ompi_cr_output,
|
||||||
|
"bml:r2: ft_event: Notify the %s BTL.\n",
|
||||||
|
(mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
|
||||||
|
if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,7 +141,7 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
}
|
}
|
||||||
|
|
||||||
opal_output_verbose(10, ompi_cr_output,
|
opal_output_verbose(10, ompi_cr_output,
|
||||||
"bml:r2: ft_event(reboot): Reselect BTLs\n");
|
"bml:r2: ft_event(Restart): Reselect BTLs\n");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Close the BTLs
|
* Close the BTLs
|
||||||
@ -157,6 +158,20 @@ int mca_bml_r2_ft_event(int state)
|
|||||||
|
|
||||||
}
|
}
|
||||||
else if(OPAL_CRS_RESTART == state ) {
|
else if(OPAL_CRS_RESTART == state ) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Barrier to make sure all processes have been successfully restarted before
|
||||||
|
* we try to remove some restart only files.
|
||||||
|
*/
|
||||||
|
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||||
|
opal_output(0, "bml:r2: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_output_verbose(10, ompi_cr_output,
|
||||||
|
"bml:r2: ft_event(Restart): Cleanup restart files\n");
|
||||||
|
opal_crs_base_cleanup_flush();
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-open the BTL framework to get the full list of components.
|
* Re-open the BTL framework to get the full list of components.
|
||||||
*/
|
*/
|
||||||
|
@ -542,13 +542,13 @@ int mca_pml_ob1_ft_event( int state )
|
|||||||
* Add the new procs (BTLs redo modex recv's)
|
* Add the new procs (BTLs redo modex recv's)
|
||||||
*/
|
*/
|
||||||
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
|
if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
|
||||||
opal_output(0, "pml:ob1: fr_event(Restart): Failed in add_procs (%d)", ret);
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Is this barrier necessary ? JJH */
|
/* Is this barrier necessary ? JJH */
|
||||||
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
|
||||||
opal_output(0, "pml:ob1: fr_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -281,6 +281,9 @@ int ompi_proc_refresh(void) {
|
|||||||
/* Does not change: proc->proc_name.vpid */
|
/* Does not change: proc->proc_name.vpid */
|
||||||
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
proc->proc_name.jobid = ORTE_PROC_MY_NAME->jobid;
|
||||||
|
|
||||||
|
/* Make sure to clear the local flag before we set it below */
|
||||||
|
proc->proc_flags = 0;
|
||||||
|
|
||||||
if (i == ORTE_PROC_MY_NAME->vpid) {
|
if (i == ORTE_PROC_MY_NAME->vpid) {
|
||||||
ompi_proc_local_proc = proc;
|
ompi_proc_local_proc = proc;
|
||||||
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
|
proc->proc_flags |= OMPI_PROC_FLAG_LOCAL;
|
||||||
|
@ -32,6 +32,14 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Some local strings to use generically with the local metadata file */
|
||||||
|
#define CRS_METADATA_BASE ("# ")
|
||||||
|
#define CRS_METADATA_COMP ("# Component: ")
|
||||||
|
#define CRS_METADATA_PID ("# PID: ")
|
||||||
|
#define CRS_METADATA_CONTEXT ("# CONTEXT: ")
|
||||||
|
#define CRS_METADATA_MKDIR ("# MKDIR: ")
|
||||||
|
#define CRS_METADATA_TOUCH ("# TOUCH: ")
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initialize the CRS MCA framework
|
* Initialize the CRS MCA framework
|
||||||
*
|
*
|
||||||
@ -106,24 +114,32 @@ extern "C" {
|
|||||||
OPAL_DECLSPEC char * opal_crs_base_state_str(opal_crs_state_type_t state);
|
OPAL_DECLSPEC char * opal_crs_base_state_str(opal_crs_state_type_t state);
|
||||||
|
|
||||||
OPAL_DECLSPEC char * opal_crs_base_unique_snapshot_name(pid_t pid);
|
OPAL_DECLSPEC char * opal_crs_base_unique_snapshot_name(pid_t pid);
|
||||||
OPAL_DECLSPEC char * opal_crs_base_extract_expected_component(char *snapshot_loc, int *prev_pid);
|
OPAL_DECLSPEC int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid);
|
||||||
OPAL_DECLSPEC int opal_crs_base_init_snapshot_directory(opal_crs_base_snapshot_t *snapshot);
|
OPAL_DECLSPEC int opal_crs_base_init_snapshot_directory(opal_crs_base_snapshot_t *snapshot);
|
||||||
OPAL_DECLSPEC char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name);
|
OPAL_DECLSPEC char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name);
|
||||||
|
|
||||||
/* Opens the metadata file and places all the base information in the file.
|
/*
|
||||||
* Options:
|
* Read a token from the metadata file
|
||||||
* 'w' = Open for writing
|
* NULL can be passed for snapshot_loc if init_snapshot_directory has been called.
|
||||||
* 'a' = Open for writing and appending information
|
|
||||||
*/
|
*/
|
||||||
OPAL_DECLSPEC FILE *opal_crs_base_open_metadata(opal_crs_base_snapshot_t *snapshot, char mode );
|
OPAL_DECLSPEC int opal_crs_base_metadata_read_token(char *snapshot_loc, char * token, char ***value);
|
||||||
|
|
||||||
/* Open the metadata file, read off the base information and
|
/*
|
||||||
* return the component and previous pid to the caller.
|
* Write a token to the metadata file
|
||||||
* Note: component is allocated inside this function, it is the
|
* NULL can be passed for snapshot_loc if init_snapshot_directory has been called.
|
||||||
* callers responsibility to free this memory.
|
|
||||||
*/
|
*/
|
||||||
OPAL_DECLSPEC FILE * opal_crs_base_open_read_metadata(char *location, char **component, int *prev_pid);
|
OPAL_DECLSPEC int opal_crs_base_metadata_write_token(char *snapshot_loc, char * token, char *value);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Register a file for cleanup.
|
||||||
|
* Useful in C/R when files only need to temporarily exist for restart
|
||||||
|
*/
|
||||||
|
OPAL_DECLSPEC int opal_crs_base_cleanup_append(char* filename, bool is_dir);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Flush the cleanup of all registered files.
|
||||||
|
*/
|
||||||
|
OPAL_DECLSPEC int opal_crs_base_cleanup_flush(void);
|
||||||
|
|
||||||
#if defined(c_plusplus) || defined(__cplusplus)
|
#if defined(c_plusplus) || defined(__cplusplus)
|
||||||
}
|
}
|
||||||
|
@ -24,16 +24,36 @@
|
|||||||
#if HAVE_UNISTD_H
|
#if HAVE_UNISTD_H
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAVE_FCNTL_H
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif /* HAVE_FCNTL_H */
|
||||||
|
#ifdef HAVE_SYS_STAT_H
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "opal/mca/mca.h"
|
#include "opal/mca/mca.h"
|
||||||
#include "opal/mca/base/base.h"
|
#include "opal/mca/base/base.h"
|
||||||
#include "opal/include/opal/constants.h"
|
#include "opal/include/opal/constants.h"
|
||||||
#include "opal/util/os_dirpath.h"
|
#include "opal/util/os_dirpath.h"
|
||||||
#include "opal/util/output.h"
|
#include "opal/util/output.h"
|
||||||
|
#include "opal/util/argv.h"
|
||||||
|
|
||||||
#include "opal/mca/crs/crs.h"
|
#include "opal/mca/crs/crs.h"
|
||||||
#include "opal/mca/crs/base/base.h"
|
#include "opal/mca/crs/base/base.h"
|
||||||
|
|
||||||
|
/******************
|
||||||
|
* Local Functions
|
||||||
|
******************/
|
||||||
|
static int metadata_extract_next_token(FILE *file, char **token, char **value);
|
||||||
|
static int opal_crs_base_metadata_open(FILE ** meta_data, char * location, char * mode);
|
||||||
|
|
||||||
|
static char *last_metadata_file = NULL;
|
||||||
|
static char **cleanup_file_argv = NULL;
|
||||||
|
static char **cleanup_dir_argv = NULL;
|
||||||
|
|
||||||
|
/******************
|
||||||
|
* Object stuff
|
||||||
|
******************/
|
||||||
static void opal_crs_base_construct(opal_crs_base_snapshot_t *snapshot)
|
static void opal_crs_base_construct(opal_crs_base_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
snapshot->component_name = NULL;
|
snapshot->component_name = NULL;
|
||||||
@ -198,78 +218,112 @@ char * opal_crs_base_unique_snapshot_name(pid_t pid)
|
|||||||
return loc_str;
|
return loc_str;
|
||||||
}
|
}
|
||||||
|
|
||||||
FILE * opal_crs_base_open_read_metadata(char * location, char **component, int *prev_pid)
|
int opal_crs_base_metadata_read_token(char *snapshot_loc, char * token, char ***value) {
|
||||||
{
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
char * dir_name = NULL;
|
|
||||||
char * content = NULL;
|
|
||||||
char * tmp_str = NULL;
|
|
||||||
int len = 0;
|
|
||||||
FILE * meta_data = NULL;
|
FILE * meta_data = NULL;
|
||||||
|
char * loc_token = NULL;
|
||||||
|
char * loc_value = NULL;
|
||||||
|
int argc = 0;
|
||||||
|
|
||||||
*component = NULL;
|
/* Dummy check */
|
||||||
*prev_pid = -1;
|
if( NULL == token ) {
|
||||||
|
|
||||||
/*
|
|
||||||
* Find the snapshot directory, read the metadata file
|
|
||||||
*/
|
|
||||||
asprintf(&dir_name, "%s/%s", location, opal_crs_base_metadata_filename);
|
|
||||||
if (NULL == (meta_data = fopen(dir_name, "r")) ) {
|
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Component Name
|
* Open the metadata file
|
||||||
*/
|
*/
|
||||||
len = 32; /* Max size for a CRS component name */
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_open(&meta_data, snapshot_loc, "r")) ) {
|
||||||
content = (char *) malloc(sizeof(char) * len);
|
|
||||||
if (NULL == fgets(content, len, meta_data) ) {
|
|
||||||
free(content);
|
|
||||||
content = NULL;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* Strip of newline */
|
|
||||||
len = strlen(content);
|
|
||||||
content[len - 1] = '\0';
|
|
||||||
|
|
||||||
*component = strdup(content);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Get the PID
|
|
||||||
*/
|
|
||||||
len = 128;
|
|
||||||
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
||||||
if (NULL == fgets(tmp_str, len, meta_data) ) {
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* Strip of newline */
|
|
||||||
len = strlen(tmp_str);
|
|
||||||
if(tmp_str[len - 1] == '\n')
|
|
||||||
tmp_str[len - 1] = '\0';
|
|
||||||
*prev_pid = atoi(tmp_str);
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
return meta_data;
|
|
||||||
}
|
|
||||||
|
|
||||||
char * opal_crs_base_extract_expected_component(char *snapshot_loc, int *prev_pid)
|
|
||||||
{
|
|
||||||
FILE * meta_data = NULL;
|
|
||||||
char * component_name = NULL;
|
|
||||||
|
|
||||||
*prev_pid = -1;
|
|
||||||
|
|
||||||
if( NULL == (meta_data = opal_crs_base_open_read_metadata(snapshot_loc, &component_name, prev_pid)) ) {
|
|
||||||
opal_output(opal_crs_base_output,
|
opal_output(opal_crs_base_output,
|
||||||
"opal:crs:base: extract_expected_component: Error: Unable to open the file (%s)\n",
|
"opal:crs:base: opal_crs_base_metadata_read_token: Error: Unable to open the metadata file\n");
|
||||||
snapshot_loc);
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
/*
|
||||||
if(NULL != meta_data)
|
* Extract each token and make the records
|
||||||
fclose(meta_data);
|
*/
|
||||||
|
do {
|
||||||
|
/* Get next token */
|
||||||
|
if( OPAL_SUCCESS != metadata_extract_next_token(meta_data, &loc_token, &loc_value) ) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
return component_name;
|
/* Check token to see if it matches */
|
||||||
|
if(0 == strncmp(token, loc_token, strlen(loc_token)) ) {
|
||||||
|
opal_argv_append(&argc, value, loc_value);
|
||||||
|
}
|
||||||
|
} while(0 == feof(meta_data) );
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if(NULL != meta_data) {
|
||||||
|
fclose(meta_data);
|
||||||
|
meta_data = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return exit_status;
|
||||||
|
}
|
||||||
|
|
||||||
|
int opal_crs_base_metadata_write_token(char *snapshot_loc, char * token, char *value) {
|
||||||
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
|
FILE * meta_data = NULL;
|
||||||
|
|
||||||
|
/* Dummy check */
|
||||||
|
if( NULL == token || NULL == value) {
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Open the metadata file
|
||||||
|
*/
|
||||||
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_open(&meta_data, snapshot_loc, "a")) ) {
|
||||||
|
opal_output(opal_crs_base_output,
|
||||||
|
"opal:crs:base: opal_crs_base_metadata_write_token: Error: Unable to open the metadata file\n");
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(meta_data, "%s%s\n", token, value);
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if(NULL != meta_data) {
|
||||||
|
fclose(meta_data);
|
||||||
|
meta_data = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return exit_status;
|
||||||
|
}
|
||||||
|
|
||||||
|
int opal_crs_base_extract_expected_component(char *snapshot_loc, char ** component_name, int *prev_pid)
|
||||||
|
{
|
||||||
|
char **pid_argv = NULL;
|
||||||
|
char **name_argv = NULL;
|
||||||
|
|
||||||
|
opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_PID, &pid_argv);
|
||||||
|
if( NULL != pid_argv && NULL != pid_argv[0] ) {
|
||||||
|
*prev_pid = atoi(pid_argv[0]);
|
||||||
|
} else {
|
||||||
|
opal_output(0, "Error: expected_component: PID information unavailable!");
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_crs_base_metadata_read_token(snapshot_loc, CRS_METADATA_COMP, &name_argv);
|
||||||
|
if( NULL != name_argv && NULL != name_argv[0] ) {
|
||||||
|
*component_name = strdup(name_argv[0]);
|
||||||
|
} else {
|
||||||
|
opal_output(0, "Error: expected_component: Component Name information unavailable!");
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != pid_argv ) {
|
||||||
|
opal_argv_free(pid_argv);
|
||||||
|
pid_argv = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( NULL != name_argv ) {
|
||||||
|
opal_argv_free(name_argv);
|
||||||
|
name_argv = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
||||||
@ -283,77 +337,113 @@ char * opal_crs_base_get_snapshot_directory(char *uniq_snapshot_name)
|
|||||||
|
|
||||||
int opal_crs_base_init_snapshot_directory(opal_crs_base_snapshot_t *snapshot)
|
int opal_crs_base_init_snapshot_directory(opal_crs_base_snapshot_t *snapshot)
|
||||||
{
|
{
|
||||||
mode_t my_mode = S_IRWXU;
|
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
FILE * meta_data = NULL;
|
mode_t my_mode = S_IRWXU;
|
||||||
|
char * pid_str = NULL;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Make the snapshot directory from the uniq_snapshot_name
|
* Make the snapshot directory from the uniq_snapshot_name
|
||||||
*/
|
*/
|
||||||
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(snapshot->local_location, my_mode)) ) {
|
if(OPAL_SUCCESS != (ret = opal_os_dirpath_create(snapshot->local_location, my_mode)) ) {
|
||||||
|
opal_output(opal_crs_base_output,
|
||||||
|
"opal:crs:base: init_snapshot_directory: Error: Unable to create directory (%s)\n",
|
||||||
|
snapshot->local_location);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize the metadata file at the top of that directory.
|
* Initialize the metadata file at the top of that directory.
|
||||||
|
* Add 'BASE' and 'PID'
|
||||||
*/
|
*/
|
||||||
if (NULL == (meta_data = opal_crs_base_open_metadata(snapshot, 'w') ) ) {
|
if( NULL != last_metadata_file ) {
|
||||||
|
free(last_metadata_file);
|
||||||
|
last_metadata_file = NULL;
|
||||||
|
}
|
||||||
|
last_metadata_file = strdup(snapshot->local_location);
|
||||||
|
|
||||||
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_BASE, "") ) ) {
|
||||||
opal_output(opal_crs_base_output,
|
opal_output(opal_crs_base_output,
|
||||||
"opal:crs:base: init_snapshot_directory: Error: Unable to open the file (%s/%s)\n",
|
"opal:crs:base: init_snapshot_directory: Error: Unable to write BASE to the file (%s/%s)\n",
|
||||||
snapshot->local_location, opal_crs_base_metadata_filename);
|
snapshot->local_location, opal_crs_base_metadata_filename);
|
||||||
exit_status = OPAL_ERROR;
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
asprintf(&pid_str, "%d", getpid());
|
||||||
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_PID, pid_str) ) ) {
|
||||||
|
opal_output(opal_crs_base_output,
|
||||||
|
"opal:crs:base: init_snapshot_directory: Error: Unable to write PID (%s) to the file (%s/%s)\n",
|
||||||
|
pid_str, snapshot->local_location, opal_crs_base_metadata_filename);
|
||||||
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data)
|
if( NULL != pid_str) {
|
||||||
fclose(meta_data);
|
free(pid_str);
|
||||||
|
pid_str = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
FILE *opal_crs_base_open_metadata(opal_crs_base_snapshot_t *snapshot, char mode )
|
int opal_crs_base_cleanup_append(char* filename, bool is_dir)
|
||||||
{
|
{
|
||||||
char *meta_data_fname = NULL;
|
if( NULL == filename ) {
|
||||||
FILE * meta_data = NULL;
|
return OPAL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
if( is_dir ) {
|
||||||
|
opal_output_verbose(15, opal_crs_base_output,
|
||||||
|
"opal:crs: cleanup_append: Append Dir <%s>\n",
|
||||||
|
filename);
|
||||||
|
opal_argv_append_nosize(&cleanup_dir_argv, filename);
|
||||||
|
} else {
|
||||||
|
opal_output_verbose(15, opal_crs_base_output,
|
||||||
|
"opal:crs: cleanup_append: Append File <%s>\n",
|
||||||
|
filename);
|
||||||
|
opal_argv_append_nosize(&cleanup_file_argv, filename);
|
||||||
|
}
|
||||||
|
|
||||||
|
return OPAL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
int opal_crs_base_cleanup_flush(void)
|
||||||
|
{
|
||||||
|
int argc, i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Construct path
|
* Cleanup files first
|
||||||
*/
|
*/
|
||||||
asprintf(&meta_data_fname, "%s/%s", snapshot->local_location, opal_crs_base_metadata_filename);
|
if( NULL != cleanup_file_argv ) {
|
||||||
|
argc = opal_argv_count(cleanup_file_argv);
|
||||||
|
for( i = 0; i < argc; ++i) {
|
||||||
|
opal_output_verbose(15, opal_crs_base_output,
|
||||||
|
"opal:crs: cleanup_flush: Remove File <%s>\n", cleanup_dir_argv[i]);
|
||||||
|
unlink(cleanup_file_argv[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
opal_argv_free(cleanup_file_argv);
|
||||||
|
cleanup_file_argv = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Open the metadata file
|
* Try to cleanup directories next
|
||||||
*/
|
*/
|
||||||
if( mode == 'w' ) {
|
if( NULL != cleanup_dir_argv ) {
|
||||||
meta_data = fopen(meta_data_fname, "w");
|
argc = opal_argv_count(cleanup_dir_argv);
|
||||||
}
|
for( i = 0; i < argc; ++i) {
|
||||||
else if( mode == 'a' ) {
|
opal_output_verbose(15, opal_crs_base_output,
|
||||||
meta_data = fopen(meta_data_fname, "a");
|
"opal:crs: cleanup_flush: Remove Dir <%s>\n", cleanup_dir_argv[i]);
|
||||||
|
opal_os_dirpath_destroy(cleanup_dir_argv[i], true, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (NULL == meta_data ) {
|
opal_argv_free(cleanup_dir_argv);
|
||||||
opal_output(opal_crs_base_output,
|
cleanup_dir_argv = NULL;
|
||||||
"opal:crs:base: open_metadata (%c): Error: Unable to open the file (%s)\n",
|
|
||||||
mode, meta_data_fname);
|
|
||||||
goto cleanup;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if( mode == 'w' ) {
|
return OPAL_SUCCESS;
|
||||||
/*
|
|
||||||
* The first line is the component name,
|
|
||||||
* everything else here is defined by the component
|
|
||||||
*/
|
|
||||||
fprintf(meta_data, "%s\n", snapshot->component_name);
|
|
||||||
fprintf(meta_data, "%d\n", getpid());
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
if(NULL != meta_data_fname)
|
|
||||||
free(meta_data_fname);
|
|
||||||
|
|
||||||
return meta_data;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char * opal_crs_base_state_str(opal_crs_state_type_t state)
|
char * opal_crs_base_state_str(opal_crs_state_type_t state)
|
||||||
@ -386,3 +476,154 @@ char * opal_crs_base_state_str(opal_crs_state_type_t state)
|
|||||||
|
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/******************
|
||||||
|
* Local Functions
|
||||||
|
******************/
|
||||||
|
static int opal_crs_base_metadata_open(FILE **meta_data, char * location, char * mode)
|
||||||
|
{
|
||||||
|
int exit_status = OPAL_SUCCESS;
|
||||||
|
char * dir_name = NULL;
|
||||||
|
|
||||||
|
if( NULL == location ) {
|
||||||
|
if( NULL == last_metadata_file ) {
|
||||||
|
opal_output(0, "Error: No metadata filename specified!");
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
} else {
|
||||||
|
location = last_metadata_file;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Find the snapshot directory, read the metadata file
|
||||||
|
*/
|
||||||
|
asprintf(&dir_name, "%s/%s", location, opal_crs_base_metadata_filename);
|
||||||
|
if (NULL == (*meta_data = fopen(dir_name, mode)) ) {
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if( NULL != dir_name ) {
|
||||||
|
free(dir_name);
|
||||||
|
dir_name = NULL;
|
||||||
|
}
|
||||||
|
return exit_status;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int metadata_extract_next_token(FILE *file, char **token, char **value)
|
||||||
|
{
|
||||||
|
int exit_status = OPAL_SUCCESS;
|
||||||
|
int max_len = 256;
|
||||||
|
char * line = NULL;
|
||||||
|
int line_len = 0;
|
||||||
|
int c = 0, s = 0, v = 0;
|
||||||
|
char *local_token = NULL;
|
||||||
|
char *local_value = NULL;
|
||||||
|
bool end_of_line = false;
|
||||||
|
|
||||||
|
line = (char *) malloc(sizeof(char) * max_len);
|
||||||
|
|
||||||
|
try_again:
|
||||||
|
/*
|
||||||
|
* If we are at the end of the file, then just return
|
||||||
|
*/
|
||||||
|
if(0 != feof(file) ) {
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Other wise grab the next token/value pair
|
||||||
|
*/
|
||||||
|
if (NULL == fgets(line, max_len, file) ) {
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
line_len = strlen(line);
|
||||||
|
/* Strip off the new line if it it there */
|
||||||
|
if('\n' == line[line_len-1]) {
|
||||||
|
line[line_len-1] = '\0';
|
||||||
|
line_len--;
|
||||||
|
end_of_line = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
end_of_line = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ignore lines with just '#' too */
|
||||||
|
if(2 >= line_len)
|
||||||
|
goto try_again;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extract the token from the set
|
||||||
|
*/
|
||||||
|
for(c = 0;
|
||||||
|
line[c] != ':' &&
|
||||||
|
c < line_len;
|
||||||
|
++c) {
|
||||||
|
;
|
||||||
|
}
|
||||||
|
c += 2; /* For the ' ' and the '\0' */
|
||||||
|
local_token = (char *)malloc(sizeof(char) * (c + 1));
|
||||||
|
|
||||||
|
for(s = 0; s < c; ++s) {
|
||||||
|
local_token[s] = line[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
local_token[s] = '\0';
|
||||||
|
*token = strdup(local_token);
|
||||||
|
|
||||||
|
if( NULL != local_token) {
|
||||||
|
free(local_token);
|
||||||
|
local_token = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Extract the value from the set
|
||||||
|
*/
|
||||||
|
local_value = (char *)malloc(sizeof(char) * (line_len - c + 1));
|
||||||
|
for(v = 0, s = c;
|
||||||
|
s < line_len;
|
||||||
|
++s, ++v) {
|
||||||
|
local_value[v] = line[s];
|
||||||
|
}
|
||||||
|
|
||||||
|
while(!end_of_line) {
|
||||||
|
if (NULL == fgets(line, max_len, file) ) {
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
line_len = strlen(line);
|
||||||
|
/* Strip off the new line if it it there */
|
||||||
|
if('\n' == line[line_len-1]) {
|
||||||
|
line[line_len-1] = '\0';
|
||||||
|
line_len--;
|
||||||
|
end_of_line = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
end_of_line = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
local_value = (char *)realloc(local_value, sizeof(char) * line_len);
|
||||||
|
for(s = 0;
|
||||||
|
s < line_len;
|
||||||
|
++s, ++v) {
|
||||||
|
local_value[v] = line[s];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
local_value[v] = '\0';
|
||||||
|
*value = strdup(local_value);
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if( NULL != local_token)
|
||||||
|
free(local_token);
|
||||||
|
if( NULL != local_value)
|
||||||
|
free(local_value);
|
||||||
|
if( NULL != line)
|
||||||
|
free(line);
|
||||||
|
|
||||||
|
return exit_status;
|
||||||
|
}
|
||||||
|
@ -252,7 +252,7 @@ int opal_crs_blcr_module_finalize(void)
|
|||||||
|
|
||||||
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_state_type_t *state)
|
int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot, opal_crs_state_type_t *state)
|
||||||
{
|
{
|
||||||
int ret;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
|
opal_crs_blcr_snapshot_t *snapshot = OBJ_NEW(opal_crs_blcr_snapshot_t);
|
||||||
char * tmp_str = NULL;
|
char * tmp_str = NULL;
|
||||||
|
|
||||||
@ -272,15 +272,15 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
snapshot->super.remote_location = strdup(base_snapshot->remote_location);
|
snapshot->super.remote_location = strdup(base_snapshot->remote_location);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create the snapshot directory
|
* Update the snapshot metadata
|
||||||
*/
|
*/
|
||||||
snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
|
snapshot->super.component_name = strdup(mca_crs_blcr_component.super.base_version.mca_component_name);
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(&snapshot->super) )) {
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, snapshot->super.component_name) ) ) {
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
"crs:blcr: checkpoint(): Error: Unable to initialize the directory for (%s).",
|
"crs:blcr: checkpoint(): Error: Unable to write component name to the directory for (%s).",
|
||||||
snapshot->super.reference_name);
|
snapshot->super.reference_name);
|
||||||
return ret;
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -328,7 +328,8 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d)",
|
"crs:blcr: checkpoint(): Error: Unable to checkpoint pid (%d)",
|
||||||
pid);
|
pid);
|
||||||
return ret;
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
*state = blcr_current_state;
|
*state = blcr_current_state;
|
||||||
@ -347,7 +348,9 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
snapshot->context_filename, tmp_str, ret);
|
snapshot->context_filename, tmp_str, ret);
|
||||||
perror("crs:blcr: checkpoint");
|
perror("crs:blcr: checkpoint");
|
||||||
free(tmp_str);
|
free(tmp_str);
|
||||||
return ret;
|
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -358,7 +361,8 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
opal_output(mca_crs_blcr_component.super.output_handle,
|
opal_output(mca_crs_blcr_component.super.output_handle,
|
||||||
"crs:blcr: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
|
"crs:blcr: checkpoint(): Error: Unable to update metadata for snapshot (%s).",
|
||||||
snapshot->super.reference_name);
|
snapshot->super.reference_name);
|
||||||
return ret;
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -367,10 +371,13 @@ int opal_crs_blcr_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
*/
|
*/
|
||||||
base_snapshot = &(snapshot->super);
|
base_snapshot = &(snapshot->super);
|
||||||
|
|
||||||
if(NULL != tmp_str)
|
cleanup:
|
||||||
|
if(NULL != tmp_str) {
|
||||||
free(tmp_str);
|
free(tmp_str);
|
||||||
|
tmp_str = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
int opal_crs_blcr_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
|
int opal_crs_blcr_restart(opal_crs_base_snapshot_t *base_snapshot, bool spawn_child, pid_t *child_pid)
|
||||||
@ -760,8 +767,6 @@ static int blcr_get_checkpoint_filename(char **fname, pid_t pid)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int blcr_update_snapshot_metadata(opal_crs_blcr_snapshot_t *snapshot) {
|
static int blcr_update_snapshot_metadata(opal_crs_blcr_snapshot_t *snapshot) {
|
||||||
char * dir_name = NULL;
|
|
||||||
FILE * meta_data = NULL;
|
|
||||||
int exit_status = OPAL_SUCCESS;
|
int exit_status = OPAL_SUCCESS;
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
||||||
@ -779,34 +784,19 @@ static int blcr_update_snapshot_metadata(opal_crs_blcr_snapshot_t *snapshot) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Append to the metadata file:
|
* Append to the metadata file the context filename
|
||||||
* the relative path of the context filename
|
|
||||||
*/
|
*/
|
||||||
if( NULL == (meta_data = opal_crs_base_open_metadata(&snapshot->super, 'w') ) ) {
|
opal_crs_base_metadata_write_token(snapshot->super.local_location, CRS_METADATA_CONTEXT, snapshot->context_filename);
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Context Filename -- Relative path */
|
|
||||||
fprintf(meta_data, "%s\n", snapshot->context_filename);
|
|
||||||
|
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data)
|
|
||||||
fclose(meta_data);
|
|
||||||
if(NULL != dir_name)
|
|
||||||
free(dir_name);
|
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
||||||
char * content = NULL;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
|
char **tmp_argv = NULL;
|
||||||
char * component_name = NULL;
|
char * component_name = NULL;
|
||||||
int prev_pid;
|
int prev_pid;
|
||||||
int len = 0;
|
|
||||||
FILE * meta_data = NULL;
|
|
||||||
int exit_status = OPAL_SUCCESS;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
|
||||||
"crs:blcr: cold_start(%s)", snapshot->super.reference_name);
|
"crs:blcr: cold_start(%s)", snapshot->super.reference_name);
|
||||||
@ -814,9 +804,9 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
|||||||
/*
|
/*
|
||||||
* Find the snapshot directory, read the metadata file
|
* Find the snapshot directory, read the metadata file
|
||||||
*/
|
*/
|
||||||
if( NULL == (meta_data = opal_crs_base_open_read_metadata(snapshot->super.local_location,
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
||||||
&component_name, &prev_pid) ) ) {
|
&component_name, &prev_pid) ) ) {
|
||||||
exit_status = OPAL_ERROR;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -835,19 +825,8 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
|||||||
/*
|
/*
|
||||||
* Context Filename
|
* Context Filename
|
||||||
*/
|
*/
|
||||||
len = 256; /* Max size for a BLCR filename */
|
opal_crs_base_metadata_read_token(snapshot->super.local_location, CRS_METADATA_CONTEXT, &tmp_argv);
|
||||||
content = (char *) malloc(sizeof(char) * len);
|
asprintf(&snapshot->context_filename, "%s/%s", snapshot->super.local_location, tmp_argv[0]);
|
||||||
if (NULL == fgets(content, len, meta_data) ) {
|
|
||||||
free(content);
|
|
||||||
content = NULL;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* Strip of newline */
|
|
||||||
len = strlen(content);
|
|
||||||
content[len - 1] = '\0';
|
|
||||||
|
|
||||||
/* save the filename in the structure */
|
|
||||||
asprintf(&snapshot->context_filename, "%s/%s", snapshot->super.local_location, content);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reset the cold_start flag
|
* Reset the cold_start flag
|
||||||
@ -855,10 +834,10 @@ static int blcr_cold_start(opal_crs_blcr_snapshot_t *snapshot) {
|
|||||||
snapshot->super.cold_start = false;
|
snapshot->super.cold_start = false;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data)
|
if(NULL != tmp_argv) {
|
||||||
fclose(meta_data);
|
opal_argv_free(tmp_argv);
|
||||||
if(NULL != content)
|
tmp_argv = NULL;
|
||||||
free(content);
|
}
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
@ -291,13 +291,12 @@ int opal_crs_self_checkpoint(pid_t pid, opal_crs_base_snapshot_t *base_snapshot,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create the snapshot directory
|
* Update the snapshot metadata
|
||||||
*/
|
*/
|
||||||
snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
|
snapshot->super.component_name = strdup(mca_crs_self_component.super.base_version.mca_component_name);
|
||||||
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(&snapshot->super) )) {
|
if( OPAL_SUCCESS != (ret = opal_crs_base_metadata_write_token(NULL, CRS_METADATA_COMP, snapshot->super.component_name) ) ) {
|
||||||
*state = OPAL_CRS_ERROR;
|
|
||||||
opal_output(mca_crs_self_component.super.output_handle,
|
opal_output(mca_crs_self_component.super.output_handle,
|
||||||
"crs:self: checkpoint(): Error: Unable to initialize the directory for (%s).",
|
"crs:self: checkpoint(): Error: Unable to write component name to the directory for (%s).",
|
||||||
snapshot->super.reference_name);
|
snapshot->super.reference_name);
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
@ -592,12 +591,10 @@ static int opal_crs_self_restart_cmd(opal_crs_self_snapshot_t *snapshot, char **
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
||||||
char * content = NULL;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
|
char **tmp_argv = NULL;
|
||||||
char * component_name = NULL;
|
char * component_name = NULL;
|
||||||
int prev_pid;
|
int prev_pid;
|
||||||
int len = 0;
|
|
||||||
FILE * meta_data = NULL;
|
|
||||||
int exit_status = OPAL_SUCCESS;
|
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||||
"crs:self: cold_start(%s)", snapshot->super.reference_name);
|
"crs:self: cold_start(%s)", snapshot->super.reference_name);
|
||||||
@ -605,9 +602,9 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
|||||||
/*
|
/*
|
||||||
* Find the snapshot directory, read the metadata file
|
* Find the snapshot directory, read the metadata file
|
||||||
*/
|
*/
|
||||||
if( NULL == (meta_data = opal_crs_base_open_read_metadata(snapshot->super.local_location,
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot->super.local_location,
|
||||||
&component_name, &prev_pid) ) ) {
|
&component_name, &prev_pid) ) ) {
|
||||||
exit_status = OPAL_ERROR;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -627,19 +624,8 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
|||||||
* Restart command
|
* Restart command
|
||||||
* JJH: Command lines limited to 256 chars.
|
* JJH: Command lines limited to 256 chars.
|
||||||
*/
|
*/
|
||||||
len = 256; /* Max size for a SELF filename */
|
opal_crs_base_metadata_read_token(snapshot->super.local_location, CRS_METADATA_CONTEXT, &tmp_argv);
|
||||||
content = (char *) malloc(sizeof(char) * len);
|
asprintf(&snapshot->cmd_line, "%s", tmp_argv[0]);
|
||||||
if (NULL == fgets(content, len, meta_data) ) {
|
|
||||||
free(content);
|
|
||||||
content = NULL;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
/* Strip of newline */
|
|
||||||
len = strlen(content);
|
|
||||||
content[len - 1] = '\0';
|
|
||||||
|
|
||||||
/* save the command line in the structure */
|
|
||||||
asprintf(&snapshot->cmd_line, "%s", content);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reset the cold_start flag
|
* Reset the cold_start flag
|
||||||
@ -647,51 +633,35 @@ static int self_cold_start(opal_crs_self_snapshot_t *snapshot) {
|
|||||||
snapshot->super.cold_start = false;
|
snapshot->super.cold_start = false;
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data)
|
if(NULL != tmp_argv) {
|
||||||
fclose(meta_data);
|
opal_argv_free(tmp_argv);
|
||||||
if(NULL != content)
|
tmp_argv = NULL;
|
||||||
free(content);
|
}
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot) {
|
static int self_update_snapshot_metadata(opal_crs_self_snapshot_t *snapshot) {
|
||||||
char * dir_name = NULL;
|
|
||||||
FILE *meta_data = NULL;
|
|
||||||
int exit_status = OPAL_SUCCESS;
|
int exit_status = OPAL_SUCCESS;
|
||||||
|
|
||||||
|
if(NULL == snapshot->cmd_line) {
|
||||||
|
opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
|
||||||
|
true);
|
||||||
|
exit_status = OPAL_ERROR;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
opal_output_verbose(10, mca_crs_self_component.super.output_handle,
|
||||||
"crs:self: update_snapshot_metadata(%s)",
|
"crs:self: update_snapshot_metadata(%s)",
|
||||||
snapshot->super.reference_name);
|
snapshot->super.reference_name);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Append to the metadata file:
|
* Append to the metadata file the command line to restart with
|
||||||
* the relative path of the context filename
|
* - How user wants us to restart
|
||||||
*/
|
*/
|
||||||
if( NULL == (meta_data = opal_crs_base_open_metadata(&snapshot->super, 'w') ) ) {
|
opal_crs_base_metadata_write_token(snapshot->super.local_location, CRS_METADATA_CONTEXT, snapshot->cmd_line);
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* How user wants us to restart */
|
|
||||||
if(NULL != snapshot->cmd_line) {
|
|
||||||
fprintf(meta_data, "%s\n", snapshot->cmd_line);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
opal_show_help("help-opal-crs-self.txt", "self:no-restart-cmd",
|
|
||||||
true);
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
cleanup:
|
||||||
if(NULL != meta_data) {
|
|
||||||
fclose(meta_data);
|
|
||||||
}
|
|
||||||
if(NULL != dir_name) {
|
|
||||||
free(dir_name);
|
|
||||||
dir_name = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
@ -73,7 +73,6 @@
|
|||||||
/******************
|
/******************
|
||||||
* Global Var Decls
|
* Global Var Decls
|
||||||
******************/
|
******************/
|
||||||
bool opal_cr_allow_opal_only = false;
|
|
||||||
bool opal_cr_stall_check = false;
|
bool opal_cr_stall_check = false;
|
||||||
bool opal_cr_currently_stalled = false;
|
bool opal_cr_currently_stalled = false;
|
||||||
int opal_cr_output;
|
int opal_cr_output;
|
||||||
@ -81,18 +80,13 @@ int opal_cr_output;
|
|||||||
/******************
|
/******************
|
||||||
* Local Functions & Var Decls
|
* Local Functions & Var Decls
|
||||||
******************/
|
******************/
|
||||||
static int cr_notify_response(opal_cr_ckpt_cmd_state_t resp);
|
|
||||||
static int extract_env_vars(int prev_pid);
|
static int extract_env_vars(int prev_pid);
|
||||||
static int cr_entry_point_notify_reopen_files(int *prog_read_fd, int *prog_write_fd);
|
|
||||||
static void opal_cr_entry_point_signal_handler (int signo);
|
|
||||||
static void opal_cr_sigpipe_debug_signal_handler (int signo);
|
static void opal_cr_sigpipe_debug_signal_handler (int signo);
|
||||||
|
|
||||||
static opal_cr_coord_callback_fn_t cur_coord_callback = NULL;
|
static opal_cr_coord_callback_fn_t cur_coord_callback = NULL;
|
||||||
static opal_cr_notify_callback_fn_t cur_notify_callback = NULL;
|
static opal_cr_notify_callback_fn_t cur_notify_callback = NULL;
|
||||||
|
|
||||||
static char *prog_named_pipe_r = NULL;
|
|
||||||
static char *prog_named_pipe_w = NULL;
|
|
||||||
|
|
||||||
/******************
|
/******************
|
||||||
* Interface Functions & Vars
|
* Interface Functions & Vars
|
||||||
******************/
|
******************/
|
||||||
@ -100,8 +94,10 @@ char * opal_cr_pipe_dir = NULL;
|
|||||||
int opal_cr_entry_point_signal = 0;
|
int opal_cr_entry_point_signal = 0;
|
||||||
bool opal_cr_is_enabled = true;
|
bool opal_cr_is_enabled = true;
|
||||||
bool opal_cr_is_tool = false;
|
bool opal_cr_is_tool = false;
|
||||||
|
|
||||||
/* Current checkpoint state */
|
/* Current checkpoint state */
|
||||||
int opal_cr_checkpointing = OPAL_CR_STATUS_NONE;
|
int opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
|
||||||
|
|
||||||
/* Current checkpoint request channel state */
|
/* Current checkpoint request channel state */
|
||||||
int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
int opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
||||||
|
|
||||||
@ -241,22 +237,6 @@ int opal_cr_init(void )
|
|||||||
opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
|
opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
* Whether or not to allow OPAL only checkpointing.
|
|
||||||
* By default we rely on ORTE to provide this functionality for us, but
|
|
||||||
* if the application is OPAL only then we need to fallback to the signal
|
|
||||||
* method which is activated by setting this MCA parameter to 'true'.
|
|
||||||
*/
|
|
||||||
mca_base_param_reg_int_name("opal_cr", "allow_opal_only",
|
|
||||||
"Enable OPAL Only checkpointing [Default: Disabled]",
|
|
||||||
true, false,
|
|
||||||
0, &val);
|
|
||||||
opal_cr_allow_opal_only = OPAL_INT_TO_BOOL(val);
|
|
||||||
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: init: OPAL CR Allow OPAL Only: %d",
|
|
||||||
val);
|
|
||||||
|
|
||||||
mca_base_param_reg_int_name("opal_cr", "is_tool",
|
mca_base_param_reg_int_name("opal_cr", "is_tool",
|
||||||
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
|
"Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
|
||||||
false, false,
|
false, false,
|
||||||
@ -315,13 +295,6 @@ int opal_cr_init(void )
|
|||||||
opal_cr_stall_check = false;
|
opal_cr_stall_check = false;
|
||||||
opal_cr_currently_stalled = false;
|
opal_cr_currently_stalled = false;
|
||||||
|
|
||||||
/*
|
|
||||||
* Register the entry point
|
|
||||||
*/
|
|
||||||
if( OPAL_SUCCESS != (ret = opal_cr_entry_point_init()) ) {
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
} /* End opal_cr_is_tool = true */
|
} /* End opal_cr_is_tool = true */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -381,7 +354,7 @@ int opal_cr_init(void )
|
|||||||
|
|
||||||
int opal_cr_finalize(void)
|
int opal_cr_finalize(void)
|
||||||
{
|
{
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
int exit_status = OPAL_SUCCESS;
|
||||||
|
|
||||||
if( --opal_cr_initalized != 0 ) {
|
if( --opal_cr_initalized != 0 ) {
|
||||||
if( opal_cr_initalized < 0 ) {
|
if( opal_cr_initalized < 0 ) {
|
||||||
@ -407,12 +380,8 @@ int opal_cr_finalize(void)
|
|||||||
}
|
}
|
||||||
#endif /* OPAL_ENABLE_FT_THREAD == 1 */
|
#endif /* OPAL_ENABLE_FT_THREAD == 1 */
|
||||||
|
|
||||||
if( OPAL_SUCCESS != (ret = opal_cr_entry_point_finalize()) ) {
|
|
||||||
exit_status = ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Nothing to do for just process notifications */
|
/* Nothing to do for just process notifications */
|
||||||
opal_cr_checkpointing = OPAL_CR_STATUS_TERM;
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
|
||||||
opal_cr_checkpoint_request = OPAL_CR_STATUS_TERM;
|
opal_cr_checkpoint_request = OPAL_CR_STATUS_TERM;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -452,7 +421,7 @@ void opal_cr_test_if_checkpoint_ready(void)
|
|||||||
* - If a request is pending then cancel it
|
* - If a request is pending then cancel it
|
||||||
* - o.w., skip it.
|
* - o.w., skip it.
|
||||||
*/
|
*/
|
||||||
if(OPAL_CR_STATUS_RUNNING == opal_cr_checkpointing ) {
|
if(OPAL_CR_STATUS_RUNNING == opal_cr_checkpointing_state ) {
|
||||||
if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_IN_PROGRESS) ) ) {
|
if( OPAL_SUCCESS != (ret = cur_notify_callback(OPAL_CHECKPOINT_CMD_IN_PROGRESS) ) ) {
|
||||||
opal_output(opal_cr_output,
|
opal_output(opal_cr_output,
|
||||||
"Error: opal_cr: test_if_checkpoint_ready: Respond [In Progress] Failed. (%d)",
|
"Error: opal_cr: test_if_checkpoint_ready: Respond [In Progress] Failed. (%d)",
|
||||||
@ -478,7 +447,7 @@ void opal_cr_test_if_checkpoint_ready(void)
|
|||||||
/*
|
/*
|
||||||
* Start the checkpoint
|
* Start the checkpoint
|
||||||
*/
|
*/
|
||||||
opal_cr_checkpointing = OPAL_CR_STATUS_RUNNING;
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_RUNNING;
|
||||||
opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
opal_cr_checkpoint_request = OPAL_CR_STATUS_NONE;
|
||||||
|
|
||||||
STAGE_1:
|
STAGE_1:
|
||||||
@ -525,8 +494,12 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
|
|||||||
}
|
}
|
||||||
|
|
||||||
if(*state == OPAL_CRS_CONTINUE) {
|
if(*state == OPAL_CRS_CONTINUE) {
|
||||||
if(term)
|
if(term) {
|
||||||
*state = OPAL_CRS_TERM;
|
*state = OPAL_CRS_TERM;
|
||||||
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_TERM;
|
||||||
|
} else {
|
||||||
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_CONTINUE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
term = false;
|
term = false;
|
||||||
@ -537,6 +510,7 @@ int opal_cr_inc_core(pid_t pid, opal_crs_base_snapshot_t *snapshot, bool term, i
|
|||||||
*/
|
*/
|
||||||
if(*state == OPAL_CRS_RESTART) {
|
if(*state == OPAL_CRS_RESTART) {
|
||||||
extract_env_vars(prev_pid);
|
extract_env_vars(prev_pid);
|
||||||
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_PRE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -594,11 +568,10 @@ int opal_cr_coord(int state)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Here we are returning to either:
|
* Here we are returning to either:
|
||||||
* - opal_notify()
|
|
||||||
* If we have an OPAL only opplication.
|
|
||||||
* - [orte | ompi]_notify()
|
* - [orte | ompi]_notify()
|
||||||
* If we have an ORTE or OPAL application.
|
|
||||||
*/
|
*/
|
||||||
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_RESTART_POST;
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
return OPAL_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -720,83 +693,6 @@ static int extract_env_vars(int prev_pid)
|
|||||||
/*****************************************
|
/*****************************************
|
||||||
* OPAL CR Entry Point Functionality
|
* OPAL CR Entry Point Functionality
|
||||||
*****************************************/
|
*****************************************/
|
||||||
int opal_cr_entry_point_init(void)
|
|
||||||
{
|
|
||||||
int exit_status = OPAL_SUCCESS;
|
|
||||||
char *tmp_pid = NULL;
|
|
||||||
opal_cr_notify_callback_fn_t prev_notify_func;
|
|
||||||
|
|
||||||
if( !opal_cr_allow_opal_only ) {
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
opal_cr_reg_notify_callback(cr_notify_response, &prev_notify_func);
|
|
||||||
|
|
||||||
/* String representation of the PID */
|
|
||||||
asprintf(&tmp_pid, "%d", getpid());
|
|
||||||
|
|
||||||
asprintf(&prog_named_pipe_r, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_R, tmp_pid);
|
|
||||||
asprintf(&prog_named_pipe_w, "%s/%s.%s", opal_cr_pipe_dir, OPAL_CR_NAMED_PROG_W, tmp_pid);
|
|
||||||
|
|
||||||
opal_output_verbose(15, opal_cr_output,
|
|
||||||
"opal_cr: init: Named Pipes (%s) (%s)",
|
|
||||||
prog_named_pipe_r, prog_named_pipe_w);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Setup a signal handler to catch and start the proper thread
|
|
||||||
* to handle the checkpoint
|
|
||||||
*/
|
|
||||||
if( SIG_ERR == signal(opal_cr_entry_point_signal, opal_cr_entry_point_signal_handler) ) {
|
|
||||||
exit_status = OPAL_ERROR;
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup:
|
|
||||||
if( NULL != tmp_pid) {
|
|
||||||
free(tmp_pid);
|
|
||||||
tmp_pid = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
int opal_cr_entry_point_finalize(void)
|
|
||||||
{
|
|
||||||
if( !opal_cr_allow_opal_only ) {
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( NULL != prog_named_pipe_r) {
|
|
||||||
free(prog_named_pipe_r);
|
|
||||||
prog_named_pipe_r = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if( NULL != prog_named_pipe_w) {
|
|
||||||
free(prog_named_pipe_w);
|
|
||||||
prog_named_pipe_w = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* C/R Signal Handler.
|
|
||||||
* Once a signal is received then the notification thread is notified
|
|
||||||
* so it can communicate with the checkpoint command to take the approprate
|
|
||||||
* action.
|
|
||||||
*/
|
|
||||||
static void opal_cr_entry_point_signal_handler (int signo)
|
|
||||||
{
|
|
||||||
if( opal_cr_entry_point_signal != signo ) {
|
|
||||||
/* Not our signal */
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Signal thread to start checkpoint handshake
|
|
||||||
*/
|
|
||||||
opal_cr_checkpoint_request = OPAL_CR_STATUS_REQUESTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Used only for debugging SIGPIPE problems
|
* Used only for debugging SIGPIPE problems
|
||||||
*/
|
*/
|
||||||
@ -818,346 +714,6 @@ static void opal_cr_sigpipe_debug_signal_handler (int signo)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Respond to an asynchronous checkpoint request
|
|
||||||
*/
|
|
||||||
int cr_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
|
||||||
{
|
|
||||||
static int app_term = 0, app_pid = 0;
|
|
||||||
static opal_crs_base_snapshot_t *snapshot = NULL;
|
|
||||||
static int prog_named_read_pipe_fd, prog_named_write_pipe_fd;
|
|
||||||
static int len = 0;
|
|
||||||
static int cr_state;
|
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
|
||||||
int tmp_resp;
|
|
||||||
char *tmp_str = NULL;
|
|
||||||
ssize_t tmp_size = 0;
|
|
||||||
/* Commands from the command line tool */
|
|
||||||
unsigned char app_cmd;
|
|
||||||
|
|
||||||
if( opal_cr_currently_stalled ) {
|
|
||||||
goto STAGE_1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Open a named pipe for our application
|
|
||||||
*/
|
|
||||||
if (OPAL_SUCCESS != (ret = cr_entry_point_notify_reopen_files(&prog_named_read_pipe_fd, &prog_named_write_pipe_fd))) {
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Get the initial handshake command
|
|
||||||
*/
|
|
||||||
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the first handshake from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp_resp = (int)resp;
|
|
||||||
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &tmp_resp, sizeof(int)) ) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: %d: Error: Unable to write to pipe (%s) ret = %d [Line %d]\n",
|
|
||||||
tmp_resp, prog_named_pipe_w, ret, __LINE__);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Respond that the checkpoint is currently in progress
|
|
||||||
*/
|
|
||||||
if( OPAL_CHECKPOINT_CMD_IN_PROGRESS == resp ) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Checkpoint in progress, cannot start (%d)",
|
|
||||||
getpid());
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Respond that the application is unable to be checkpointed
|
|
||||||
*/
|
|
||||||
else if( OPAL_CHECKPOINT_CMD_NULL == resp ) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Non-checkpointable application, cannot start (%d)",
|
|
||||||
getpid());
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Respond that some error has occurred such that the application is
|
|
||||||
* not able to be checkpointed
|
|
||||||
*/
|
|
||||||
else if( OPAL_CHECKPOINT_CMD_ERROR == resp ) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error generated, cannot start (%d)",
|
|
||||||
getpid());
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Respond signalng that we wish to respond to this request
|
|
||||||
*/
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Starting checkpoint request (%d)",
|
|
||||||
getpid());
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Wait for a notify command from command line tool
|
|
||||||
*/
|
|
||||||
if( sizeof(app_cmd) != (ret = read(prog_named_read_pipe_fd, &app_cmd, sizeof(app_cmd))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the requested command from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get PID argument */
|
|
||||||
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &app_pid, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the pid from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get term argument */
|
|
||||||
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &app_term, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the term from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get Snapshot Handle argument */
|
|
||||||
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the snapshot_handle len from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp_size = sizeof(char) * len;
|
|
||||||
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
||||||
if( tmp_size != (ret = read(prog_named_read_pipe_fd, tmp_str, (sizeof(char) * len))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the snapshot_handle from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If they didn't send anything of meaning then use the defaults
|
|
||||||
*/
|
|
||||||
snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
|
|
||||||
|
|
||||||
if( 1 < strlen(tmp_str) ) {
|
|
||||||
if( NULL != snapshot->reference_name)
|
|
||||||
free( snapshot->reference_name );
|
|
||||||
snapshot->reference_name = strdup(tmp_str);
|
|
||||||
|
|
||||||
if( NULL != snapshot->local_location )
|
|
||||||
free( snapshot->local_location );
|
|
||||||
snapshot->local_location = opal_crs_base_get_snapshot_directory(snapshot->reference_name);
|
|
||||||
|
|
||||||
if( NULL != snapshot->remote_location )
|
|
||||||
free( snapshot->remote_location );
|
|
||||||
snapshot->remote_location = strdup(snapshot->local_location);
|
|
||||||
|
|
||||||
free(tmp_str);
|
|
||||||
tmp_str = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* get Snapshot location argument */
|
|
||||||
if( sizeof(int) != (ret = read(prog_named_read_pipe_fd, &len, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the snapshot_location len from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
tmp_str = (char *) malloc(sizeof(char) * len);
|
|
||||||
tmp_size = sizeof(char) * len;
|
|
||||||
if( tmp_size != (ret = read(prog_named_read_pipe_fd, tmp_str, (sizeof(char) * len))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to read the snapshot_location from named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If they didn't send anything of meaning then use the defaults
|
|
||||||
*/
|
|
||||||
if( 1 < strlen(tmp_str) ) {
|
|
||||||
if( NULL != snapshot->local_location)
|
|
||||||
free( snapshot->local_location );
|
|
||||||
asprintf(&(snapshot->local_location), "%s/%s", tmp_str, snapshot->reference_name);
|
|
||||||
|
|
||||||
if( NULL != snapshot->remote_location)
|
|
||||||
free( snapshot->remote_location );
|
|
||||||
snapshot->remote_location = strdup(snapshot->local_location);
|
|
||||||
|
|
||||||
free(tmp_str);
|
|
||||||
tmp_str = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Raise the notification flag.
|
|
||||||
* This will trigger the coordination, and checkpoint of the
|
|
||||||
* application if it is possible
|
|
||||||
*/
|
|
||||||
STAGE_1:
|
|
||||||
opal_cr_currently_stalled = false;
|
|
||||||
|
|
||||||
ret = opal_cr_inc_core(app_pid, snapshot, app_term, &cr_state);
|
|
||||||
if( OPAL_EXISTS == ret ) {
|
|
||||||
opal_output_verbose(5, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Stalling the checkpoint progress until state is stable again (PID = %d)\n",
|
|
||||||
getpid());
|
|
||||||
opal_cr_currently_stalled = true;
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
else if(OPAL_SUCCESS != ret) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: checkpoint notification failed. %d\n", ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Don't stall any longer */
|
|
||||||
opal_cr_stall_check = false;
|
|
||||||
|
|
||||||
if(OPAL_CRS_RESTART == cr_state) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Restarting...(%d)\n",
|
|
||||||
getpid());
|
|
||||||
|
|
||||||
app_term = false;
|
|
||||||
/* Do not respond to the non-existent command line tool */
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
else if(cr_state == OPAL_CRS_CONTINUE) {
|
|
||||||
; /* Don't need to do anything here */
|
|
||||||
}
|
|
||||||
else if(cr_state == OPAL_CRS_TERM ) {
|
|
||||||
; /* Don't need to do anything here */
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
opal_output_verbose(5, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Unknown cr_state(%d) [%d]",
|
|
||||||
cr_state, getpid());
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Return the expected variables to the command line tool
|
|
||||||
*/
|
|
||||||
len = strlen(snapshot->reference_name);
|
|
||||||
len++; /* To account for the Null character */
|
|
||||||
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &len, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to write fname length to named pipe (%s). %d.\n",
|
|
||||||
prog_named_pipe_w, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(len > 0) {
|
|
||||||
if( (ssize_t)(sizeof(char) * len) !=
|
|
||||||
(ret = write(prog_named_write_pipe_fd, snapshot->reference_name, (sizeof(char) * len))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to write snapshot->reference_name to named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_w, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( sizeof(int) != (ret = write(prog_named_write_pipe_fd, &cr_state, sizeof(int))) ) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: Error: Unable to write cr_state to named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_w, ret);
|
|
||||||
goto ckpt_cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
ckpt_cleanup:
|
|
||||||
close(prog_named_write_pipe_fd);
|
|
||||||
close(prog_named_read_pipe_fd);
|
|
||||||
remove(prog_named_pipe_r);
|
|
||||||
remove(prog_named_pipe_w);
|
|
||||||
|
|
||||||
if(app_term) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: cr_notify_response: User has asked to terminate the application");
|
|
||||||
exit(OPAL_SUCCESS);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Prepare to wait for another checkpoint action */
|
|
||||||
opal_cr_checkpointing = OPAL_CR_STATUS_NONE;
|
|
||||||
|
|
||||||
opal_cr_currently_stalled = false;
|
|
||||||
|
|
||||||
return exit_status;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int cr_entry_point_notify_reopen_files(int *prog_read_fd, int *prog_write_fd)
|
|
||||||
{
|
|
||||||
int ret = OPAL_ERR_NOT_IMPLEMENTED;
|
|
||||||
|
|
||||||
#ifndef HAVE_MKFIFO
|
|
||||||
return ret;
|
|
||||||
#else
|
|
||||||
#ifdef __WINDOWS__
|
|
||||||
return ret;
|
|
||||||
#else
|
|
||||||
/*
|
|
||||||
* Open up the read pipe
|
|
||||||
*/
|
|
||||||
if( (ret = mkfifo(prog_named_pipe_r, 0660)) < 0) {
|
|
||||||
if(EEXIST == ret || -1 == ret ) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: notify_reopen_files: mkfifo failed because file (%s) already exists, attempting to use this pipe. (%d)",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: notify_reopen_files: Error: mkfifo failed to make named pipe (%s). (%d)\n",
|
|
||||||
prog_named_pipe_r, ret);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*prog_read_fd = open(prog_named_pipe_r, O_RDWR);
|
|
||||||
if(*prog_read_fd < 0) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: init: Error: open failed to open the named pipe (%s). %d\n",
|
|
||||||
prog_named_pipe_r, *prog_read_fd);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Open up the write pipe
|
|
||||||
*/
|
|
||||||
if( (ret = mkfifo(prog_named_pipe_w, 0660)) < 0) {
|
|
||||||
if(EEXIST == ret || -1 == ret ) {
|
|
||||||
opal_output_verbose(10, opal_cr_output,
|
|
||||||
"opal_cr: notify_reopen_files: mkfifo failed because file (%s) already exists, attempting to use this pipe. (%d)",
|
|
||||||
prog_named_pipe_w, ret);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: notify_reopen_files: Error: mkfifo failed to make named pipe (%s). (%d)\n",
|
|
||||||
prog_named_pipe_w, ret);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*prog_write_fd = open(prog_named_pipe_w, O_WRONLY);
|
|
||||||
if(*prog_write_fd < 0) {
|
|
||||||
opal_output(opal_cr_output,
|
|
||||||
"opal_cr: notify_reopen_files: Error: open failed to open the named pipe (%s). (%d)\n",
|
|
||||||
prog_named_pipe_w, *prog_write_fd);
|
|
||||||
return OPAL_ERROR;
|
|
||||||
}
|
|
||||||
|
|
||||||
return OPAL_SUCCESS;
|
|
||||||
#endif /* __WINDOWS__ */
|
|
||||||
#endif /* HAVE_MKFIFO */
|
|
||||||
}
|
|
||||||
|
|
||||||
#if OPAL_ENABLE_FT_THREAD == 1
|
#if OPAL_ENABLE_FT_THREAD == 1
|
||||||
static void* opal_cr_thread_fn(opal_object_t *obj)
|
static void* opal_cr_thread_fn(opal_object_t *obj)
|
||||||
{
|
{
|
||||||
|
@ -61,28 +61,39 @@ enum opal_cr_ckpt_cmd_state_t {
|
|||||||
OPAL_CR_STATUS_NONE, /* No checkpoint in progress */
|
OPAL_CR_STATUS_NONE, /* No checkpoint in progress */
|
||||||
OPAL_CR_STATUS_REQUESTED, /* Checkpoint has been requested */
|
OPAL_CR_STATUS_REQUESTED, /* Checkpoint has been requested */
|
||||||
OPAL_CR_STATUS_RUNNING, /* Checkpoint is currently running */
|
OPAL_CR_STATUS_RUNNING, /* Checkpoint is currently running */
|
||||||
OPAL_CR_STATUS_TERM /* Checkpoint is running and will terminate process upon completion */
|
OPAL_CR_STATUS_TERM, /* Checkpoint is running and will terminate process upon completion */
|
||||||
|
/* State of the continue operation */
|
||||||
|
OPAL_CR_STATUS_CONTINUE,
|
||||||
|
/* State of the restart operation */
|
||||||
|
OPAL_CR_STATUS_RESTART_PRE,
|
||||||
|
OPAL_CR_STATUS_RESTART_POST
|
||||||
};
|
};
|
||||||
typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
||||||
|
|
||||||
|
/* An output handle to be used by the cr runtime
|
||||||
|
* functionality as an argument to opal_output() */
|
||||||
|
OPAL_DECLSPEC extern int opal_cr_output;
|
||||||
|
|
||||||
/* Directory containing the named pipes for communication
|
/* Directory containing the named pipes for communication
|
||||||
* with the opal-checkpoint tool */
|
* with the opal-checkpoint tool */
|
||||||
OPAL_DECLSPEC extern char * opal_cr_pipe_dir;
|
OPAL_DECLSPEC extern char * opal_cr_pipe_dir;
|
||||||
|
|
||||||
/* Signal that opal-checkpoint uses to contact the
|
/* Signal that opal-checkpoint uses to contact the
|
||||||
* application process */
|
* application process */
|
||||||
OPAL_DECLSPEC extern int opal_cr_entry_point_signal;
|
OPAL_DECLSPEC extern int opal_cr_entry_point_signal;
|
||||||
|
|
||||||
/* If Checkpointing is enabled in this application */
|
/* If Checkpointing is enabled in this application */
|
||||||
OPAL_DECLSPEC extern bool opal_cr_is_enabled;
|
OPAL_DECLSPEC extern bool opal_cr_is_enabled;
|
||||||
|
|
||||||
/* If the application running is a tool
|
/* If the application running is a tool
|
||||||
* (e.g., opal-checkpoint, orted, ...) */
|
* (e.g., opal-checkpoint, orted, ...) */
|
||||||
OPAL_DECLSPEC extern bool opal_cr_is_tool;
|
OPAL_DECLSPEC extern bool opal_cr_is_tool;
|
||||||
/* An output handle to be used by the cr runtime
|
|
||||||
* functionality as an argument to opal_output() */
|
|
||||||
OPAL_DECLSPEC extern int opal_cr_output;
|
|
||||||
/* If a checkpoint has been requested */
|
/* If a checkpoint has been requested */
|
||||||
OPAL_DECLSPEC extern int opal_cr_checkpoint_request;
|
OPAL_DECLSPEC extern int opal_cr_checkpoint_request;
|
||||||
|
|
||||||
/* The current state of a checkpoint operation */
|
/* The current state of a checkpoint operation */
|
||||||
OPAL_DECLSPEC extern int opal_cr_checkpointing;
|
OPAL_DECLSPEC extern int opal_cr_checkpointing_state;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If this is an application that doesn't want to have
|
* If this is an application that doesn't want to have
|
||||||
@ -217,16 +228,6 @@ typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;
|
|||||||
/*******************************
|
/*******************************
|
||||||
* Notification Routines
|
* Notification Routines
|
||||||
*******************************/
|
*******************************/
|
||||||
/*
|
|
||||||
* Init OPAL entry point functionality
|
|
||||||
*/
|
|
||||||
OPAL_DECLSPEC int opal_cr_entry_point_init(void);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Finalize OPAL entry point functionality
|
|
||||||
*/
|
|
||||||
OPAL_DECLSPEC int opal_cr_entry_point_finalize(void);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A function to respond to the async checkpoint request
|
* A function to respond to the async checkpoint request
|
||||||
* this is useful when figuring out who should respond
|
* this is useful when figuring out who should respond
|
||||||
|
@ -41,6 +41,9 @@
|
|||||||
#ifdef HAVE_SYS_STAT_H
|
#ifdef HAVE_SYS_STAT_H
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAVE_FCNTL_H
|
||||||
|
#include <fcntl.h>
|
||||||
|
#endif /* HAVE_FCNTL_H */
|
||||||
#ifdef HAVE_SYS_TYPES_H
|
#ifdef HAVE_SYS_TYPES_H
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#endif
|
#endif
|
||||||
@ -75,7 +78,7 @@ static int initialize(int argc, char *argv[]);
|
|||||||
static int finalize(void);
|
static int finalize(void);
|
||||||
static int parse_args(int argc, char *argv[]);
|
static int parse_args(int argc, char *argv[]);
|
||||||
static int check_file(char *given_filename);
|
static int check_file(char *given_filename);
|
||||||
static int post_env_vars(int prev_pid);
|
static int post_env_vars(int prev_pid, char *location);
|
||||||
|
|
||||||
/*****************************************
|
/*****************************************
|
||||||
* Global Vars for Command line Arguments
|
* Global Vars for Command line Arguments
|
||||||
@ -185,7 +188,7 @@ main(int argc, char *argv[])
|
|||||||
char * base = NULL;
|
char * base = NULL;
|
||||||
|
|
||||||
base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename);
|
base = opal_crs_base_get_snapshot_directory(opal_restart_globals.filename);
|
||||||
expected_crs_comp = strdup(opal_crs_base_extract_expected_component(base, &prev_pid));
|
opal_crs_base_extract_expected_component(base, &expected_crs_comp, &prev_pid);
|
||||||
|
|
||||||
free(base);
|
free(base);
|
||||||
}
|
}
|
||||||
@ -260,12 +263,18 @@ main(int argc, char *argv[])
|
|||||||
|
|
||||||
/* Since some checkpoint/restart systems don't pass along env vars to the
|
/* Since some checkpoint/restart systems don't pass along env vars to the
|
||||||
* restarted app, we need to take care of that.
|
* restarted app, we need to take care of that.
|
||||||
|
*
|
||||||
|
* Included here is the creation of any files or directories that need to be
|
||||||
|
* created before the process is restarted.
|
||||||
*/
|
*/
|
||||||
if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid) ) ) {
|
if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot->local_location) ) ) {
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Do the actual restart
|
||||||
|
*/
|
||||||
ret = opal_crs.crs_restart(snapshot,
|
ret = opal_crs.crs_restart(snapshot,
|
||||||
opal_restart_globals.forked,
|
opal_restart_globals.forked,
|
||||||
&child_pid);
|
&child_pid);
|
||||||
@ -513,11 +522,14 @@ static int check_file(char *given_filename)
|
|||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int post_env_vars(int prev_pid)
|
static int post_env_vars(int prev_pid, char *location)
|
||||||
{
|
{
|
||||||
int ret, exit_status = OPAL_SUCCESS;
|
int ret, exit_status = OPAL_SUCCESS;
|
||||||
char *command = NULL;
|
char *command = NULL;
|
||||||
char *proc_file = NULL;
|
char *proc_file = NULL;
|
||||||
|
char **loc_touch = NULL;
|
||||||
|
char **loc_mkdir = NULL;
|
||||||
|
int argc, i;
|
||||||
|
|
||||||
if( 0 > prev_pid ) {
|
if( 0 > prev_pid ) {
|
||||||
opal_output(opal_restart_globals.output,
|
opal_output(opal_restart_globals.output,
|
||||||
@ -535,17 +547,82 @@ static int post_env_vars(int prev_pid)
|
|||||||
asprintf(&proc_file, "/tmp/%s-%d", OPAL_CR_BASE_ENV_NAME, prev_pid);
|
asprintf(&proc_file, "/tmp/%s-%d", OPAL_CR_BASE_ENV_NAME, prev_pid);
|
||||||
asprintf(&command, "env | grep OMPI_ > %s", proc_file);
|
asprintf(&command, "env | grep OMPI_ > %s", proc_file);
|
||||||
|
|
||||||
|
opal_output_verbose(5, opal_restart_globals.output,
|
||||||
|
"post_env_vars: Execute: <%s>", command);
|
||||||
|
|
||||||
ret = system(command);
|
ret = system(command);
|
||||||
if( 0 > ret) {
|
if( 0 > ret) {
|
||||||
exit_status = ret;
|
exit_status = ret;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup:
|
/*
|
||||||
if( NULL != command)
|
* Any directories that need to be created
|
||||||
|
*/
|
||||||
|
opal_crs_base_metadata_read_token(location, CRS_METADATA_MKDIR, &loc_mkdir);
|
||||||
|
argc = opal_argv_count(loc_mkdir);
|
||||||
|
for( i = 0; i < argc; ++i ) {
|
||||||
|
if( NULL != command ) {
|
||||||
free(command);
|
free(command);
|
||||||
if( NULL != proc_file)
|
command = NULL;
|
||||||
|
}
|
||||||
|
asprintf(&command, "mkdir -p %s", loc_mkdir[i]);
|
||||||
|
|
||||||
|
opal_output_verbose(5, opal_restart_globals.output,
|
||||||
|
"post_env_vars: Execute: <%s>", command);
|
||||||
|
|
||||||
|
ret = system(command);
|
||||||
|
if( 0 > ret) {
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if( 0 < argc ) {
|
||||||
|
system("sync ; sync");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Any files that need to exist
|
||||||
|
*/
|
||||||
|
opal_crs_base_metadata_read_token(location, CRS_METADATA_TOUCH, &loc_touch);
|
||||||
|
argc = opal_argv_count(loc_touch);
|
||||||
|
for( i = 0; i < argc; ++i ) {
|
||||||
|
if( NULL != command ) {
|
||||||
|
free(command);
|
||||||
|
command = NULL;
|
||||||
|
}
|
||||||
|
asprintf(&command, "touch %s", loc_touch[i]);
|
||||||
|
|
||||||
|
opal_output_verbose(5, opal_restart_globals.output,
|
||||||
|
"post_env_vars: Execute: <%s>", command);
|
||||||
|
|
||||||
|
ret = system(command);
|
||||||
|
if( 0 > ret) {
|
||||||
|
exit_status = ret;
|
||||||
|
goto cleanup;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if( 0 < argc ) {
|
||||||
|
system("sync ; sync");
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup:
|
||||||
|
if( NULL != command) {
|
||||||
|
free(command);
|
||||||
|
command = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != proc_file) {
|
||||||
free(proc_file);
|
free(proc_file);
|
||||||
|
proc_file = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != loc_mkdir ) {
|
||||||
|
opal_argv_free(loc_mkdir);
|
||||||
|
loc_mkdir = NULL;
|
||||||
|
}
|
||||||
|
if( NULL != loc_touch ) {
|
||||||
|
opal_argv_free(loc_touch);
|
||||||
|
loc_touch = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
}
|
}
|
||||||
|
@ -631,7 +631,7 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
char *snapshot_ref,
|
char *snapshot_ref,
|
||||||
char *snapshot_location)
|
char *snapshot_location)
|
||||||
{
|
{
|
||||||
int exit_status = ORTE_SUCCESS;
|
int ret, exit_status = ORTE_SUCCESS;
|
||||||
FILE * meta_data = NULL;
|
FILE * meta_data = NULL;
|
||||||
char * meta_data_fname = NULL;
|
char * meta_data_fname = NULL;
|
||||||
char * crs_comp = NULL;
|
char * crs_comp = NULL;
|
||||||
@ -659,8 +659,7 @@ int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
|
|||||||
orte_util_convert_process_name_to_string(&proc_name, proc);
|
orte_util_convert_process_name_to_string(&proc_name, proc);
|
||||||
|
|
||||||
/* Extract the checkpointer */
|
/* Extract the checkpointer */
|
||||||
crs_comp = opal_crs_base_extract_expected_component(snapshot_location, &prev_pid);
|
if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(snapshot_location, &crs_comp, &prev_pid)) ) {
|
||||||
if( NULL == crs_comp ) {
|
|
||||||
exit_status = ORTE_ERROR;
|
exit_status = ORTE_ERROR;
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
@ -86,7 +86,6 @@ int app_coord_init() {
|
|||||||
* Register the INC notification callback
|
* Register the INC notification callback
|
||||||
*/
|
*/
|
||||||
opal_cr_reg_notify_callback(snapc_full_app_notify_response, &prev_notify_func);
|
opal_cr_reg_notify_callback(snapc_full_app_notify_response, &prev_notify_func);
|
||||||
opal_cr_entry_point_finalize();
|
|
||||||
|
|
||||||
/* String representation of the PID */
|
/* String representation of the PID */
|
||||||
asprintf(&tmp_pid, "%d", getpid());
|
asprintf(&tmp_pid, "%d", getpid());
|
||||||
@ -198,9 +197,15 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* Begin checkpoint
|
* Begin checkpoint
|
||||||
|
* - Init the checkpoint metadata file
|
||||||
*/
|
*/
|
||||||
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
OPAL_OUTPUT_VERBOSE((10, mca_snapc_full_component.super.output_handle,
|
||||||
"App) notify_response: Start checkpoint..."));
|
"App) notify_response: Start checkpoint..."));
|
||||||
|
if( OPAL_SUCCESS != (ret = opal_crs_base_init_snapshot_directory(local_snapshot) ) ) {
|
||||||
|
opal_output(0, "App) Error: Unable to initalize the snapshot directory!\n");
|
||||||
|
exit_status = ret;
|
||||||
|
goto ckpt_cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
STAGE_1:
|
STAGE_1:
|
||||||
opal_cr_currently_stalled = false;
|
opal_cr_currently_stalled = false;
|
||||||
@ -269,7 +274,7 @@ int snapc_full_app_notify_response(opal_cr_ckpt_cmd_state_t resp)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Prepare to wait for another checkpoint action */
|
/* Prepare to wait for another checkpoint action */
|
||||||
opal_cr_checkpointing = OPAL_CR_STATUS_NONE;
|
opal_cr_checkpointing_state = OPAL_CR_STATUS_NONE;
|
||||||
opal_cr_currently_stalled = false;
|
opal_cr_currently_stalled = false;
|
||||||
|
|
||||||
return exit_status;
|
return exit_status;
|
||||||
|
Загрузка…
x
Ссылка в новой задаче
Block a user