1
1

Fix the way we determine which sequence number to restart with.

Write a sentinel entry to the metadata file that clearly marks a
sequence number as complete (as opposed to still in progress). This
way we never try to restart from an incomplete, and therefore invalid,
sequence number, which can lead to errors on restart.

This commit was SVN r14423.
Этот коммит содержится в:
Josh Hursey 2007-04-19 15:04:27 +00:00
родитель 12e5d0e817
Коммит b9da59ebc3
3 изменённых файлов: 64 добавлений и 3 удалений

Просмотреть файл

@ -136,6 +136,7 @@ extern "C" {
char * global_snapshot_ref, char * global_snapshot_ref,
char *snapshot_ref, char *snapshot_ref,
char *snapshot_location); char *snapshot_location);
/* Finish off the global snapshot metadata file: add the final timestamp and append the "finished sequence number" marker for the current sequence. */
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot); ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);
/******************************* /*******************************

Просмотреть файл

@ -47,6 +47,7 @@
******************/ ******************/
/* Some local strings to use generically with the global metadata file */ /* Some local strings to use generically with the global metadata file */
#define SNAPC_METADATA_SEQ ("# Seq: ") #define SNAPC_METADATA_SEQ ("# Seq: ")
/* Sentinel written when the metadata file is finalized; only sequence numbers carrying this marker are considered when picking the one to restart from */
#define SNAPC_METADATA_DONE_SEQ ("# Finished Seq: ")
#define SNAPC_METADATA_TIME ("# Timestamp: ") #define SNAPC_METADATA_TIME ("# Timestamp: ")
#define SNAPC_METADATA_PROCESS ("# Process: ") #define SNAPC_METADATA_PROCESS ("# Process: ")
#define SNAPC_METADATA_CRS_COMP ("# OPAL CRS Component: ") #define SNAPC_METADATA_CRS_COMP ("# OPAL CRS Component: ")
@ -55,6 +56,7 @@
static int snapc_base_reg_gpr_request( orte_jobid_t jobid, orte_gpr_notify_cb_fn_t gpr_cbfunc, void* gpr_cbdata ); static int snapc_base_reg_gpr_request( orte_jobid_t jobid, orte_gpr_notify_cb_fn_t gpr_cbfunc, void* gpr_cbdata );
static int get_next_seq_number(FILE *file); static int get_next_seq_number(FILE *file);
static int get_next_valid_seq_number(FILE *file);
static int metadata_extract_next_token(FILE *file, char **token, char **value); static int metadata_extract_next_token(FILE *file, char **token, char **value);
size_t orte_snapc_base_snapshot_seq_number = 0; size_t orte_snapc_base_snapshot_seq_number = 0;
@ -1340,6 +1342,37 @@ int orte_snapc_base_add_timestamp(char * global_snapshot_ref)
return exit_status; return exit_status;
} }
/*
 * Finalize the global snapshot metadata file.
 *
 * Adds the final timestamp (via orte_snapc_base_add_timestamp) and then
 * appends a "# Finished Seq: <n>" line carrying the current value of
 * orte_snapc_base_snapshot_seq_number. That line is the sentinel that
 * get_next_valid_seq_number() searches for, so a sequence number without
 * it is treated as still in progress.
 *
 * global_snapshot_ref: reference name of the global snapshot whose
 *                      metadata file is to be finalized.
 *
 * Returns ORTE_SUCCESS when the sentinel was written, ORTE_ERROR if the
 * metadata file name could not be obtained or the file could not be
 * opened, written, or closed.
 */
int orte_snapc_base_finalize_metadata(char * global_snapshot_ref)
{
    int exit_status = ORTE_SUCCESS;
    FILE * meta_data = NULL;
    char * meta_data_fname = NULL;

    /* Add the final timestamp */
    orte_snapc_base_add_timestamp(global_snapshot_ref);

    meta_data_fname = orte_snapc_base_get_global_snapshot_metadata_file(global_snapshot_ref);
    if( NULL == meta_data_fname ) {
        /* Passing NULL to fopen() is undefined behavior, so bail out first */
        opal_output(orte_snapc_base_output,
                    "orte:snapc:base: orte_snapc_base_finalize_metadata: Error: Unable to determine the metadata file name\n");
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    if( NULL == (meta_data = fopen(meta_data_fname, "a")) ) {
        opal_output(orte_snapc_base_output,
                    "orte:snapc:base: orte_snapc_base_finalize_metadata: Error: Unable to open the file (%s)\n",
                    meta_data_fname);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * The sequence number is written as an int because the reader parses
     * it back with an int conversion.
     */
    if( 0 > fprintf(meta_data, "%s%d\n", SNAPC_METADATA_DONE_SEQ, (int)orte_snapc_base_snapshot_seq_number) ) {
        exit_status = ORTE_ERROR;
    }

 cleanup:
    if( NULL != meta_data ) {
        /* fclose() flushes the buffered sentinel; a failure here means it may not be on disk */
        if( 0 != fclose(meta_data) ) {
            exit_status = ORTE_ERROR;
        }
    }
    if( NULL != meta_data_fname ) {
        free(meta_data_fname);
    }

    return exit_status;
}
int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc, int orte_snapc_base_add_vpid_metadata( orte_process_name_t *proc,
char * global_snapshot_ref, char * global_snapshot_ref,
char *snapshot_ref, char *snapshot_ref,
@ -1416,10 +1449,10 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
} }
/* /*
* If we were not given a sequence number, first find the largest seq number * If we were not given a sequence number, first find the largest valid seq number
*/ */
if(0 > global_snapshot->seq_num ) { if(0 > global_snapshot->seq_num ) {
while(0 <= (next_seq_int = get_next_seq_number(meta_data)) ){ while(0 <= (next_seq_int = get_next_valid_seq_number(meta_data)) ){
global_snapshot->seq_num = next_seq_int; global_snapshot->seq_num = next_seq_int;
} }
rewind(meta_data); rewind(meta_data);
@ -1524,6 +1557,33 @@ static int get_next_seq_number(FILE *file)
return seq_int; return seq_int;
} }
/*
 * Extract the next valid sequence number from the file.
 *
 * Reads token/value pairs from the current position of 'file' until a
 * token starting with SNAPC_METADATA_DONE_SEQ is found, and returns the
 * integer parsed from that pair's value.
 *
 * Returns -1 when metadata_extract_next_token() reports failure before a
 * matching token is found (the caller's loop treats any negative value as
 * "no more sequence numbers"), or when it reports success without
 * providing both strings.
 */
static int get_next_valid_seq_number(FILE *file)
{
    char *token = NULL;
    char *value = NULL;
    int seq_int = -1;
    const size_t marker_len = strlen(SNAPC_METADATA_DONE_SEQ);

    for(;;) {
        /*
         * Release the strings of the previous, non-matching entry before
         * they are overwritten. The cleanup code below shows this function
         * owns them; without this, every skipped entry would leak.
         */
        if( NULL != token ) {
            free(token);
            token = NULL;
        }
        if( NULL != value ) {
            free(value);
            value = NULL;
        }

        if( ORTE_SUCCESS != metadata_extract_next_token(file, &token, &value) ) {
            seq_int = -1;
            goto cleanup;
        }

        /* Defensive: never hand NULL to strncmp()/atoi() */
        if( NULL == token || NULL == value ) {
            seq_int = -1;
            goto cleanup;
        }

        if( 0 == strncmp(token, SNAPC_METADATA_DONE_SEQ, marker_len) ) {
            break;
        }
    }

    seq_int = atoi(value);

 cleanup:
    if( NULL != token ) {
        free(token);
    }
    if( NULL != value ) {
        free(value);
    }

    return seq_int;
}
static int metadata_extract_next_token(FILE *file, char **token, char **value) static int metadata_extract_next_token(FILE *file, char **token, char **value)
{ {
int exit_status = ORTE_SUCCESS; int exit_status = ORTE_SUCCESS;

Просмотреть файл

@ -935,7 +935,7 @@ static int snapc_full_global_gather_all_files(void) {
/* /*
* Now that we gathered all the files, finish off the metadata file * Now that we gathered all the files, finish off the metadata file
*/ */
orte_snapc_base_add_timestamp(global_snapshot.reference_name); orte_snapc_base_finalize_metadata(global_snapshot.reference_name);
cleanup: cleanup:
if(NULL != local_dir) if(NULL != local_dir)