From 5d0607395df80faffdfc2d6fcce67179ccfe2a8c Mon Sep 17 00:00:00 2001 From: Josh Hursey Date: Fri, 8 May 2009 19:41:11 +0000 Subject: [PATCH] A couple of C/R related commits that have been sitting off-trunk for a while. * Add 'orte-checkpoint -l' option that lists all checkpoints currently available on the system. * Add 'orte-restart -i' which prints information regarding the checkpoint targeted for restart. * Add ability to extract the timing metadata. * Fix show_help() in the orte-checkpoint and orte-restart tools. They should be using the opal versions instead of the orte versions (otherwise nothing is printed). This commit was SVN r21194. --- opal/mca/crs/blcr/crs_blcr_module.c | 2 +- opal/tools/opal-restart/opal-restart.c | 5 + orte/mca/snapc/base/base.h | 3 + orte/mca/snapc/base/snapc_base_fns.c | 166 ++++++++++++++++++- orte/mca/snapc/snapc.h | 6 + orte/tools/orte-checkpoint/orte-checkpoint.c | 106 ++++++++++-- orte/tools/orte-restart/orte-restart.c | 144 +++++++++++++++- 7 files changed, 411 insertions(+), 21 deletions(-) diff --git a/opal/mca/crs/blcr/crs_blcr_module.c b/opal/mca/crs/blcr/crs_blcr_module.c index 89f85d4f4e..b860dfac69 100644 --- a/opal/mca/crs/blcr/crs_blcr_module.c +++ b/opal/mca/crs/blcr/crs_blcr_module.c @@ -587,7 +587,7 @@ static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname) int ret; pid_t child_pid; int exit_status = OPAL_SUCCESS; - int status, child_status; + int status, child_status; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: checkpoint_peer(%d, --)", pid); diff --git a/opal/tools/opal-restart/opal-restart.c b/opal/tools/opal-restart/opal-restart.c index 6c365ad71b..7588052cfe 100644 --- a/opal/tools/opal-restart/opal-restart.c +++ b/opal/tools/opal-restart/opal-restart.c @@ -480,6 +480,8 @@ static int check_file(char *given_filename) char **argv = NULL; if(NULL == given_filename) { + opal_output(opal_restart_globals.output, + "Error: No filename provided!"); exit_status = 
OPAL_ERROR; goto cleanup; } @@ -509,6 +511,9 @@ static int check_file(char *given_filename) path_to_check); if (0 > (ret = access(path_to_check, F_OK)) ) { + opal_output(opal_restart_globals.output, + "Error: Unable to access the path [%s]!", + path_to_check); exit_status = OPAL_ERROR; goto cleanup; } diff --git a/orte/mca/snapc/base/base.h b/orte/mca/snapc/base/base.h index 1a949494fc..4287ccda5c 100644 --- a/orte/mca/snapc/base/base.h +++ b/orte/mca/snapc/base/base.h @@ -155,6 +155,9 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type; ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref); ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot); + ORTE_DECLSPEC int orte_snapc_base_get_all_snapshot_refs(char *base_dir, int *num_refs, char ***snapshot_refs); + ORTE_DECLSPEC int orte_snapc_base_get_all_snapshot_ref_seqs(char *base_dir, char *snapshot_name, int *num_seqs, int **snapshot_ref_seqs); + /******************************* * Global Coordinator functions *******************************/ diff --git a/orte/mca/snapc/base/snapc_base_fns.c b/orte/mca/snapc/base/snapc_base_fns.c index 95b689cbed..88f5c0ae2a 100644 --- a/orte/mca/snapc/base/snapc_base_fns.c +++ b/orte/mca/snapc/base/snapc_base_fns.c @@ -27,6 +27,15 @@ #ifdef HAVE_UNISTD_H #include <unistd.h> #endif /* HAVE_UNISTD_H */ +#if HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif /* HAVE_SYS_TYPES_H */ +#if HAVE_SYS_STAT_H +#include <sys/stat.h> +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_DIRENT_H +#include <dirent.h> +#endif /* HAVE_DIRENT_H */ #include #include "opal/mca/mca.h" @@ -36,6 +45,7 @@ #include "opal/util/os_dirpath.h" #include "opal/util/output.h" #include "opal/util/basename.h" +#include "opal/util/argv.h" #include "opal/mca/crs/crs.h" #include "opal/mca/crs/base/base.h" @@ -135,6 +145,9 @@ void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t free(tmp_dir); snapshot->seq_num = 0; + + snapshot->start_time = NULL; + snapshot->end_time
= NULL; } void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot) @@ -156,6 +169,16 @@ void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t snapshot->local_location = NULL; } + if(NULL != snapshot->start_time) { + free(snapshot->start_time); + snapshot->start_time = NULL; + } + + if(NULL != snapshot->end_time) { + free(snapshot->end_time); + snapshot->end_time = NULL; + } + snapshot->seq_num = 0; } @@ -456,6 +479,207 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer, /***************************** * Snapshot metadata functions *****************************/
+/*
+ * Collect the names of all global snapshot references found in a directory.
+ *
+ * A reference is a sub-directory of base_dir that contains a regular file
+ * named orte_snapc_base_metadata_filename.
+ *
+ * base_dir      Directory to scan; when NULL,
+ *               orte_snapc_base_global_snapshot_dir is used instead.
+ * num_refs      In/out entry count handed to opal_argv_append(); it is not
+ *               reset here, so the caller must initialize it.
+ * snapshot_refs In/out argv-style array handed to opal_argv_append(); it is
+ *               not reset here, so the caller must initialize it.
+ *
+ * Returns ORTE_SUCCESS, ORTE_ERR_NOT_SUPPORTED when HAVE_DIRENT_H is not
+ * defined, or an error code when the directory cannot be scanned.
+ */
+int orte_snapc_base_get_all_snapshot_refs(char *base_dir, int *num_refs, char ***snapshot_refs)
+{
+#ifndef HAVE_DIRENT_H
+    /* ORTE code reports ORTE_* codes, not the OMPI-layer constant */
+    return ORTE_ERR_NOT_SUPPORTED;
+#else
+    int ret, exit_status = ORTE_SUCCESS;
+    char * tmp_str = NULL, * metadata_file = NULL;
+    DIR *dirp = NULL;
+    struct dirent *dir_entp = NULL;
+    struct stat file_status;
+
+    if( NULL == base_dir ) {
+        if( NULL == orte_snapc_base_global_snapshot_dir ) {
+            exit_status = ORTE_ERROR;
+            goto cleanup;
+        }
+        /* The path is only read, so borrow it instead of leaking a strdup() */
+        base_dir = orte_snapc_base_global_snapshot_dir;
+    }
+
+    /*
+     * Get all subdirectories under the base directory
+     */
+    if( NULL == (dirp = opendir(base_dir)) ) {
+        opal_output(0, "Error: Unable to open the directory <%s>\n", base_dir);
+        exit_status = ORTE_ERROR;
+        goto cleanup;
+    }
+
+    while( NULL != (dir_entp = readdir(dirp)) ) {
+        /* Skip exactly "." and ".."; a prefix compare would drop every dot-name */
+        if( 0 == strcmp(".", dir_entp->d_name) ||
+            0 == strcmp("..", dir_entp->d_name) ) {
+            continue;
+        }
+
+        /* Add the full path */
+        if( 0 > asprintf(&tmp_str, "%s/%s", base_dir, dir_entp->d_name) ) {
+            tmp_str = NULL; /* contents are undefined after a failure */
+            exit_status = ORTE_ERR_OUT_OF_RESOURCE;
+            goto cleanup;
+        }
+
+        /* Only a directory can be a snapshot reference ... */
+        if( 0 == stat(tmp_str, &file_status) &&
+            S_ISDIR(file_status.st_mode) ) {
+            if( 0 > asprintf(&metadata_file, "%s/%s",
+                             tmp_str,
+                             orte_snapc_base_metadata_filename) ) {
+                metadata_file = NULL; /* contents are undefined after a failure */
+                exit_status = ORTE_ERR_OUT_OF_RESOURCE;
+                goto cleanup;
+            }
+
+            /* ... and only if it holds a regular metadata file */
+            if( 0 == stat(metadata_file, &file_status) &&
+                S_ISREG(file_status.st_mode) ) {
+                if( ORTE_SUCCESS != (ret = opal_argv_append(num_refs, snapshot_refs, dir_entp->d_name)) ) {
+                    exit_status = ret;
+                    goto cleanup;
+                }
+            }
+
+            free( metadata_file);
+            metadata_file = NULL;
+        }
+
+        free( tmp_str);
+        tmp_str = NULL;
+    }
+
+ cleanup:
+    if( NULL != dirp ) {
+        closedir(dirp);
+        dirp = NULL;
+    }
+    if( NULL != metadata_file) {
+        free( metadata_file);
+        metadata_file = NULL;
+    }
+    if( NULL != tmp_str) {
+        free( tmp_str);
+        tmp_str = NULL;
+    }
+
+    return exit_status;
+#endif /* HAVE_DIRENT_H */
+}
+
+/*
+ * List the valid sequence numbers recorded for one global snapshot reference.
+ *
+ * Reads <base_dir>/<snapshot_name>/<orte_snapc_base_metadata_filename> with
+ * get_next_valid_seq_number().
+ *
+ * base_dir          Directory holding the reference; when NULL,
+ *                   orte_snapc_base_global_snapshot_dir is used instead.
+ * snapshot_name     Name of the snapshot reference directory.
+ * num_seqs          Out: number of entries stored in *snapshot_ref_seqs.
+ * snapshot_ref_seqs Out: malloc()ed array the caller must free(); NULL
+ *                   when nothing was allocated.
+ *
+ * Returns ORTE_SUCCESS (also when there are no sequence numbers) or an
+ * error code.
+ */
+int orte_snapc_base_get_all_snapshot_ref_seqs(char *base_dir, char *snapshot_name, int *num_seqs, int **snapshot_ref_seqs)
+{
+    int exit_status = ORTE_SUCCESS;
+    char * metadata_file = NULL;
+    FILE * meta_data = NULL;
+    int s, next_seq_int;
+
+    /* Leave the outputs well defined on every return path */
+    *num_seqs = 0;
+    *snapshot_ref_seqs = NULL;
+
+    if( NULL == base_dir ) {
+        if( NULL == orte_snapc_base_global_snapshot_dir ) {
+            exit_status = ORTE_ERROR;
+            goto cleanup;
+        }
+        /* The path is only read, so borrow it instead of leaking a strdup() */
+        base_dir = orte_snapc_base_global_snapshot_dir;
+    }
+
+    if( 0 > asprintf(&metadata_file, "%s/%s/%s",
+                     base_dir,
+                     snapshot_name,
+                     orte_snapc_base_metadata_filename) ) {
+        metadata_file = NULL; /* contents are undefined after a failure */
+        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+
+    if (NULL == (meta_data = fopen(metadata_file, "r")) ) {
+        opal_output(0, "Error: Unable to open the file <%s>\n", metadata_file);
+        exit_status = ORTE_ERROR;
+        goto cleanup;
+    }
+
+    /* First pass to count the number of sequence numbers */
+    s = 0;
+    while(0 <= (next_seq_int = get_next_valid_seq_number(meta_data)) ){
+        ++s;
+    }
+
+    /* If there are no valid seq numbers then just return here */
+    if( 0 == s ) {
+        exit_status = ORTE_SUCCESS;
+        goto cleanup;
+    }
+
+    (*snapshot_ref_seqs) = (int *) malloc(sizeof(int) * s);
+    if( NULL == (*snapshot_ref_seqs) ) {
+        exit_status = ORTE_ERR_OUT_OF_RESOURCE;
+        goto cleanup;
+    }
+
+    rewind(meta_data);
+
+    /*
+     * Second pass to add them to the list. Bounded by the first-pass count so
+     * a file that grew in between cannot overrun the array; the count that is
+     * reported is the number of entries actually stored.
+     */
+    while( *num_seqs < s &&
+           0 <= (next_seq_int = get_next_valid_seq_number(meta_data)) ){
+        (*snapshot_ref_seqs)[*num_seqs] = next_seq_int;
+        *num_seqs += 1;
+    }
+
+ cleanup:
+    if(NULL != meta_data) {
+        fclose(meta_data);
+        meta_data = NULL;
+    }
+    if(NULL != metadata_file) {
+        free(metadata_file);
+        metadata_file = NULL;
+    }
+
+    return exit_status;
+}
+
 int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid) { if( NULL == orte_snapc_base_global_snapshot_ref ) { @@ -767,7 +991,12 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s break; } else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) { - ; + if( NULL == global_snapshot->start_time) { + global_snapshot->start_time = strdup(value); + } + else { + global_snapshot->end_time = strdup(value); + } } else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) { orte_process_name_t proc; diff --git a/orte/mca/snapc/snapc.h b/orte/mca/snapc/snapc.h index 1df1ee3ceb..b5c94c4c1a 100644 --- a/orte/mca/snapc/snapc.h +++ b/orte/mca/snapc/snapc.h @@ -165,6 +165,12 @@ struct orte_snapc_base_global_snapshot_1_0_0_t { /** Sequence Number */ int seq_num; + + /** Start Timestamp */ + char * start_time; + + /** End Timestamp */ + char * end_time; }; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t; diff --git a/orte/tools/orte-checkpoint/orte-checkpoint.c b/orte/tools/orte-checkpoint/orte-checkpoint.c index 49598363cf..d93134a0f1 100644 --- a/orte/tools/orte-checkpoint/orte-checkpoint.c +++ b/orte/tools/orte-checkpoint/orte-checkpoint.c @@ -68,7 +68,7 @@ #include "orte/util/hnp_contact.h" #include "orte/runtime/orte_globals.h" #include "orte/util/name_fns.h" -#include "orte/util/show_help.h" +#include "opal/util/show_help.h" #include "orte/util/proc_info.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml_types.h" @@ -102,6 +102,8 @@ static int notify_process_for_checkpoint(int term); static int pretty_print_status(void); static int pretty_print_reference(void); +static int
list_all_snapshots(void); + static orte_hnp_contact_t *orterun_hnp = NULL; static char * global_snapshot_handle = NULL; static int global_sequence_num = 0; @@ -126,6 +128,7 @@ typedef struct { bool status; /* Display status messages while checkpoint is progressing */ int output; int ckpt_status; + bool list_only; /* List available checkpoints only */ } orte_checkpoint_globals_t; orte_checkpoint_globals_t orte_checkpoint_globals; @@ -180,7 +183,13 @@ opal_cmd_line_init_t cmd_line_opts[] = { &orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT, "This should be the pid of the mpirun whose applications you wish " "to checkpoint." }, - + + { NULL, NULL, NULL, + 'l', NULL, "list", + 0, + &orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL, + "Display a list of checkpoint files available on this machine" }, + /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, 0, NULL, OPAL_CMD_LINE_TYPE_NULL, @@ -200,6 +209,18 @@ main(int argc, char *argv[]) goto cleanup; } + /************************************* + * Listing only Checkpoint References + *************************************/ + if( orte_checkpoint_globals.list_only ) { + if (ORTE_SUCCESS != (ret = list_all_snapshots())) { + exit_status = ret; + goto cleanup; + } + exit_status = ORTE_SUCCESS; + goto cleanup; + } + /*************************** * Find the HNP that we want to connect to, if it exists ***************************/ @@ -238,7 +259,7 @@ main(int argc, char *argv[]) } if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.term)) ) { - orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, + opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, orte_checkpoint_globals.pid, ret); exit_status = ret; goto cleanup; @@ -255,7 +276,7 @@ main(int argc, char *argv[]) } if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) { - orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, + opal_show_help("help-orte-checkpoint.txt",
"ckpt_failure", true, orte_checkpoint_globals.pid, ORTE_ERROR); exit_status = ORTE_ERROR; goto cleanup; @@ -299,6 +320,7 @@ static int parse_args(int argc, char *argv[]) { orte_checkpoint_globals.status = false; orte_checkpoint_globals.output = -1; orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE; + orte_checkpoint_globals.list_only = false; /* Parse the command line options */ opal_cmd_line_create(&cmd_line, cmd_line_opts); @@ -334,12 +356,17 @@ static int parse_args(int argc, char *argv[]) { /* get the remaining bits */ opal_cmd_line_get_tail(&cmd_line, &argc, &argv); + if(orte_checkpoint_globals.list_only ) { + exit_status = ORTE_SUCCESS; + goto cleanup; + } + #if OPAL_ENABLE_FT == 0 /* Warn and exit if not configured with Checkpoint/Restart */ { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); - orte_show_help("help-orte-checkpoint.txt", "usage-no-cr", + opal_show_help("help-orte-checkpoint.txt", "usage-no-cr", true, args); free(args); exit_status = ORTE_ERROR; @@ -352,7 +379,7 @@ static int parse_args(int argc, char *argv[]) { (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); - orte_show_help("help-orte-checkpoint.txt", "usage", true, + opal_show_help("help-orte-checkpoint.txt", "usage", true, args); free(args); exit_status = ORTE_ERROR; @@ -379,7 +406,7 @@ static int parse_args(int argc, char *argv[]) { orte_checkpoint_globals.pid = atoi(argv[0]); if ( 0 >= orte_checkpoint_globals.pid ) { - orte_show_help("help-orte-checkpoint.txt", "invalid_pid", true, + opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true, orte_checkpoint_globals.pid); exit_status = ORTE_ERROR; goto cleanup; @@ -390,7 +417,7 @@ static int parse_args(int argc, char *argv[]) { */ if(orte_checkpoint_globals.nowait) { orte_checkpoint_globals.nowait = false; - orte_show_help("help-orte-checkpoint.txt", "not_impl", + opal_show_help("help-orte-checkpoint.txt",
"not_impl", true, "Disconnected checkpoint"); } @@ -648,7 +675,7 @@ static void process_ckpt_update_cmd(orte_process_name_t* sender, * If the job is not able to be checkpointed, then return */ if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) { - orte_show_help("help-orte-checkpoint.txt", "non-ckptable", + opal_show_help("help-orte-checkpoint.txt", "non-ckptable", true, orte_checkpoint_globals.pid); exit_status = ORTE_ERROR; @@ -724,7 +751,7 @@ notify_process_for_checkpoint(int term) } if( ORTE_SUCCESS != exit_status ) { - orte_show_help("help-orte-checkpoint.txt", "unable_to_connect", true, + opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true, orte_checkpoint_globals.pid); } @@ -790,3 +817,75 @@ static int pretty_print_reference(void) { return ORTE_SUCCESS; }
+
+/*
+ * Print every snapshot reference found in orte_snapc_base_global_snapshot_dir
+ * together with its valid sequence numbers, one reference per line.
+ *
+ * Returns ORTE_SUCCESS, or the error code of the first lookup that failed.
+ */
+static int list_all_snapshots(void) {
+    int ret, exit_status = ORTE_SUCCESS;
+    char **snapshot_refs = NULL;
+    int i, num_snapshot_refs = 0;
+    int *snapshot_ref_seqs = NULL;
+    int s, num_snapshot_ref_seqs = 0;
+    /* The directory may be unset when a lookup fails; never pass NULL to %s */
+    const char *snapshot_dir = (NULL != orte_snapc_base_global_snapshot_dir)
+        ? orte_snapc_base_global_snapshot_dir : "(null)";
+
+    /* Get all of the snapshot references */
+    if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_refs(NULL, &num_snapshot_refs, &snapshot_refs) ) ) {
+        opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
+                    snapshot_dir);
+        exit_status = ret;
+        goto cleanup;
+    }
+
+    /* For each snapshot reference, get a list of the valid seq numbers */
+    for(i = 0; i < num_snapshot_refs; ++i) {
+        if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_ref_seqs(NULL, snapshot_refs[i],
+                                                                             &num_snapshot_ref_seqs,
+                                                                             &snapshot_ref_seqs) ) ) {
+            opal_output(0, "Error: Unable to list the sequence numbers for the checkpoint <%s> in directory <%s>\n",
+                        snapshot_refs[i],
+                        snapshot_dir);
+            exit_status = ret;
+            goto cleanup;
+        }
+
+        /* Pretty print the result */
+        printf("Snapshot Ref.: %s\t[", snapshot_refs[i]);
+        if( 0 >= num_snapshot_ref_seqs ) {
+            printf("No Valid Checkpoints");
+        }
+        for(s = 0; s < num_snapshot_ref_seqs; ++s) {
+            if( s != 0 ) {
+                printf(",");
+            }
+            printf("%d", snapshot_ref_seqs[s]);
+        }
+        printf("]\n");
+
+        if( NULL != snapshot_ref_seqs ) {
+            free(snapshot_ref_seqs);
+            snapshot_ref_seqs = NULL;
+        }
+    }
+
+ cleanup:
+    if( NULL != snapshot_ref_seqs ) {
+        free(snapshot_ref_seqs);
+        snapshot_ref_seqs = NULL;
+    }
+    if( NULL != snapshot_refs ) {
+        /* Release each name as well as the array that holds them */
+        for(i = 0; i < num_snapshot_refs; ++i) {
+            free(snapshot_refs[i]);
+        }
+        free(snapshot_refs);
+        snapshot_refs = NULL;
+    }
+
+    return exit_status;
+}
diff --git a/orte/tools/orte-restart/orte-restart.c b/orte/tools/orte-restart/orte-restart.c index b846727503..bb188ebc77 100644 --- a/orte/tools/orte-restart/orte-restart.c +++ b/orte/tools/orte-restart/orte-restart.c @@ -65,7 +65,7 @@ #include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/base/base.h" #include "orte/mca/filem/base/base.h" -#include "orte/util/show_help.h" +#include "opal/util/show_help.h" #include "orte/util/proc_info.h" /****************** @@ -77,6 +77,7 @@ static int parse_args(int argc, char *argv[]); static int check_file(orte_snapc_base_global_snapshot_t *snapshot); static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot); static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *child_pid); +static int snapshot_info(orte_snapc_base_global_snapshot_t *snapshot); /***************************************** * Global Vars for Command line Arguments @@ -91,6 +92,7 @@ typedef struct { int seq_number; char *hostfile; int output; + bool info_only; } orte_restart_globals_t; orte_restart_globals_t orte_restart_globals; @@ -140,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = { &orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING, "Provide a hostfile to use for launch" }, + { NULL, NULL, NULL, + 'i', NULL, "info", + 0, + &orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL, + "Display information about the checkpoint" }, + /* End of list */ { NULL, NULL, NULL, '\0', NULL, NULL, @@ -175,12 +183,21 @@ main(int argc, char *argv[]) * Check for existence of the file */
if( ORTE_SUCCESS != (ret = check_file(snapshot)) ) { - orte_show_help("help-orte-restart.txt", "invalid_filename", true, + opal_show_help("help-orte-restart.txt", "invalid_filename", true, orte_restart_globals.filename); exit_status = ret; goto cleanup; } + if(orte_restart_globals.info_only ) { + if (ORTE_SUCCESS != (ret = snapshot_info(snapshot))) { + exit_status = ret; + goto cleanup; + } + exit_status = ORTE_SUCCESS; + goto cleanup; + } + /****************************** * Create the app file to use with mpirun/orterun ******************************/ @@ -207,7 +224,7 @@ main(int argc, char *argv[]) } if( ORTE_SUCCESS != (ret = spawn_children(snapshot, &child_pid)) ) { - orte_show_help("help-orte-restart.txt", "restart_cmd_failure", true, + opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true, orte_restart_globals.filename, ret); exit_status = ret; goto cleanup; @@ -220,8 +237,10 @@ main(int argc, char *argv[]) * Cleanup ***************/ cleanup: - if(NULL != snapshot ) + if(NULL != snapshot ) { OBJ_RELEASE(snapshot); + snapshot = NULL; + } if (OPAL_SUCCESS != (ret = finalize())) { return ret; @@ -323,7 +342,8 @@ static int parse_args(int argc, char *argv[]) false, /* preload */ -1, /* seq_number */ NULL, /* hostfile */ - -1 }; /* output*/ + -1, /* output*/ + false };/* info only */ orte_restart_globals = tmp; @@ -365,7 +385,7 @@ static int parse_args(int argc, char *argv[]) { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); - orte_show_help("help-orte-restart.txt", "usage-no-cr", + opal_show_help("help-orte-restart.txt", "usage-no-cr", true, args); free(args); return ORTE_ERROR; @@ -377,7 +397,7 @@ static int parse_args(int argc, char *argv[]) 1 >= argc) { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); - orte_show_help("help-orte-restart.txt", "usage", true, + opal_show_help("help-orte-restart.txt", "usage", true, args); free(args); return ORTE_ERROR; @@ -388,7 +408,7 @@ static int parse_args(int argc, char 
*argv[]) if ( 1 > argc ) { char *args = NULL; args = opal_cmd_line_get_usage_msg(&cmd_line); - orte_show_help("help-orte-restart.txt", "usage", true, + opal_show_help("help-orte-restart.txt", "usage", true, args); free(args); return ORTE_ERROR; @@ -397,7 +417,7 @@ static int parse_args(int argc, char *argv[]) orte_restart_globals.filename = strdup(argv[0]); if ( NULL == orte_restart_globals.filename || 0 >= strlen(orte_restart_globals.filename) ) { - orte_show_help("help-orte-restart.txt", "invalid_filename", true, + opal_show_help("help-orte-restart.txt", "invalid_filename", true, orte_restart_globals.filename); return ORTE_ERROR; } @@ -589,3 +609,130 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch return exit_status; }
+
+/*
+ * Print the metadata recorded for a snapshot to orte_restart_globals.output.
+ *
+ * Reports the sequence given by orte_restart_globals.seq_number, or every
+ * valid sequence of orte_restart_globals.filename when that is -1. For each
+ * one, snapshot->seq_num is set and the per-sequence fields of *snapshot are
+ * cleared and refilled with orte_snapc_base_extract_metadata().
+ *
+ * Returns ORTE_SUCCESS, or the error code of the first lookup that failed.
+ */
+static int snapshot_info(orte_snapc_base_global_snapshot_t *snapshot)
+{
+    int ret, exit_status = ORTE_SUCCESS;
+    int num_seqs = 0, processes, i;
+    int *snapshot_ref_seqs = NULL;
+    int *all_seqs = NULL; /* array owned by this function, if any */
+    opal_list_item_t* item = NULL;
+    orte_snapc_base_local_snapshot_t *vpid_snapshot;
+
+    if (orte_restart_globals.seq_number == -1) {
+        if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_ref_seqs(NULL, orte_restart_globals.filename, &num_seqs, &all_seqs) ) ) {
+            exit_status = ret;
+            goto cleanup;
+        }
+        snapshot_ref_seqs = all_seqs;
+        opal_output(orte_restart_globals.output,
+                    "Sequences: %d\n",
+                    num_seqs);
+    } else {
+        num_seqs = 1;
+        snapshot_ref_seqs = &orte_restart_globals.seq_number;
+    }
+
+    for (i=0; i<num_seqs; ++i) {
+        snapshot->seq_num = snapshot_ref_seqs[i];
+
+        /* Drop whatever an earlier extraction left behind */
+        while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
+            OBJ_RELEASE(item);
+        }
+
+        if( NULL != snapshot->start_time ) {
+            free( snapshot->start_time );
+            snapshot->start_time = NULL;
+        }
+
+        if( NULL != snapshot->end_time ) {
+            free( snapshot->end_time );
+            snapshot->end_time = NULL;
+        }
+
+        opal_output(orte_restart_globals.output,
+                    "Seq: %d\n",
+                    snapshot->seq_num);
+
+        if( ORTE_SUCCESS != (ret = orte_snapc_base_extract_metadata( snapshot ) ) ) {
+            exit_status = ret;
+            goto cleanup;
+        }
+
+        /* On an empty list the first item is the end marker, not a snapshot */
+        item = opal_list_get_first(&snapshot->local_snapshots);
+        vpid_snapshot = (item != opal_list_get_end(&snapshot->local_snapshots))
+            ? (orte_snapc_base_local_snapshot_t*)item : NULL;
+
+        if (NULL != snapshot->start_time ) {
+            opal_output(orte_restart_globals.output,
+                        "Begin Timestamp: %s\n",
+                        snapshot->start_time);
+        }
+
+        if (NULL != vpid_snapshot && NULL != vpid_snapshot->opal_crs ) {
+            opal_output(orte_restart_globals.output,
+                        "OPAL CRS Component: %s\n",
+                        vpid_snapshot->opal_crs);
+        }
+
+        if (NULL != snapshot->reference_name) {
+            opal_output(orte_restart_globals.output,
+                        "Snapshot Reference: %s\n",
+                        snapshot->reference_name);
+        }
+
+        if (NULL != snapshot->local_location) {
+            opal_output(orte_restart_globals.output,
+                        "Snapshot Location: %s\n",
+                        snapshot->local_location);
+        }
+
+        if (NULL != snapshot->end_time ) {
+            opal_output(orte_restart_globals.output,
+                        "End Timestamp: %s\n",
+                        snapshot->end_time);
+        }
+
+        processes = 0;
+        for(item  = opal_list_get_first(&snapshot->local_snapshots);
+            item != opal_list_get_end(&snapshot->local_snapshots);
+            item  = opal_list_get_next(item) ) {
+            processes++;
+        }
+        opal_output(orte_restart_globals.output,
+                    "Processes: %d\n",
+                    processes);
+
+        for(item  = opal_list_get_first(&snapshot->local_snapshots);
+            item != opal_list_get_end(&snapshot->local_snapshots);
+            item  = opal_list_get_next(item) ) {
+            vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
+
+            opal_output_verbose(10, orte_restart_globals.output,
+                                "Process: %u.%u",
+                                vpid_snapshot->process_name.jobid,
+                                vpid_snapshot->process_name.vpid);
+        }
+    }
+
+ cleanup:
+    /* Only the looked-up array is ours; the single-sequence case points at a global */
+    if( NULL != all_seqs ) {
+        free(all_seqs);
+        all_seqs = NULL;
+    }
+
+    return exit_status;
+}