1
1
openmpi/orte/tools/orte-restart/orte-restart.c
Jeff Squyres 97b3603036 A bunch of fixes and improvements to Open MPI's various command line tools.
* fixed some bugs where "unknown" tokens were allowed on the command
   line (which should really only be used for orterun).
 * if an unknown token is encountered, print a short error to stderr
   and quit with a nonzero exit status
 * if we don't find the right number of parameters to an option, print
   a short error to stderr and quit with a nonzero exit status
 * when --help is given, print the help message to stdout (not stderr)
   and quit with a zero exit status
 * added --showme:help option to the wrapper compilers
 * updated docs in opal/util/cmd_line.h
 * other small/miscellaneous CLI parsing bugs in various tools

I won't bore you with what we did before.  :-)  Here are some examples
of what the new behavior looks like:

{{{
% ompi_info --bogus
ompi_info: Error: unknown option "--bogus"
Type 'ompi_info --help' for usage.
% ompi_info --param bogus
ompi_info: Error: option "--param" did not have enough parameters (2)
Type 'ompi_info --help' for usage.
%
}}}

This commit was SVN r26072.
2012-02-29 17:52:38 +00:00

870 lines
27 KiB
C

/*
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2007 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2011-2012 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
* ORTE Restart Tool for restarting a previously checkpointed multiprocess job
*
*/
#include "orte_config.h"
#include "orte/constants.h"
#include <stdio.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif /* HAVE_UNISTD_H */
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif /* HAVE_STDLIB_H */
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif /* HAVE_SYS_WAIT_H */
#ifdef HAVE_STRING_H
#include <string.h>
#endif /* HAVE_STRING_H */
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"
#include "opal/util/cmd_line.h"
#include "opal/util/output.h"
#include "opal/util/argv.h"
#include "opal/util/opal_environ.h"
#include "opal/util/basename.h"
#include "opal/util/error.h"
#include "opal/util/path.h"
#include "opal/mca/base/base.h"
#include "opal/mca/base/mca_base_param.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_cr.h"
#include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h"
#include "orte/mca/sstore/sstore.h"
#include "orte/mca/sstore/base/base.h"
#include "orte/mca/filem/base/base.h"
#include "opal/util/show_help.h"
#include "orte/util/proc_info.h"
/******************
* Local Functions
******************/
/* Initialize OPAL utilities, parse the command line and bring up ORTE as a tool. */
static int initialize(int argc, char *argv[]);
/* Shut ORTE down again via orte_finalize(). */
static int finalize(void);
/* Fill orte_restart_globals from the command line. */
static int parse_args(int argc, char *argv[]);
/* Write the "restart-appfile" describing one opal-restart entry per local snapshot. */
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot);
/* Build the mpirun command line and exec it, optionally in a forked child. */
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid);
/* Print sequence numbers, timestamps and per-process details for a snapshot. */
static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot);
/* opal_list_sort() comparator: orders local snapshots by process vpid. */
static int snapshot_sort_compare_fn(opal_list_item_t **a,
opal_list_item_t **b);
/*****************************************
* Global Vars for Command line Arguments
*****************************************/
/*
 * Command-line state shared by every function in this tool.
 *
 * NOTE: parse_args() resets this structure with a positional initializer,
 * so the field order below must stay in sync with that initializer.
 */
typedef struct {
/* -h / --help: print the usage message and exit */
bool help;
/* Snapshot reference taken from the tail of the command line; main()
 * reduces it to its basename when a directory component is present */
char *snapshot_ref;
/* Path of the generated appfile (allocated in create_appfile()) */
char *appfile;
/* -v / --verbose */
bool verbose;
/* --fork: exec mpirun in a forked child instead of replacing this process */
bool forked;
/* -s / --seq: checkpoint sequence number; initialized to -1 (most recent) */
int seq_number;
/* --hostfile / --machinefile: forwarded to mpirun as --default-hostfile */
char *hostfile;
/* opal_output handle used for this tool's messages (set in initialize()) */
int output;
/* -i / --info: only display information about the checkpoint */
bool info_only;
/* -a / --apponly: only create the appfile, do not restart */
bool app_only;
/* --showme: print the mpirun command line instead of exec'ing it */
bool showme;
/* --mpirun_opts: extra option string appended to the mpirun command line */
char *mpirun_opts;
#if OPAL_ENABLE_CRDEBUG == 1
/* --crdebug: forwarded to mpirun as --crdebug */
bool enable_crdebug;
#endif
} orte_restart_globals_t;
/* The single instance, filled in by parse_args() */
orte_restart_globals_t orte_restart_globals;
/*
 * Option table handed to opal_cmd_line_create() in parse_args().
 *
 * As used below, each entry carries (after three leading NULLs): a short
 * option character, a single-dash name, a long name, the number of
 * parameters the option takes, the orte_restart_globals field that receives
 * the value, the value type, and the help text.
 * NOTE(review): the meaning of the three leading NULL fields is defined by
 * opal_cmd_line_init_t in opal/util/cmd_line.h -- confirm there.
 */
opal_cmd_line_init_t cmd_line_opts[] = {
/* -h / --help */
{ NULL, NULL, NULL,
'h', NULL, "help",
0,
&orte_restart_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
"This help message" },
/* -v / --verbose */
{ NULL, NULL, NULL,
'v', NULL, "verbose",
0,
&orte_restart_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
"Be Verbose" },
/* --fork */
{ NULL, NULL, NULL,
'\0', NULL, "fork",
0,
&orte_restart_globals.forked, OPAL_CMD_LINE_TYPE_BOOL,
"Fork off a new process which is the restarted process instead of "
"replacing orte_restart" },
/* -s / --seq <n> */
{ NULL, NULL, NULL,
's', NULL, "seq",
1,
&orte_restart_globals.seq_number, OPAL_CMD_LINE_TYPE_INT,
"The sequence number of the checkpoint to start from. "
"(Default: -1, or most recent)" },
/* -hostfile / --hostfile <file> */
{ NULL, NULL, NULL,
'\0', "hostfile", "hostfile",
1,
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile to use for launch" },
/* -machinefile / --machinefile <file>: writes to the same field as hostfile */
{ NULL, NULL, NULL,
'\0', "machinefile", "machinefile",
1,
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile to use for launch" },
/* -i / --info */
{ NULL, NULL, NULL,
'i', NULL, "info",
0,
&orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display information about the checkpoint" },
/* -a / --apponly */
{ NULL, NULL, NULL,
'a', NULL, "apponly",
0,
&orte_restart_globals.app_only, OPAL_CMD_LINE_TYPE_BOOL,
"Only create the app context file, do not restart from it" },
/* --showme */
{ NULL, NULL, NULL,
'\0', NULL, "showme",
0,
&orte_restart_globals.showme, OPAL_CMD_LINE_TYPE_BOOL,
"Display the full command line that would have been exec'ed." },
/* -mpirun_opts / --mpirun_opts <string> */
{ NULL, NULL, NULL,
'\0', "mpirun_opts", "mpirun_opts",
1,
&orte_restart_globals.mpirun_opts, OPAL_CMD_LINE_TYPE_STRING,
"Command line options to pass directly to mpirun (be sure to quote long strings, and escape internal quotes)" },
#if OPAL_ENABLE_CRDEBUG == 1
/* -crdebug / --crdebug (only when built with C/R debugging support) */
{ NULL, NULL, NULL,
'\0', "crdebug", "crdebug",
0,
&orte_restart_globals.enable_crdebug, OPAL_CMD_LINE_TYPE_BOOL,
"Enable C/R Enhanced Debugging" },
#endif
/* End of list */
{ NULL, NULL, NULL,
'\0', NULL, NULL,
0,
NULL, OPAL_CMD_LINE_TYPE_NULL,
NULL }
};
/**
 * Tool entry point.
 *
 * Sequence: initialize -> resolve the snapshot reference into a restart
 * handle -> (optionally) just print snapshot information -> write the
 * appfile -> (optionally) stop after the appfile -> launch mpirun.
 *
 * @return ORTE_SUCCESS on success, otherwise the error code of the step
 *         that failed (or the error returned by finalize()).
 */
int
main(int argc, char *argv[])
{
    int rc;
    int exit_status = ORTE_SUCCESS;
    pid_t child_pid = 0;
    orte_sstore_base_global_snapshot_info_t *snapshot = NULL;
    char *basedir = NULL;
    char *tmp_str = NULL;

    /***************
     * Initialize
     ***************/
    rc = initialize(argc, argv);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    snapshot = OBJ_NEW(orte_sstore_base_global_snapshot_info_t);

    /*
     * A reference that carries a directory component (absolute, or merely
     * containing a '/') is split into a base directory and a bare
     * reference name. Without one, basedir stays NULL so that the MCA
     * parameter is used instead.
     */
    if (opal_path_is_absolute(orte_restart_globals.snapshot_ref) ||
        NULL != strchr(orte_restart_globals.snapshot_ref, '/')) {
        basedir = opal_dirname(orte_restart_globals.snapshot_ref);
        tmp_str = opal_basename(orte_restart_globals.snapshot_ref);
        free(orte_restart_globals.snapshot_ref);
        orte_restart_globals.snapshot_ref = strdup(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    } else {
        basedir = NULL;
    }

    /*
     * Note: If the sequence number passed is -1, the largest sequence
     *       number is selected; otherwise the requested one is selected
     *       if available.
     * 'basedir': snapshot base location to look in. If NULL, the MCA
     *            parameter is used.
     */
    rc = orte_sstore.request_restart_handle(&(snapshot->ss_handle),
                                            basedir,
                                            orte_restart_globals.snapshot_ref,
                                            orte_restart_globals.seq_number,
                                            snapshot);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-restart.txt", "invalid_filename", true,
                       orte_restart_globals.snapshot_ref);
        exit_status = rc;
        goto cleanup;
    }
    orte_restart_globals.seq_number = snapshot->seq_num;

    /* --info: report on the snapshot and stop, whatever the outcome */
    if (orte_restart_globals.info_only) {
        exit_status = snapshot_info(snapshot);
        goto cleanup;
    }

    /******************************
     * Create the app file to use with mpirun/orterun
     ******************************/
    rc = create_appfile(snapshot);
    if (ORTE_SUCCESS != rc) {
        exit_status = rc;
        goto cleanup;
    }

    /* --apponly: the appfile is the final product */
    if (orte_restart_globals.app_only) {
        printf("Created Appfile:\n\t%s\n", orte_restart_globals.appfile);
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /******************************
     * Restart in this process [mpirun/orterun]
     ******************************/
    if (orte_restart_globals.verbose) {
        opal_output_verbose(10, orte_restart_globals.output,
                            "Restarting from file (%s)",
                            orte_restart_globals.snapshot_ref);
        if (!orte_restart_globals.forked) {
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t Exec in self");
        } else {
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t Forking off a child");
        }
    }

    rc = spawn_children(snapshot, &child_pid);
    if (ORTE_SUCCESS != rc) {
        opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true,
                       orte_restart_globals.snapshot_ref, rc);
        exit_status = rc;
        goto cleanup;
    }

    opal_output_verbose(10, orte_restart_globals.output,
                        "orte_restart: Restarted Child with PID = %d\n", child_pid);

    /***************
     * Cleanup
     ***************/
 cleanup:
    free(basedir);
    basedir = NULL;

    free(tmp_str);
    tmp_str = NULL;

    if (NULL != snapshot) {
        OBJ_RELEASE(snapshot);
        snapshot = NULL;
    }

    /* A failing finalize takes precedence over the recorded status */
    rc = finalize();
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    return exit_status;
}
/**
 * Bring the tool up: OPAL utilities, command-line parsing, output handle,
 * then ORTE itself (as ORTE_PROC_TOOL).
 *
 * @param argc  argument count as passed to main()
 * @param argv  argument vector as passed to main()
 * @return ORTE_SUCCESS, or the error code of the first step that failed.
 */
static int initialize(int argc, char *argv[])
{
    int rc;
    char *env_name = NULL;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    rc = opal_init_util(&argc, &argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /*
     * Parse command line arguments
     */
    rc = parse_args(argc, argv);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /*
     * Setup OPAL Output handle from the verbose argument
     */
    if (!orte_restart_globals.verbose) {
        orte_restart_globals.output = 0; /* Default=STDERR */
    } else {
        orte_restart_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(orte_restart_globals.output, 10);
    }

    /* Disable the checkpoint notification routine for this
     * tool. As we will never need to checkpoint this tool.
     * Note: This must happen before opal_init().
     */
    opal_cr_set_enabled(false);

    /* Select the none component, since we don't actually use a checkpointer */
    env_name = mca_base_param_env_var("crs");
    opal_setenv(env_name, "none", true, &environ);
    free(env_name);
    env_name = NULL;

    /*
     * Setup any ORTE stuff we might need
     */
    rc = orte_init(&argc, &argv, ORTE_PROC_TOOL);
    if (OPAL_SUCCESS != rc) {
        return rc;
    }

    /* Unset these now that we no longer need them; "opal_cr_is_tool" was
     * placed in the environment by parse_args(). */
    env_name = mca_base_param_env_var("crs");
    opal_unsetenv(env_name, &environ);
    free(env_name);
    env_name = NULL;

    env_name = mca_base_param_env_var("opal_cr_is_tool");
    opal_unsetenv(env_name, &environ);
    free(env_name);
    env_name = NULL;

    return ORTE_SUCCESS;
}
/**
 * Tear ORTE down.
 *
 * @return ORTE_SUCCESS when orte_finalize() reports OPAL_SUCCESS,
 *         otherwise the code orte_finalize() returned.
 */
static int finalize(void)
{
    int rc = orte_finalize();

    return (OPAL_SUCCESS == rc) ? ORTE_SUCCESS : rc;
}
static int parse_args(int argc, char *argv[])
{
int i, ret, len;
opal_cmd_line_t cmd_line;
char **app_env = NULL, **global_env = NULL;
char * tmp_env_var = NULL;
char *argv0 = NULL;
orte_restart_globals_t tmp = { false, /* help */
NULL, /* filename */
NULL, /* appfile */
false, /* verbose */
false, /* forked */
-1, /* seq_number */
NULL, /* hostfile */
-1, /* output*/
false, /* info only */
false, /* app only */
false, /* showme */
NULL}; /* mpirun_opts */
orte_restart_globals = tmp;
#if OPAL_ENABLE_CRDEBUG == 1
orte_restart_globals.enable_crdebug = false;
#endif
#if OPAL_ENABLE_FT_CR == 0
/* Warn and exit if not configured with Checkpoint/Restart */
{
char *str, *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orte-restart.txt", "usage-no-cr",
true, args);
if (NULL != str) {
printf("%s", str);
free(str);
}
free(args);
return ORTE_ERROR;
}
#endif
/* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts);
mca_base_open();
mca_base_cmd_line_setup(&cmd_line);
ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
if (OPAL_SUCCESS != ret) {
if (OPAL_ERR_SILENT != ret) {
fprintf(stderr, "%s: command line error (%s)\n", argv[0],
opal_strerror(ret));
}
return 1;
}
if (orte_restart_globals.help) {
char *str, *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line);
str = opal_show_help_string("help-orte-restart.txt", "usage", true,
args);
if (NULL != str) {
printf("%s", str);
free(str);
}
free(args);
/* If we show the help message, that should be all we do */
exit(0);
}
/**
* Put all of the MCA arguments in the environment
*/
mca_base_cmd_line_process_args(&cmd_line, &app_env, &global_env);
len = opal_argv_count(app_env);
for(i = 0; i < len; ++i) {
putenv(app_env[i]);
}
len = opal_argv_count(global_env);
for(i = 0; i < len; ++i) {
putenv(global_env[i]);
}
tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
opal_setenv(tmp_env_var,
"1",
true, &environ);
free(tmp_env_var);
tmp_env_var = NULL;
/**
* Now start parsing our specific arguments
*/
/* get the remaining bits */
argv0 = strdup(argv[0]);
opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
if (0 == argc) {
fprintf(stderr, "%s: Nothing to do\n", argv0);
fprintf(stderr, "Type '%s --help' for usge.\n", argv0);
free(argv0);
return ORTE_ERROR;
}
free(argv0);
orte_restart_globals.snapshot_ref = strdup(argv[0]);
if ( NULL == orte_restart_globals.snapshot_ref ||
0 >= strlen(orte_restart_globals.snapshot_ref) ) {
opal_show_help("help-orte-restart.txt", "invalid_filename", true,
"<none provided>");
return ORTE_ERROR;
}
/* If we have arguments after the command, then assume they
* need to be grouped together.
*/
if(argc > 1) {
orte_restart_globals.snapshot_ref = strdup(opal_argv_join(argv, ' '));
}
return ORTE_SUCCESS;
}
/**
 * Write the appfile that mpirun will be started with.
 *
 * The file is created as "<global snapshot location>/restart-appfile" and
 * its path is stored in orte_restart_globals.appfile. It receives one
 * "-np 1 ... opal-restart ..." line per local snapshot, in ascending vpid
 * order.
 *
 * @param snapshot  global snapshot whose local snapshots are listed
 * @return ORTE_SUCCESS, or ORTE_ERROR if required metadata is missing or
 *         the file cannot be created or completely written.
 */
static int create_appfile(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int exit_status = ORTE_SUCCESS;
    FILE *appfile = NULL;
    opal_list_item_t *item = NULL;
    char *tmp_str = NULL;
    char *amca_param = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *ref_location_fmt_str = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;

    /*
     * Create the appfile
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS,
                         &tmp_str);
    if (NULL == tmp_str) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    if (0 > asprintf(&orte_restart_globals.appfile, "%s/restart-appfile",
                     tmp_str)) {
        /* The output pointer is unspecified after a failed asprintf() */
        orte_restart_globals.appfile = NULL;
        exit_status = ORTE_ERROR;
        goto cleanup;
    }
    free(tmp_str);
    tmp_str = NULL;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    if (NULL == (appfile = fopen(orte_restart_globals.appfile, "w"))) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /* This will give a format string that we can use */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    /* Fetched but not used below; kept so the sequence of sstore calls is
     * unchanged. It is released in cleanup. */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);

    /* Both values are written into every line of the appfile */
    if (NULL == reference_fmt_str || NULL == location_str) {
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * Sort the snapshots so that they are in order
     */
    opal_list_sort(&snapshot->local_snapshots, snapshot_sort_compare_fn);

    /*
     * Construct the appfile
     */
    for (item  = opal_list_get_first(&snapshot->local_snapshots);
         item != opal_list_get_end(&snapshot->local_snapshots);
         item  = opal_list_get_next(item)) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

        fprintf(appfile, "#\n");
        fprintf(appfile, "# Old Process Name: %u.%u\n",
                vpid_snapshot->process_name.jobid,
                vpid_snapshot->process_name.vpid);
        fprintf(appfile, "#\n");
        fprintf(appfile, "-np 1 ");

        fprintf(appfile, "--sstore-load ");
        /* loc:ref:postfix:seq */
        fprintf(appfile, "%s:%s:",
                location_str,
                orte_restart_globals.snapshot_ref);
        /* NOTE(review): the format string comes from the snapshot metadata
         * and is presumably expected to consume exactly one vpid argument
         * -- confirm against the sstore component that produces it. */
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);
        fprintf(appfile, ":%s:%s:%d ",
                (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                orte_restart_globals.seq_number);

        /* Fall back to a default AMCA parameter set; the help message is
         * shown once because amca_param is non-NULL afterwards. */
        if (NULL == amca_param) {
            amca_param = strdup("ft-enable-cr");
            opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                           amca_param);
        }
        fprintf(appfile, "-am %s ", amca_param);

        fprintf(appfile, " opal-restart ");

        /*
         * By default, point to the central storage location of the checkpoint.
         * The active SStore module at restart time will determine if files
         * need to be preloaded.
         */
        fprintf(appfile, "-l %s", location_str);
        fprintf(appfile, " -m %s ", orte_sstore_base_local_metadata_filename);

        fprintf(appfile, "-r ");
        fprintf(appfile, reference_fmt_str, vpid_snapshot->process_name.vpid);
        fprintf(appfile, "\n");
    }

 cleanup:
    if (NULL != appfile) {
        /* fclose() flushes buffered output; a failure here means the
         * appfile on disk may be incomplete. */
        if (0 != fclose(appfile) && ORTE_SUCCESS == exit_status) {
            exit_status = ORTE_ERROR;
        }
        appfile = NULL;
    }
    free(tmp_str);
    free(amca_param);
    free(location_str);
    free(reference_fmt_str);
    free(ref_location_fmt_str);

    return exit_status;
}
/**
 * Build the mpirun command line for the restart and run it.
 *
 * The command is "mpirun -am <amca> [--default-hostfile <file>]
 * [<mpirun_opts>] [--crdebug] --app <appfile>". Depending on the options:
 *  - --showme: the command is printed to stdout and nothing is executed;
 *  - --fork:   the command is exec'ed in a forked child, whose pid is
 *              stored in *child_pid; the parent returns immediately;
 *  - default:  ORTE is finalized and the command replaces this process.
 *
 * @param snapshot   global snapshot providing the AMCA parameter metadata
 * @param child_pid  out: result of fork() when --fork is used
 * @return ORTE_SUCCESS on success; an opal_argv_append() error code; or the
 *         negative result of the failed fork()/execvp().
 */
static int spawn_children(orte_sstore_base_global_snapshot_info_t *snapshot, pid_t *child_pid)
{
    int ret, exit_status = ORTE_SUCCESS;
    char *amca_param = NULL;
    char **argv = NULL;
    int argc = 0, i;
    int status;

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_AMCA_PARAM,
                         &amca_param);

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "mpirun"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "-am"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (NULL == amca_param) {
        amca_param = strdup("ft-enable-cr");
        opal_show_help("help-orte-restart.txt", "amca_param_not_found", true,
                       amca_param);
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, amca_param))) {
        exit_status = ret;
        goto cleanup;
    }

    if (NULL != orte_restart_globals.hostfile) {
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--default-hostfile"))) {
            exit_status = ret;
            goto cleanup;
        }
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.hostfile))) {
            exit_status = ret;
            goto cleanup;
        }
    }

    /* NOTE(review): the whole option string is passed as ONE argv element;
     * confirm that mpirun is expected to receive it unsplit. */
    if (orte_restart_globals.mpirun_opts) {
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.mpirun_opts))) {
            exit_status = ret;
            goto cleanup;
        }
    }

#if OPAL_ENABLE_CRDEBUG == 1
    if (orte_restart_globals.enable_crdebug) {
        if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--crdebug"))) {
            exit_status = ret;
            goto cleanup;
        }
    }
#endif

    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, "--app"))) {
        exit_status = ret;
        goto cleanup;
    }
    if (ORTE_SUCCESS != (ret = opal_argv_append(&argc, &argv, orte_restart_globals.appfile))) {
        exit_status = ret;
        goto cleanup;
    }

    /* --showme: print instead of executing; leave through cleanup so the
     * argument vector is released. */
    if (orte_restart_globals.showme) {
        for (i = 0; i < argc; ++i) {
            printf("%s ", argv[i]);
        }
        printf("\n");
        exit_status = ORTE_SUCCESS;
        goto cleanup;
    }

    /* To fork off a child */
    if (orte_restart_globals.forked) {
        *child_pid = fork();

        if (0 == *child_pid) {
            /* Child Process: execvp() only returns on failure */
            status = execvp(argv[0], argv);
            if (0 > status) {
                opal_output(orte_restart_globals.output,
                            "orte_restart: execv failed with status = %d\n",
                            status);
            }
            exit_status = status;
            goto cleanup;
        }
        else if (0 < *child_pid) {
            /* Parent is done once it is started */
            ;
        }
        else {
            opal_output(orte_restart_globals.output,
                        "orte_restart: fork failed: This should never happen!");
            /* Fork failed :( */
            exit_status = *child_pid;
            goto cleanup;
        }
    }
    /* ... or not to fork off a child */
    else {
        /* Make sure to finalize so we don't leave our session directory */
        orte_finalize();

        /* Only returns if the exec failed; the caller reports the error */
        status = execvp(argv[0], argv);
        exit_status = status;
        goto cleanup;
    }

 cleanup:
    /* opal_argv_append() stored its own copies, so this is ours to free */
    free(amca_param);
    if (NULL != argv) {
        opal_argv_free(argv);
    }

    return exit_status;
}
/**
 * Print information about a global snapshot.
 *
 * For every stored sequence number (or only the one requested with --seq)
 * the global metadata is extracted and the sequence number, begin/end
 * timestamps and process count are printed; per-process details are
 * emitted at verbosity level 10.
 *
 * Side effect: snapshot->seq_num is overwritten with each sequence number
 * that is inspected.
 *
 * @param snapshot  global snapshot to describe
 * @return ORTE_SUCCESS, or the error returned by
 *         orte_sstore_base_extract_global_metadata().
 */
static int snapshot_info(orte_sstore_base_global_snapshot_info_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    int num_seqs = 0, num_listed, processes, i;
    char **snapshot_ref_seqs = NULL;
    opal_list_item_t *item = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *tmp_str = NULL;

    /*
     * Find all sequence numbers
     */
    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        num_seqs = atoi(tmp_str);
        free(tmp_str);
        tmp_str = NULL;
    }

    orte_sstore.get_attr(snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ,
                         &tmp_str);
    if (NULL != tmp_str) {
        snapshot_ref_seqs = opal_argv_split(tmp_str, ',');
        free(tmp_str);
        tmp_str = NULL;
    }

    if (0 > orte_restart_globals.seq_number) {
        opal_output(orte_restart_globals.output,
                    "Sequences: %d\n",
                    num_seqs);
    }

    /* Never walk past the entries that were actually listed, even if the
     * recorded count claims there are more. */
    num_listed = opal_argv_count(snapshot_ref_seqs);
    if (num_listed > num_seqs) {
        num_listed = num_seqs;
    }

    for (i = 0; i < num_listed; ++i) {
        snapshot->seq_num = atoi(snapshot_ref_seqs[i]);

        /* A specific sequence was requested: skip all others */
        if (0 <= orte_restart_globals.seq_number &&
            snapshot->seq_num != orte_restart_globals.seq_number) {
            continue;
        }

        if (ORTE_SUCCESS != (ret = orte_sstore_base_extract_global_metadata(snapshot))) {
            exit_status = ret;
            goto cleanup;
        }

        opal_output(orte_restart_globals.output,
                    "Seq: %d\n",
                    snapshot->seq_num);

        if (NULL != snapshot->start_time) {
            opal_output(orte_restart_globals.output,
                        "\tBegin Timestamp: %s\n",
                        snapshot->start_time);
        }
        if (NULL != snapshot->end_time) {
            opal_output(orte_restart_globals.output,
                        "\tEnd Timestamp : %s\n",
                        snapshot->end_time);
        }

        processes = (int)opal_list_get_size(&snapshot->local_snapshots);
        opal_output(orte_restart_globals.output,
                    "\tProcesses: %d\n",
                    processes);

        for (item  = opal_list_get_first(&snapshot->local_snapshots);
             item != opal_list_get_end(&snapshot->local_snapshots);
             item  = opal_list_get_next(item)) {
            vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;

            /* Unset component names are printed as empty strings, as
             * create_appfile() does, instead of passing NULL to %s. */
            opal_output_verbose(10, orte_restart_globals.output,
                                "\t\tProcess: %u.%u \t CRS: %s \t Compress: %s (%s)",
                                vpid_snapshot->process_name.jobid,
                                vpid_snapshot->process_name.vpid,
                                (NULL == vpid_snapshot->crs_comp ? "" : vpid_snapshot->crs_comp),
                                (NULL == vpid_snapshot->compress_comp ? "" : vpid_snapshot->compress_comp),
                                (NULL == vpid_snapshot->compress_postfix ? "" : vpid_snapshot->compress_postfix));
        }
    }

 cleanup:
    if (NULL != snapshot_ref_seqs) {
        opal_argv_free(snapshot_ref_seqs);
    }

    return exit_status;
}
/**
 * Comparator passed to opal_list_sort(): orders local snapshots by the
 * vpid of their process name.
 *
 * @param a  address of the first list item
 * @param b  address of the second list item
 * @return 1 if a's vpid is greater, 0 if both are equal, -1 otherwise.
 */
static int snapshot_sort_compare_fn(opal_list_item_t **a,
                                    opal_list_item_t **b)
{
    const orte_sstore_base_local_snapshot_info_t *lhs =
        (orte_sstore_base_local_snapshot_info_t*)(*a);
    const orte_sstore_base_local_snapshot_info_t *rhs =
        (orte_sstore_base_local_snapshot_info_t*)(*b);

    if (lhs->process_name.vpid == rhs->process_name.vpid) {
        return 0;
    }

    return (lhs->process_name.vpid > rhs->process_name.vpid) ? 1 : -1;
}