1
1

A couple of C/R related commits that have been sitting off-trunk for a while.

* Add 'orte-checkpoint -l' option that lists all checkpoints currently available on the system.
* Add 'orte-restart -i' which prints information regarding the checkpoint targeted for restart.
 * Add ability to extract the timing metadata.
 * Fix show_help() in the orte-checkpoint and orte-restart tools. They should be using the opal versions instead of the orte versions (otherwise nothing is printed).

This commit was SVN r21194.
Этот коммит содержится в:
Josh Hursey 2009-05-08 19:41:11 +00:00
родитель 94ff03a3eb
Коммит 5d0607395d
7 изменённых файлов: 411 добавлений и 21 удалений

Просмотреть файл

@ -587,7 +587,7 @@ static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname)
int ret; int ret;
pid_t child_pid; pid_t child_pid;
int exit_status = OPAL_SUCCESS; int exit_status = OPAL_SUCCESS;
int status, child_status; int status, child_status;
opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, opal_output_verbose(10, mca_crs_blcr_component.super.output_handle,
"crs:blcr: checkpoint_peer(%d, --)", pid); "crs:blcr: checkpoint_peer(%d, --)", pid);

Просмотреть файл

@ -480,6 +480,8 @@ static int check_file(char *given_filename)
char **argv = NULL; char **argv = NULL;
if(NULL == given_filename) { if(NULL == given_filename) {
opal_output(opal_restart_globals.output,
"Error: No filename provided!");
exit_status = OPAL_ERROR; exit_status = OPAL_ERROR;
goto cleanup; goto cleanup;
} }
@ -509,6 +511,9 @@ static int check_file(char *given_filename)
path_to_check); path_to_check);
if (0 > (ret = access(path_to_check, F_OK)) ) { if (0 > (ret = access(path_to_check, F_OK)) ) {
opal_output(opal_restart_globals.output,
"Error: Unable to access the path [%s]!",
path_to_check);
exit_status = OPAL_ERROR; exit_status = OPAL_ERROR;
goto cleanup; goto cleanup;
} }

Просмотреть файл

@ -155,6 +155,9 @@ ORTE_DECLSPEC extern orte_snapc_coord_type_t orte_snapc_coord_type;
ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref); ORTE_DECLSPEC int orte_snapc_base_finalize_metadata(char * global_snapshot_ref);
ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot); ORTE_DECLSPEC int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *snapshot);
ORTE_DECLSPEC int orte_snapc_base_get_all_snapshot_refs(char *base_dir, int *num_refs, char ***snapshot_refs);
ORTE_DECLSPEC int orte_snapc_base_get_all_snapshot_ref_seqs(char *base_dir, char *snapshot_name, int *num_seqs, int **snapshot_ref_seqs);
/******************************* /*******************************
* Global Coordinator functions * Global Coordinator functions
*******************************/ *******************************/

Просмотреть файл

@ -27,6 +27,15 @@
#ifdef HAVE_UNISTD_H #ifdef HAVE_UNISTD_H
#include <unistd.h> #include <unistd.h>
#endif /* HAVE_UNISTD_H */ #endif /* HAVE_UNISTD_H */
#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif /* HAVE_SYS_TYPES_H */
#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif /* HAVE_SYS_STAT_H */
#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif /* HAVE_DIRENT_H */
#include <time.h> #include <time.h>
#include "opal/mca/mca.h" #include "opal/mca/mca.h"
@ -36,6 +45,7 @@
#include "opal/util/os_dirpath.h" #include "opal/util/os_dirpath.h"
#include "opal/util/output.h" #include "opal/util/output.h"
#include "opal/util/basename.h" #include "opal/util/basename.h"
#include "opal/util/argv.h"
#include "opal/mca/crs/crs.h" #include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h" #include "opal/mca/crs/base/base.h"
@ -135,6 +145,9 @@ void orte_snapc_base_global_snapshot_construct(orte_snapc_base_global_snapshot_t
free(tmp_dir); free(tmp_dir);
snapshot->seq_num = 0; snapshot->seq_num = 0;
snapshot->start_time = NULL;
snapshot->end_time = NULL;
} }
void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot) void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t *snapshot)
@ -156,6 +169,16 @@ void orte_snapc_base_global_snapshot_destruct( orte_snapc_base_global_snapshot_t
snapshot->local_location = NULL; snapshot->local_location = NULL;
} }
if(NULL != snapshot->start_time) {
free(snapshot->start_time);
snapshot->start_time = NULL;
}
if(NULL != snapshot->end_time) {
free(snapshot->end_time);
snapshot->end_time = NULL;
}
snapshot->seq_num = 0; snapshot->seq_num = 0;
} }
@ -456,6 +479,142 @@ int orte_snapc_base_global_coord_ckpt_update_cmd(orte_process_name_t* peer,
/***************************** /*****************************
* Snapshot metadata functions * Snapshot metadata functions
*****************************/ *****************************/
int orte_snapc_base_get_all_snapshot_refs(char *base_dir, int *num_refs, char ***snapshot_refs)
{
#ifndef HAVE_DIRENT_H
return OMPI_ERR_NOT_SUPPORTED;
#else
int ret, exit_status = ORTE_SUCCESS;
char * tmp_str = NULL, * metadata_file = NULL;
DIR *dirp = NULL;
struct dirent *dir_entp = NULL;
struct stat file_status;
if( NULL == base_dir ) {
if( NULL == orte_snapc_base_global_snapshot_dir ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
base_dir = strdup(orte_snapc_base_global_snapshot_dir);
}
/*
* Get all subdirectories under the base directory
*/
dirp = opendir(base_dir);
while( NULL != (dir_entp = readdir(dirp))) {
/* Skip "." and ".." if they are in the list */
if( 0 == strncmp("..", dir_entp->d_name, strlen("..") ) ||
0 == strncmp(".", dir_entp->d_name, strlen(".") ) ) {
continue;
}
/* Add the full path */
asprintf(&tmp_str, "%s/%s", base_dir, dir_entp->d_name);
if(0 != (ret = stat(tmp_str, &file_status) ) ){
free( tmp_str);
tmp_str = NULL;
continue;
} else {
/* Is it a directory? */
if(S_ISDIR(file_status.st_mode) ) {
asprintf(&metadata_file, "%s/%s",
tmp_str,
orte_snapc_base_metadata_filename);
if(0 != (ret = stat(metadata_file, &file_status) ) ){
free( tmp_str);
tmp_str = NULL;
free( metadata_file);
metadata_file = NULL;
continue;
} else {
if(S_ISREG(file_status.st_mode) ) {
opal_argv_append(num_refs, snapshot_refs, dir_entp->d_name);
}
}
free( metadata_file);
metadata_file = NULL;
}
}
free( tmp_str);
tmp_str = NULL;
}
closedir(dirp);
cleanup:
if( NULL != tmp_str) {
free( tmp_str);
tmp_str = NULL;
}
return exit_status;
#endif /* HAVE_DIRENT_H */
}
int orte_snapc_base_get_all_snapshot_ref_seqs(char *base_dir, char *snapshot_name, int *num_seqs, int **snapshot_ref_seqs)
{
int exit_status = ORTE_SUCCESS;
char * metadata_file = NULL;
FILE * meta_data = NULL;
int s, next_seq_int;
if( NULL == base_dir ) {
if( NULL == orte_snapc_base_global_snapshot_dir ) {
exit_status = ORTE_ERROR;
goto cleanup;
}
base_dir = strdup(orte_snapc_base_global_snapshot_dir);
}
asprintf(&metadata_file, "%s/%s/%s",
base_dir,
snapshot_name,
orte_snapc_base_metadata_filename);
if (NULL == (meta_data = fopen(metadata_file, "r")) ) {
opal_output(0, "Error: Unable to open the file <%s>\n", metadata_file);
exit_status = ORTE_ERROR;
goto cleanup;
}
/* First pass to count the number of sequence numbers */
*num_seqs = 0;
while(0 <= (next_seq_int = get_next_valid_seq_number(meta_data)) ){
*num_seqs += 1;
}
/* If there are no valid seq numbers then just return here */
if( 0 == *num_seqs ) {
exit_status = ORTE_SUCCESS;
goto cleanup;
}
rewind(meta_data);
/* Second pass to add them to the list */
(*snapshot_ref_seqs) = (int *) malloc(sizeof(int) * (*num_seqs));
s = 0;
while(0 <= (next_seq_int = get_next_valid_seq_number(meta_data)) ){
(*snapshot_ref_seqs)[s] = next_seq_int;
++s;
}
cleanup:
if(NULL != meta_data) {
fclose(meta_data);
meta_data = NULL;
}
if(NULL != metadata_file) {
free(metadata_file);
metadata_file = NULL;
}
return exit_status;
}
int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid) int orte_snapc_base_unique_global_snapshot_name(char **name_str, pid_t pid)
{ {
if( NULL == orte_snapc_base_global_snapshot_ref ) { if( NULL == orte_snapc_base_global_snapshot_ref ) {
@ -767,7 +926,12 @@ int orte_snapc_base_extract_metadata(orte_snapc_base_global_snapshot_t *global_s
break; break;
} }
else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) { else if(0 == strncmp(SNAPC_METADATA_TIME, token, strlen(SNAPC_METADATA_TIME)) ) {
; if( NULL == global_snapshot->start_time) {
global_snapshot->start_time = strdup(value);
}
else {
global_snapshot->end_time = strdup(value);
}
} }
else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) { else if(0 == strncmp(SNAPC_METADATA_PROCESS, token, strlen(SNAPC_METADATA_PROCESS)) ) {
orte_process_name_t proc; orte_process_name_t proc;

Просмотреть файл

@ -165,6 +165,12 @@ struct orte_snapc_base_global_snapshot_1_0_0_t {
/** Sequence Number */ /** Sequence Number */
int seq_num; int seq_num;
/** Start Timestamp */
char * start_time;
/** End Timestamp */
char * end_time;
}; };
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t; typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

Просмотреть файл

@ -68,7 +68,7 @@
#include "orte/util/hnp_contact.h" #include "orte/util/hnp_contact.h"
#include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_globals.h"
#include "orte/util/name_fns.h" #include "orte/util/name_fns.h"
#include "orte/util/show_help.h" #include "opal/util/show_help.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
#include "orte/mca/rml/rml.h" #include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h" #include "orte/mca/rml/rml_types.h"
@ -102,6 +102,8 @@ static int notify_process_for_checkpoint(int term);
static int pretty_print_status(void); static int pretty_print_status(void);
static int pretty_print_reference(void); static int pretty_print_reference(void);
static int list_all_snapshots(void);
static orte_hnp_contact_t *orterun_hnp = NULL; static orte_hnp_contact_t *orterun_hnp = NULL;
static char * global_snapshot_handle = NULL; static char * global_snapshot_handle = NULL;
static int global_sequence_num = 0; static int global_sequence_num = 0;
@ -126,6 +128,7 @@ typedef struct {
bool status; /* Display status messages while checkpoint is progressing */ bool status; /* Display status messages while checkpoint is progressing */
int output; int output;
int ckpt_status; int ckpt_status;
bool list_only; /* List available checkpoints only */
} orte_checkpoint_globals_t; } orte_checkpoint_globals_t;
orte_checkpoint_globals_t orte_checkpoint_globals; orte_checkpoint_globals_t orte_checkpoint_globals;
@ -180,7 +183,13 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT, &orte_checkpoint_globals.pid, OPAL_CMD_LINE_TYPE_INT,
"This should be the pid of the mpirun whose applications you wish " "This should be the pid of the mpirun whose applications you wish "
"to checkpoint." }, "to checkpoint." },
{ NULL, NULL, NULL,
'l', NULL, "list",
0,
&orte_checkpoint_globals.list_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display a list of checkpoint files available on this machine" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, '\0', NULL, NULL, 0, { NULL, NULL, NULL, '\0', NULL, NULL, 0,
NULL, OPAL_CMD_LINE_TYPE_NULL, NULL, OPAL_CMD_LINE_TYPE_NULL,
@ -200,6 +209,18 @@ main(int argc, char *argv[])
goto cleanup; goto cleanup;
} }
/*************************************
* Listing only Checkpoint References
*************************************/
if( orte_checkpoint_globals.list_only ) {
if (ORTE_SUCCESS != (ret = list_all_snapshots())) {
exit_status = ret;
goto cleanup;
}
exit_status = ORTE_SUCCESS;
goto cleanup;
}
/*************************** /***************************
* Find the HNP that we want to connect to, if it exists * Find the HNP that we want to connect to, if it exists
***************************/ ***************************/
@ -238,7 +259,7 @@ main(int argc, char *argv[])
} }
if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.term)) ) { if(ORTE_SUCCESS != (ret = notify_process_for_checkpoint( orte_checkpoint_globals.term)) ) {
orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
orte_checkpoint_globals.pid, ret); orte_checkpoint_globals.pid, ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
@ -255,7 +276,7 @@ main(int argc, char *argv[])
} }
if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) { if( ORTE_SNAPC_CKPT_STATE_ERROR == orte_checkpoint_globals.ckpt_status ) {
orte_show_help("help-orte-checkpoint.txt", "ckpt_failure", true, opal_show_help("help-orte-checkpoint.txt", "ckpt_failure", true,
orte_checkpoint_globals.pid, ORTE_ERROR); orte_checkpoint_globals.pid, ORTE_ERROR);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
@ -299,6 +320,7 @@ static int parse_args(int argc, char *argv[]) {
orte_checkpoint_globals.status = false; orte_checkpoint_globals.status = false;
orte_checkpoint_globals.output = -1; orte_checkpoint_globals.output = -1;
orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE; orte_checkpoint_globals.ckpt_status = ORTE_SNAPC_CKPT_STATE_NONE;
orte_checkpoint_globals.list_only = false;
/* Parse the command line options */ /* Parse the command line options */
opal_cmd_line_create(&cmd_line, cmd_line_opts); opal_cmd_line_create(&cmd_line, cmd_line_opts);
@ -334,12 +356,17 @@ static int parse_args(int argc, char *argv[]) {
/* get the remaining bits */ /* get the remaining bits */
opal_cmd_line_get_tail(&cmd_line, &argc, &argv); opal_cmd_line_get_tail(&cmd_line, &argc, &argv);
if(orte_checkpoint_globals.list_only ) {
exit_status = ORTE_SUCCESS;
goto cleanup;
}
#if OPAL_ENABLE_FT == 0 #if OPAL_ENABLE_FT == 0
/* Warn and exit if not configured with Checkpoint/Restart */ /* Warn and exit if not configured with Checkpoint/Restart */
{ {
char *args = NULL; char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line); args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-checkpoint.txt", "usage-no-cr", opal_show_help("help-orte-checkpoint.txt", "usage-no-cr",
true, args); true, args);
free(args); free(args);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
@ -352,7 +379,7 @@ static int parse_args(int argc, char *argv[]) {
(0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) { (0 >= argc && ORTE_JOBID_INVALID == orte_checkpoint_globals.req_hnp)) {
char *args = NULL; char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line); args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-checkpoint.txt", "usage", true, opal_show_help("help-orte-checkpoint.txt", "usage", true,
args); args);
free(args); free(args);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
@ -379,7 +406,7 @@ static int parse_args(int argc, char *argv[]) {
orte_checkpoint_globals.pid = atoi(argv[0]); orte_checkpoint_globals.pid = atoi(argv[0]);
if ( 0 >= orte_checkpoint_globals.pid ) { if ( 0 >= orte_checkpoint_globals.pid ) {
orte_show_help("help-orte-checkpoint.txt", "invalid_pid", true, opal_show_help("help-orte-checkpoint.txt", "invalid_pid", true,
orte_checkpoint_globals.pid); orte_checkpoint_globals.pid);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
goto cleanup; goto cleanup;
@ -390,7 +417,7 @@ static int parse_args(int argc, char *argv[]) {
*/ */
if(orte_checkpoint_globals.nowait) { if(orte_checkpoint_globals.nowait) {
orte_checkpoint_globals.nowait = false; orte_checkpoint_globals.nowait = false;
orte_show_help("help-orte-checkpoint.txt", "not_impl", opal_show_help("help-orte-checkpoint.txt", "not_impl",
true, true,
"Disconnected checkpoint"); "Disconnected checkpoint");
} }
@ -648,7 +675,7 @@ static void process_ckpt_update_cmd(orte_process_name_t* sender,
* If the job is not able to be checkpointed, then return * If the job is not able to be checkpointed, then return
*/ */
if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) { if( ORTE_SNAPC_CKPT_STATE_NO_CKPT == orte_checkpoint_globals.ckpt_status) {
orte_show_help("help-orte-checkpoint.txt", "non-ckptable", opal_show_help("help-orte-checkpoint.txt", "non-ckptable",
true, true,
orte_checkpoint_globals.pid); orte_checkpoint_globals.pid);
exit_status = ORTE_ERROR; exit_status = ORTE_ERROR;
@ -724,7 +751,7 @@ notify_process_for_checkpoint(int term)
} }
if( ORTE_SUCCESS != exit_status ) { if( ORTE_SUCCESS != exit_status ) {
orte_show_help("help-orte-checkpoint.txt", "unable_to_connect", true, opal_show_help("help-orte-checkpoint.txt", "unable_to_connect", true,
orte_checkpoint_globals.pid); orte_checkpoint_globals.pid);
} }
@ -790,3 +817,62 @@ static int pretty_print_reference(void) {
return ORTE_SUCCESS; return ORTE_SUCCESS;
} }
/**
 * Print every snapshot reference in the default snapshot directory together
 * with its valid sequence numbers, one reference per line on stdout.
 *
 * The references come from orte_snapc_base_get_all_snapshot_refs() and the
 * sequence numbers from orte_snapc_base_get_all_snapshot_ref_seqs(), both
 * called with a NULL base directory.
 *
 * @return ORTE_SUCCESS on success, otherwise the error code returned by the
 *         lookup that failed.
 */
static int list_all_snapshots(void) {
    int ret, exit_status = ORTE_SUCCESS;
    char **snapshot_refs = NULL;
    int i, num_snapshot_refs = 0;
    int *snapshot_ref_seqs = NULL;
    int s, num_snapshot_ref_seqs = 0;
    /* The directory global may be unset; never hand NULL to a %s conversion */
    const char *dir_name = (NULL != orte_snapc_base_global_snapshot_dir)
                               ? orte_snapc_base_global_snapshot_dir
                               : "(null)";

    /* Get all of the snapshot references */
    if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_refs(NULL, &num_snapshot_refs, &snapshot_refs) ) ) {
        opal_output(0, "Error: Unable to list the checkpoints in the directory <%s>\n",
                    dir_name);
        exit_status = ret;
        goto cleanup;
    }

    /* For each snapshot reference, get a list of the valid seq numbers */
    for(i = 0; i < num_snapshot_refs; ++i) {
        if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_ref_seqs(NULL, snapshot_refs[i],
                                                                             &num_snapshot_ref_seqs,
                                                                             &snapshot_ref_seqs) ) ) {
            opal_output(0, "Error: Unable to list the sequence numbers for the checkpoint <%s> in directory <%s>\n",
                        snapshot_refs[i],
                        dir_name);
            exit_status = ret;
            goto cleanup;
        }

        /* Pretty print the result */
        printf("Snapshot Ref.: %s\t[", snapshot_refs[i]);
        if( 0 >= num_snapshot_ref_seqs ) {
            printf("No Valid Checkpoints");
        }

        for(s = 0; s < num_snapshot_ref_seqs; ++s) {
            if( s != 0 ) {
                printf(",");
            }
            printf("%d", snapshot_ref_seqs[s]);
        }
        printf("]\n");

        /* The sequence array is allocated per reference; release it each round */
        free(snapshot_ref_seqs);
        snapshot_ref_seqs = NULL;
    }

 cleanup:
    free(snapshot_ref_seqs);
    snapshot_ref_seqs = NULL;

    if( NULL != snapshot_refs ) {
        /* The list is NULL-terminated: free each string, then the array itself */
        for(i = 0; NULL != snapshot_refs[i]; ++i) {
            free(snapshot_refs[i]);
        }
        free(snapshot_refs);
        snapshot_refs = NULL;
    }

    return exit_status;
}

Просмотреть файл

@ -65,7 +65,7 @@
#include "orte/mca/snapc/snapc.h" #include "orte/mca/snapc/snapc.h"
#include "orte/mca/snapc/base/base.h" #include "orte/mca/snapc/base/base.h"
#include "orte/mca/filem/base/base.h" #include "orte/mca/filem/base/base.h"
#include "orte/util/show_help.h" #include "opal/util/show_help.h"
#include "orte/util/proc_info.h" #include "orte/util/proc_info.h"
/****************** /******************
@ -77,6 +77,7 @@ static int parse_args(int argc, char *argv[]);
static int check_file(orte_snapc_base_global_snapshot_t *snapshot); static int check_file(orte_snapc_base_global_snapshot_t *snapshot);
static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot); static int create_appfile(orte_snapc_base_global_snapshot_t *snapshot);
static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *child_pid); static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *child_pid);
static int snapshot_info(orte_snapc_base_global_snapshot_t *snapshot);
/***************************************** /*****************************************
* Global Vars for Command line Arguments * Global Vars for Command line Arguments
@ -91,6 +92,7 @@ typedef struct {
int seq_number; int seq_number;
char *hostfile; char *hostfile;
int output; int output;
bool info_only;
} orte_restart_globals_t; } orte_restart_globals_t;
orte_restart_globals_t orte_restart_globals; orte_restart_globals_t orte_restart_globals;
@ -140,6 +142,12 @@ opal_cmd_line_init_t cmd_line_opts[] = {
&orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING, &orte_restart_globals.hostfile, OPAL_CMD_LINE_TYPE_STRING,
"Provide a hostfile to use for launch" }, "Provide a hostfile to use for launch" },
{ NULL, NULL, NULL,
'i', NULL, "info",
0,
&orte_restart_globals.info_only, OPAL_CMD_LINE_TYPE_BOOL,
"Display information about the checkpoint" },
/* End of list */ /* End of list */
{ NULL, NULL, NULL, { NULL, NULL, NULL,
'\0', NULL, NULL, '\0', NULL, NULL,
@ -175,12 +183,21 @@ main(int argc, char *argv[])
* Check for existence of the file * Check for existence of the file
*/ */
if( ORTE_SUCCESS != (ret = check_file(snapshot)) ) { if( ORTE_SUCCESS != (ret = check_file(snapshot)) ) {
orte_show_help("help-orte-restart.txt", "invalid_filename", true, opal_show_help("help-orte-restart.txt", "invalid_filename", true,
orte_restart_globals.filename); orte_restart_globals.filename);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
} }
if(orte_restart_globals.info_only ) {
if (ORTE_SUCCESS != (ret = snapshot_info(snapshot))) {
exit_status = ret;
goto cleanup;
}
exit_status = ORTE_SUCCESS;
goto cleanup;
}
/****************************** /******************************
* Create the app file to use with mpirun/orterun * Create the app file to use with mpirun/orterun
******************************/ ******************************/
@ -207,7 +224,7 @@ main(int argc, char *argv[])
} }
if( ORTE_SUCCESS != (ret = spawn_children(snapshot, &child_pid)) ) { if( ORTE_SUCCESS != (ret = spawn_children(snapshot, &child_pid)) ) {
orte_show_help("help-orte-restart.txt", "restart_cmd_failure", true, opal_show_help("help-orte-restart.txt", "restart_cmd_failure", true,
orte_restart_globals.filename, ret); orte_restart_globals.filename, ret);
exit_status = ret; exit_status = ret;
goto cleanup; goto cleanup;
@ -220,8 +237,10 @@ main(int argc, char *argv[])
* Cleanup * Cleanup
***************/ ***************/
cleanup: cleanup:
if(NULL != snapshot ) if(NULL != snapshot ) {
OBJ_RELEASE(snapshot); OBJ_RELEASE(snapshot);
snapshot = NULL;
}
if (OPAL_SUCCESS != (ret = finalize())) { if (OPAL_SUCCESS != (ret = finalize())) {
return ret; return ret;
@ -323,7 +342,8 @@ static int parse_args(int argc, char *argv[])
false, /* preload */ false, /* preload */
-1, /* seq_number */ -1, /* seq_number */
NULL, /* hostfile */ NULL, /* hostfile */
-1 }; /* output*/ -1, /* output*/
false };/* info only */
orte_restart_globals = tmp; orte_restart_globals = tmp;
@ -365,7 +385,7 @@ static int parse_args(int argc, char *argv[])
{ {
char *args = NULL; char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line); args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-restart.txt", "usage-no-cr", opal_show_help("help-orte-restart.txt", "usage-no-cr",
true, args); true, args);
free(args); free(args);
return ORTE_ERROR; return ORTE_ERROR;
@ -377,7 +397,7 @@ static int parse_args(int argc, char *argv[])
1 >= argc) { 1 >= argc) {
char *args = NULL; char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line); args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-restart.txt", "usage", true, opal_show_help("help-orte-restart.txt", "usage", true,
args); args);
free(args); free(args);
return ORTE_ERROR; return ORTE_ERROR;
@ -388,7 +408,7 @@ static int parse_args(int argc, char *argv[])
if ( 1 > argc ) { if ( 1 > argc ) {
char *args = NULL; char *args = NULL;
args = opal_cmd_line_get_usage_msg(&cmd_line); args = opal_cmd_line_get_usage_msg(&cmd_line);
orte_show_help("help-orte-restart.txt", "usage", true, opal_show_help("help-orte-restart.txt", "usage", true,
args); args);
free(args); free(args);
return ORTE_ERROR; return ORTE_ERROR;
@ -397,7 +417,7 @@ static int parse_args(int argc, char *argv[])
orte_restart_globals.filename = strdup(argv[0]); orte_restart_globals.filename = strdup(argv[0]);
if ( NULL == orte_restart_globals.filename || if ( NULL == orte_restart_globals.filename ||
0 >= strlen(orte_restart_globals.filename) ) { 0 >= strlen(orte_restart_globals.filename) ) {
orte_show_help("help-orte-restart.txt", "invalid_filename", true, opal_show_help("help-orte-restart.txt", "invalid_filename", true,
orte_restart_globals.filename); orte_restart_globals.filename);
return ORTE_ERROR; return ORTE_ERROR;
} }
@ -589,3 +609,109 @@ static int spawn_children(orte_snapc_base_global_snapshot_t *snapshot, pid_t *ch
return exit_status; return exit_status;
} }
/**
 * Print the metadata of the checkpoint targeted for restart.
 *
 * When no sequence number was requested (orte_restart_globals.seq_number is
 * -1) every sequence number recorded for orte_restart_globals.filename is
 * reported; otherwise only the requested one. For each sequence number the
 * snapshot's local snapshot list and timestamps are cleared, the metadata is
 * re-read with orte_snapc_base_extract_metadata(), and the timestamps, CRS
 * component, reference name, location and process count are written to
 * orte_restart_globals.output.
 *
 * @param snapshot Global snapshot object; its seq_num, timestamps and local
 *                 snapshot list are overwritten.
 *
 * @return ORTE_SUCCESS on success, otherwise the error code of the lookup
 *         or metadata extraction that failed.
 */
static int snapshot_info(orte_snapc_base_global_snapshot_t *snapshot)
{
    int ret, exit_status = ORTE_SUCCESS;
    int num_seqs = 0, processes, i;
    int *snapshot_ref_seqs = NULL;
    int *owned_seqs = NULL;     /* set only when the array was allocated for us */
    opal_list_item_t* item = NULL;
    orte_snapc_base_local_snapshot_t *vpid_snapshot = NULL;

    if (orte_restart_globals.seq_number == -1) {
        if( ORTE_SUCCESS != (ret = orte_snapc_base_get_all_snapshot_ref_seqs(NULL, orte_restart_globals.filename, &num_seqs, &snapshot_ref_seqs) ) ) {
            exit_status = ret;
            goto cleanup;
        }
        owned_seqs = snapshot_ref_seqs;
        opal_output(orte_restart_globals.output,
                    "Sequences: %d\n",
                    num_seqs);
    } else {
        /* Points at the global; must not be freed */
        num_seqs = 1;
        snapshot_ref_seqs = &orte_restart_globals.seq_number;
    }

    for (i=0; i<num_seqs; i++) {
        snapshot->seq_num = snapshot_ref_seqs[i];

        /* Drop whatever the previous sequence number loaded */
        while (NULL != (item = opal_list_remove_first(&snapshot->local_snapshots))) {
            OBJ_RELEASE(item);
        }

        free( snapshot->start_time );
        snapshot->start_time = NULL;

        free( snapshot->end_time );
        snapshot->end_time = NULL;

        opal_output(orte_restart_globals.output,
                    "Seq: %d\n",
                    snapshot->seq_num);

        if( ORTE_SUCCESS != (ret = orte_snapc_base_extract_metadata( snapshot ) ) ) {
            exit_status = ret;
            goto cleanup;
        }

        /* An empty list yields the end sentinel, which is not a local snapshot */
        item = opal_list_get_first(&snapshot->local_snapshots);
        if (item != opal_list_get_end(&snapshot->local_snapshots)) {
            vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;
        } else {
            vpid_snapshot = NULL;
        }

        if (NULL != snapshot->start_time ) {
            opal_output(orte_restart_globals.output,
                        "Begin Timestamp: %s\n",
                        snapshot->start_time);
        }

        if (NULL != vpid_snapshot && NULL != vpid_snapshot->opal_crs ) {
            opal_output(orte_restart_globals.output,
                        "OPAL CRS Component: %s\n",
                        vpid_snapshot->opal_crs);
        }

        if (NULL != snapshot->reference_name) {
            opal_output(orte_restart_globals.output,
                        "Snapshot Reference: %s\n",
                        snapshot->reference_name);
        }

        if (NULL != snapshot->local_location) {
            opal_output(orte_restart_globals.output,
                        "Snapshot Location: %s\n",
                        snapshot->local_location);
        }

        if (NULL != snapshot->end_time ) {
            opal_output(orte_restart_globals.output,
                        "End Timestamp: %s\n",
                        snapshot->end_time);
        }

        /* Count the local snapshots, one per process */
        processes = 0;
        for(item  = opal_list_get_first(&snapshot->local_snapshots);
            item != opal_list_get_end(&snapshot->local_snapshots);
            item  = opal_list_get_next(item) ) {
            processes++;
        }
        opal_output(orte_restart_globals.output,
                    "Processes: %d\n",
                    processes);

        for(item  = opal_list_get_first(&snapshot->local_snapshots);
            item != opal_list_get_end(&snapshot->local_snapshots);
            item  = opal_list_get_next(item) ) {
            vpid_snapshot = (orte_snapc_base_local_snapshot_t*)item;

            opal_output_verbose(10, orte_restart_globals.output,
                                "Process: %u.%u",
                                vpid_snapshot->process_name.jobid,
                                vpid_snapshot->process_name.vpid);
        }
    }

 cleanup:
    /* Only the array allocated by the sequence lookup is ours to release */
    free(owned_seqs);

    return exit_status;
}